├── .gitattributes └── code └── code.ino /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /code/code.ino: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * XIAO ESP32S3 Audio Recorder with ElevenLabs Speech-to-Text 4 | * Clean STT-only version (no ESP-NOW) 5 | */ 6 | 7 | #include "driver/i2s_pdm.h" 8 | #include "driver/gpio.h" 9 | #include 10 | #include "FS.h" 11 | #include "SD.h" 12 | #include "SPI.h" 13 | #include 14 | #include 15 | #include 16 | 17 | long now, total_time; 18 | // WiFi credentials 19 | const char* ssid = "SSID"; 20 | const char* password = "PASS"; 21 | 22 | // ElevenLabs API configuration 23 | const char* elevenlabs_api_key = "API KEY"; 24 | const char* elevenlabs_stt_url = "https://api.elevenlabs.io/v1/speech-to-text"; 25 | 26 | // Audio recording settings 27 | #define WAV_FILE_NAME "recording" 28 | #define SAMPLE_RATE 16000U 29 | #define SAMPLE_BITS 16 30 | #define WAV_HEADER_SIZE 44 31 | #define VOLUME_GAIN 2 32 | 33 | // I2S PDM Configuration for XIAO ESP32S3 built-in microphone 34 | #define I2S_NUM I2S_NUM_0 35 | #define PDM_CLK_GPIO (gpio_num_t)42 36 | #define PDM_DIN_GPIO (gpio_num_t)41 37 | 38 | #define BUTTON_PIN D1 39 | bool isPressed = false; 40 | 41 | // I2S handle 42 | i2s_chan_handle_t rx_handle = NULL; 43 | 44 | // Global variables 45 | bool recording_active = false; 46 | String last_transcription = ""; 47 | bool wifi_connected = false; 48 | String current_recording_file = ""; 49 | 50 | // ===== FUNCTION DECLARATIONS ===== 51 | bool connectToWiFi(); 52 | bool init_i2s_pdm(); 53 | void deinit_i2s_pdm(); 54 | void cleanupOldRecordings(); 55 | void record_wav_streaming(); 56 | void process_recording(); 57 | String send_to_elevenlabs_stt(String filename); 58 | void generate_wav_header(uint8_t* wav_header, uint32_t wav_size, uint32_t sample_rate); 59 | 60 | // ===== IMPLEMENTATION ===== 61 | 62 | bool init_i2s_pdm() { 63 | Serial.println("Initializing I2S PDM..."); 64 | 65 | i2s_chan_config_t chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM, I2S_ROLE_MASTER); 66 | chan_cfg.auto_clear = true; 67 | 68 | if (i2s_new_channel(&chan_cfg, NULL, &rx_handle) != ESP_OK) { 69 | Serial.println("Failed to create I2S channel"); 70 | return false; 71 | } 72 | 73 | i2s_pdm_rx_config_t pdm_rx_cfg = { 74 | .clk_cfg = I2S_PDM_RX_CLK_DEFAULT_CONFIG(SAMPLE_RATE), 75 | .slot_cfg = I2S_PDM_RX_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_16BIT, I2S_SLOT_MODE_MONO), 76 | .gpio_cfg = { 77 | .clk = PDM_CLK_GPIO, 78 | .din = PDM_DIN_GPIO, 79 | .invert_flags = { .clk_inv = false }, 80 | }, 81 | }; 82 | 83 | if (i2s_channel_init_pdm_rx_mode(rx_handle, &pdm_rx_cfg) != ESP_OK) return false; 84 | if (i2s_channel_enable(rx_handle) != ESP_OK) return false; 85 | 86 | Serial.println("I2S PDM initialized successfully"); 87 | return true; 88 | } 89 | 90 | void deinit_i2s_pdm() { 91 | if (rx_handle != NULL) { 92 | i2s_channel_disable(rx_handle); 93 | i2s_del_channel(rx_handle); 94 | rx_handle = NULL; 95 | } 96 | } 97 | 98 | bool connectToWiFi() { 99 | Serial.println("Connecting to WiFi..."); 100 | WiFi.disconnect(); 101 | WiFi.mode(WIFI_STA); 102 | WiFi.begin(ssid, password); 103 | 104 | int attempts = 0; 105 | while (WiFi.status() != WL_CONNECTED && attempts < 40) { 106 | delay(500); 107 | Serial.print("."); 108 | attempts++; 109 | } 110 | 111 | if (WiFi.status() == WL_CONNECTED) { 112 | Serial.println("\nWiFi connected!"); 113 | Serial.print("IP: "); 114 | Serial.println(WiFi.localIP()); 115 | wifi_connected = true; 116 | return true; 117 | } else { 118 | Serial.println("\nWiFi connection failed"); 119 | wifi_connected = false; 120 | return false; 121 | } 122 | } 123 | 124 | void setup() { 125 | Serial.begin(115200); 126 | pinMode(BUTTON_PIN, INPUT_PULLUP); 127 | 128 | if (!init_i2s_pdm()) { 129 | Serial.println("I2S init failed!"); 130 | while (1) 131 | ; 132 | } 133 | 134 | if (!SD.begin(21)) { 135 | Serial.println("Failed to mount SD Card!"); 136 | while (1) 137 | ; 138 | } 139 | Serial.println("SD Card initialized"); 140 | 141 | cleanupOldRecordings(); 142 | connectToWiFi(); 143 | } 144 | 145 | void loop() { 146 | bool currentState = digitalRead(BUTTON_PIN) == LOW; 147 | 148 | if (currentState && !isPressed) { 149 | isPressed = true; 150 | Serial.println("Button pressed → start recording"); 151 | record_wav_streaming(); 152 | process_recording(); 153 | } 154 | 155 | if (!currentState && isPressed) { 156 | isPressed = false; 157 | Serial.println("Button released"); 158 | } 159 | 160 | delay(50); 161 | } 162 | // remove: long now, total_time; 163 | 164 | 165 | 166 | void record_wav_streaming() { 167 | if (rx_handle == NULL) return; 168 | 169 | const uint32_t max_record_time = 30; // sec 170 | 171 | String filename = "/" + String(WAV_FILE_NAME) + "_" + String(millis()) + ".wav"; 172 | current_recording_file = filename; 173 | 174 | File file = SD.open(filename.c_str(), FILE_WRITE); 175 | if (!file) { 176 | Serial.println("Failed to open file"); 177 | current_recording_file = ""; 178 | return; 179 | } 180 | 181 | uint8_t wav_header[WAV_HEADER_SIZE]; 182 | generate_wav_header(wav_header, 0, SAMPLE_RATE); 183 | file.write(wav_header, WAV_HEADER_SIZE); 184 | 185 | uint8_t* buffer = (uint8_t*)malloc(512); 186 | if (!buffer) return; 187 | 188 | recording_active = true; 189 | size_t total_bytes = 0; 190 | unsigned long startTime = millis(); 191 | 192 | Serial.println("Recording..."); 193 | 194 | while (digitalRead(BUTTON_PIN) == LOW && (millis() - startTime < max_record_time * 1000)) { 195 | size_t bytes_read = 0; 196 | if (i2s_channel_read(rx_handle, buffer, 512, &bytes_read, pdMS_TO_TICKS(100)) != ESP_OK) continue; 197 | 198 | for (size_t i = 0; i < bytes_read; i += 2) { 199 | int16_t* sample = (int16_t*)&buffer[i]; 200 | int32_t amp = (*sample) << VOLUME_GAIN; 201 | if (amp > 32767) amp = 32767; 202 | if (amp < -32768) amp = -32768; 203 | *sample = (int16_t)amp; 204 | } 205 | 206 | file.write(buffer, bytes_read); 207 | total_bytes += bytes_read; 208 | } 209 | 210 | recording_active = false; 211 | free(buffer); 212 | 213 | file.seek(0); 214 | generate_wav_header(wav_header, total_bytes, SAMPLE_RATE); 215 | file.write(wav_header, WAV_HEADER_SIZE); 216 | file.close(); 217 | 218 | Serial.printf("Recording finished: %s (%d bytes)\n", filename.c_str(), total_bytes); 219 | } 220 | 221 | void process_recording() { 222 | if (current_recording_file.isEmpty()) return; 223 | 224 | Serial.printf("Sending %s to ElevenLabs...\n", current_recording_file.c_str()); 225 | String transcription = send_to_elevenlabs_stt(current_recording_file); 226 | 227 | if (transcription.length()) { 228 | Serial.println("Transcription:"); 229 | Serial.println(transcription); 230 | last_transcription = transcription; 231 | } else { 232 | Serial.println("STT failed"); 233 | } 234 | 235 | current_recording_file = ""; 236 | } 237 | String send_to_elevenlabs_stt(String filename) { 238 | uint32_t t_start = millis(); 239 | 240 | if (!wifi_connected || WiFi.status() != WL_CONNECTED) { 241 | Serial.println("WiFi not connected, cannot send to STT"); 242 | return ""; 243 | } 244 | 245 | File file = SD.open(filename.c_str()); 246 | if (!file) { 247 | Serial.println("Failed to open audio file"); 248 | return ""; 249 | } 250 | 251 | size_t file_size = file.size(); 252 | if (file_size > 500000) { 253 | Serial.println("File too large for STT request (>500KB)"); 254 | file.close(); 255 | return ""; 256 | } 257 | 258 | uint8_t* audio_data = (uint8_t*)malloc(file_size); 259 | if (!audio_data) { 260 | Serial.println("Failed to allocate memory for audio data!"); 261 | file.close(); 262 | return ""; 263 | } 264 | size_t bytesRead = file.read(audio_data, file_size); 265 | file.close(); 266 | 267 | uint32_t t_file_loaded = millis(); 268 | 269 | HTTPClient http; 270 | if (!http.begin(elevenlabs_stt_url)) { 271 | Serial.println("Failed to initialize HTTP connection"); 272 | free(audio_data); 273 | return ""; 274 | } 275 | 276 | http.setTimeout(30000); 277 | http.setConnectTimeout(10000); 278 | http.addHeader("xi-api-key", elevenlabs_api_key); 279 | 280 | String boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW"; 281 | http.addHeader("Content-Type", "multipart/form-data; boundary=" + boundary); 282 | 283 | String body_start = "--" + boundary + "\r\n"; 284 | body_start += "Content-Disposition: form-data; name=\"model_id\"\r\n\r\n"; 285 | body_start += "scribe_v1\r\n"; 286 | body_start += "--" + boundary + "\r\n"; 287 | body_start += "Content-Disposition: form-data; name=\"file\"; filename=\"audio.wav\"\r\n"; 288 | body_start += "Content-Type: audio/wav\r\n\r\n"; 289 | 290 | String body_end = "\r\n--" + boundary + "--\r\n"; 291 | size_t total_size = body_start.length() + file_size + body_end.length(); 292 | uint8_t* complete_body = (uint8_t*)malloc(total_size); 293 | 294 | memcpy(complete_body, body_start.c_str(), body_start.length()); 295 | memcpy(complete_body + body_start.length(), audio_data, file_size); 296 | memcpy(complete_body + body_start.length() + file_size, body_end.c_str(), body_end.length()); 297 | 298 | free(audio_data); 299 | 300 | uint32_t t_request_prepared = millis(); 301 | 302 | Serial.println("Sending request to ElevenLabs STT..."); 303 | 304 | // Start timer just before POST 305 | uint32_t t_request_sent = millis(); 306 | int httpResponseCode = http.POST(complete_body, total_size); 307 | // Stop timer after response received 308 | uint32_t t_response_received = millis(); 309 | 310 | free(complete_body); 311 | 312 | String transcription = ""; 313 | String response = http.getString(); 314 | 315 | uint32_t t_response_parsed = millis(); 316 | 317 | if (httpResponseCode == 200) { 318 | Serial.printf("HTTP 200 OK\nResponse: %s\n", response.c_str()); 319 | DynamicJsonDocument doc(2048); 320 | if (deserializeJson(doc, response) == DeserializationError::Ok) { 321 | if (doc.containsKey("text")) { 322 | transcription = doc["text"].as(); 323 | } 324 | } 325 | } else { 326 | Serial.printf("HTTP Error: %d\n", httpResponseCode); 327 | Serial.println("Response: " + response); 328 | } 329 | 330 | http.end(); 331 | 332 | // Print detailed timing information (similar to Deepgram implementation) 333 | Serial.println("---------------------------------------------------"); 334 | Serial.printf("-> Audio File [%s] size: %d bytes\n", filename.c_str(), file_size); 335 | Serial.printf("-> Latency File Loading [t_file_loaded]: %.3f sec\n", (float)(t_file_loaded - t_start) / 1000); 336 | Serial.printf("-> Latency Request Preparation: %.3f sec\n", (float)(t_request_prepared - t_file_loaded) / 1000); 337 | Serial.printf("-> Latency ElevenLabs STT Response: %.3f sec\n", (float)(t_response_received - t_request_sent) / 1000); 338 | Serial.printf("-> Latency Response Parsing: %.3f sec\n", (float)(t_response_parsed - t_response_received) / 1000); 339 | Serial.printf("=> TOTAL Duration [sec]: .................... %.3f sec\n", (float)(t_response_parsed - t_start) / 1000); 340 | Serial.printf("=> Server response length [bytes]: %d\n", response.length()); 341 | Serial.printf("=> Transcription: [%s]\n", transcription.c_str()); 342 | Serial.println("---------------------------------------------------"); 343 | 344 | return transcription; 345 | } 346 | 347 | void generate_wav_header(uint8_t* wav_header, uint32_t wav_size, uint32_t sample_rate) { 348 | uint32_t file_size = wav_size + WAV_HEADER_SIZE - 8; 349 | uint32_t byte_rate = sample_rate * SAMPLE_BITS / 8; 350 | 351 | const uint8_t header[] = { 352 | 'R', 353 | 'I', 354 | 'F', 355 | 'F', 356 | file_size, 357 | file_size >> 8, 358 | file_size >> 16, 359 | file_size >> 24, 360 | 'W', 361 | 'A', 362 | 'V', 363 | 'E', 364 | 'f', 365 | 'm', 366 | 't', 367 | ' ', 368 | 0x10, 369 | 0x00, 370 | 0x00, 371 | 0x00, 372 | 0x01, 373 | 0x00, 374 | 0x01, 375 | 0x00, 376 | sample_rate, 377 | sample_rate >> 8, 378 | sample_rate >> 16, 379 | sample_rate >> 24, 380 | byte_rate, 381 | byte_rate >> 8, 382 | byte_rate >> 16, 383 | byte_rate >> 24, 384 | 0x02, 385 | 0x00, 386 | 0x10, 387 | 0x00, 388 | 'd', 389 | 'a', 390 | 't', 391 | 'a', 392 | wav_size, 393 | wav_size >> 8, 394 | wav_size >> 16, 395 | wav_size >> 24, 396 | }; 397 | memcpy(wav_header, header, sizeof(header)); 398 | } 399 | 400 | void cleanupOldRecordings() { 401 | File root = SD.open("/"); 402 | File file = root.openNextFile(); 403 | while (file) { 404 | String filename = file.name(); 405 | if (filename.startsWith(WAV_FILE_NAME) && filename.endsWith(".wav")) { 406 | file.close(); 407 | SD.remove("/" + filename); 408 | } else { 409 | file.close(); 410 | } 411 | file = root.openNextFile(); 412 | } 413 | root.close(); 414 | } 415 | --------------------------------------------------------------------------------