├── custom_components
    └── whisper_api_stt
    │   ├── __init__.py
    │   ├── manifest.json
    │   └── stt.py
└── README.md


/custom_components/whisper_api_stt/__init__.py:
--------------------------------------------------------------------------------
1 | """Custom integration for OpenAI Whisper API STT."""


--------------------------------------------------------------------------------
/custom_components/whisper_api_stt/manifest.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "domain": "whisper_api_stt",
 3 |   "name": "OpenAI Whisper API STT",
 4 |   "codeowners": ["@davidohne"],
 5 |   "dependencies": [],
 6 |   "documentation": "https://github.com/davidohne/ha_whisper-api_stt/",
 7 |   "iot_class": "cloud_polling",
 8 |   "issue_tracker": "https://github.com/davidohne/ha_whisper-api_stt/issues",
 9 |   "requirements": ["aiohttp>=3.7.4"],
10 |   "version": "0.1.0"
11 | }
12 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Home Assistant: Whisper API Integration for Speech-to-Text
 2 | 
 3 | Integration works for Assist pipelines. 
 4 | 
 5 | ### Requirements:
 6 | - A working Whisper API Key (Try your key with curl or something else)
 7 | 
 8 | ### Configuration:
 9 | 
10 | Remarks:
11 | - Add your own API key
12 | - `language` MUST be set and must be in ISO-639-1 format (e.g. `en`, `de`)
13 | - Home Assistant will log an error saying that configuring `stt` in configuration.yaml is not allowed - you can ignore it
14 | 
15 | configuration.yaml:
16 | 
17 | 
18 | ```
19 | stt:
20 |   - platform: whisper_api_stt
21 |     api_key: ""
22 |     model: "whisper-1"
23 |     language: "en"
24 | ```
25 | 
26 | ### Used sources + thanks to:
27 | - sfortis/openai_tts: https://github.com/sfortis/openai_tts
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/custom_components/whisper_api_stt/stt.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Support for Whisper API STT.
  3 | """
import io
import logging
import os
import tempfile
from typing import AsyncIterable
import wave

import aiohttp
import voluptuous as vol

from homeassistant.components.stt import (
    AudioBitRates,
    AudioChannels,
    AudioCodecs,
    AudioFormats,
    AudioSampleRates,
    Provider,
    SpeechMetadata,
    SpeechResult,
    SpeechResultState,
)
from homeassistant.components.tts import CONF_LANG
from homeassistant.core import HomeAssistant
import homeassistant.helpers.config_validation as cv
 25 | 
 26 | 
 27 | CONF_API_KEY = 'api_key'
 28 | DEFAULT_LANG = 'en-US'
 29 | OPENAI_STT_URL = "https://api.openai.com/v1/audio/transcriptions"
 30 | CONF_MODEL = 'model'
 31 | CONF_URL = 'url'
 32 | CONF_PROMPT = 'prompt'
 33 | CONF_TEMPERATURE = 'temperature'
 34 | 
 35 | PLATFORM_SCHEMA = cv.PLATFORM_SCHEMA.extend({
 36 |     vol.Required(CONF_API_KEY): cv.string,
 37 |     vol.Optional(CONF_LANG, default=DEFAULT_LANG): cv.string,
 38 |     vol.Optional(CONF_MODEL, default='whisper-1'): cv.string,
 39 |     vol.Optional(CONF_URL, default=None): cv.string,
 40 |     vol.Optional(CONF_PROMPT, default=None): cv.string,
 41 |     vol.Optional(CONF_TEMPERATURE, default=0): cv.positive_int,
 42 | })
 43 | 
 44 | 
 45 | async def async_get_engine(hass, config, discovery_info=None):
 46 |     """Set up Whisper API STT speech component."""
 47 |     api_key = config[CONF_API_KEY]
 48 |     language = config.get(CONF_LANG, DEFAULT_LANG)
 49 |     model = config.get(CONF_MODEL)
 50 |     url = config.get('url')
 51 |     prompt = config.get('prompt')
 52 |     temperature = config.get('temperature')
 53 |     return OpenAISTTProvider(hass, api_key, language, model, url, prompt, temperature)
 54 | 
 55 | 
 56 | class OpenAISTTProvider(Provider):
 57 |     """The Whisper API STT provider."""
 58 | 
 59 |     def __init__(self, hass, api_key, lang, model, url, prompt, temperature):
 60 |         """Initialize Whisper API STT provider."""
 61 |         self.hass = hass
 62 |         self._api_key = api_key
 63 |         self._language = lang
 64 |         self._model = model
 65 |         self._url = url
 66 |         self._prompt = prompt
 67 |         self._temperature = temperature
 68 | 
 69 |     @property
 70 |     def default_language(self) -> str:
 71 |         """Return the default language."""
 72 |         return self._language.split(',')[0]
 73 | 
 74 |     @property
 75 |     def supported_languages(self) -> list[str]:
 76 |         """Return the list of supported languages."""
 77 |         return self._language.split(',')
 78 | 
 79 |     @property
 80 |     def supported_formats(self) -> list[AudioFormats]:
 81 |         """Return a list of supported formats."""
 82 |         return [AudioFormats.WAV]
 83 | 
 84 |     @property
 85 |     def supported_codecs(self) -> list[AudioCodecs]:
 86 |         """Return a list of supported codecs."""
 87 |         return [AudioCodecs.PCM]
 88 | 
 89 |     @property
 90 |     def supported_bit_rates(self) -> list[AudioBitRates]:
 91 |         """Return a list of supported bitrates."""
 92 |         return [AudioBitRates.BITRATE_16]
 93 | 
 94 |     @property
 95 |     def supported_sample_rates(self) -> list[AudioSampleRates]:
 96 |         """Return a list of supported samplerates."""
 97 |         return [AudioSampleRates.SAMPLERATE_16000]
 98 | 
 99 |     @property
100 |     def supported_channels(self) -> list[AudioChannels]:
101 |         """Return a list of supported channels."""
102 |         return [AudioChannels.CHANNEL_MONO]
103 | 
104 |     async def async_process_audio_stream(self, metadata: SpeechMetadata, stream: AsyncIterable[bytes]) -> SpeechResult:
105 |         data = b''
106 |         async for chunk in stream:
107 |             data += chunk
108 | 
109 |         if not data:
110 |             return SpeechResult("", SpeechResultState.ERROR)
111 | 
112 |         try:
113 |             with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
114 |                 with wave.open(temp_file, 'wb') as wav_file:
115 |                     wav_file.setnchannels(metadata.channel)
116 |                     wav_file.setsampwidth(2)  # 2 bytes per sample
117 |                     wav_file.setframerate(metadata.sample_rate)
118 |                     wav_file.writeframes(data)
119 |                 temp_file_path = temp_file.name
120 | 
121 | 
122 |             url = self._url or OPENAI_STT_URL
123 | 
124 |             headers = {
125 |                 'Authorization': f'Bearer {self._api_key}',
126 |             }
127 | 
128 |             file_to_send = open(temp_file_path, 'rb')
129 |             form = aiohttp.FormData()
130 |             form.add_field('file', file_to_send, filename='audio.wav', content_type='audio/wav')
131 |             form.add_field('language', self._language)
132 |             form.add_field('model', self._model)
133 | 
134 |             async with aiohttp.ClientSession() as session:
135 |                 async with session.post(url, data=form, headers=headers) as response:
136 |                     if response.status == 200:
137 |                         json_response = await response.json()
138 |                         return SpeechResult(json_response["text"], SpeechResultState.SUCCESS)
139 |                     else:
140 |                         text = await response.text()
141 |                         return SpeechResult("", SpeechResultState.ERROR)
142 |         except Exception as e:
143 |             return SpeechResult("", SpeechResultState.ERROR)
144 |         finally:
145 |             if 'file_to_send' in locals():
146 |                 file_to_send.close()
147 |             if temp_file_path:
148 |                 os.remove(temp_file_path)
149 | 


--------------------------------------------------------------------------------