├── custom_components └── whisper_api_stt │ ├── __init__.py │ ├── manifest.json │ └── stt.py └── README.md /custom_components/whisper_api_stt/__init__.py: -------------------------------------------------------------------------------- 1 | """Custom integration for OpenAI Whisper API STT.""" -------------------------------------------------------------------------------- /custom_components/whisper_api_stt/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "domain": "whisper_api_stt", 3 | "name": "OpenAI Whisper API STT", 4 | "codeowners": ["@davidohne"], 5 | "dependencies": [], 6 | "documentation": "https://github.com/davidohne/ha_whisper-api_stt/", 7 | "iot_class": "cloud_polling", 8 | "issue_tracker": "https://github.com/davidohne/ha_whisper-api_stt/issues", 9 | "requirements": ["aiohttp>=3.7.4"], 10 | "version": "0.1.0" 11 | } 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Home Assistant: Whisper API Integration von Speech-to-Text 2 | 3 | Integration works for Assist pipelines. 4 | 5 | ### Requirements: 6 | - A working Whisper API Key (Try your key with curl or something else) 7 | 8 | ### Configuration: 9 | 10 | Remarks: 11 | - Add your own API key 12 | - language MUST be set AND has to be ISO-639-1 format 13 | - There will be an error in the home assistant logs, that configuring stt is not allowed in configuration.yaml - you can ignore this 14 | 15 | configuration.yaml: 16 | 17 | 18 | ``` 19 | stt: 20 | - platform: whisper_api_stt 21 | api_key: "" 22 | model: "whisper-1" 23 | language: "en" 24 | ``` 25 | 26 | ### Used sources + thanks to: 27 | - sfortis/openai_tts: https://github.com/sfortis/openai_tts 28 | 29 | 30 | -------------------------------------------------------------------------------- /custom_components/whisper_api_stt/stt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Support for Whisper API STT. 3 | """ 4 | from typing import AsyncIterable 5 | import aiohttp 6 | import os 7 | import tempfile 8 | import voluptuous as vol 9 | from homeassistant.components.tts import CONF_LANG 10 | from homeassistant.components.stt import ( 11 | AudioBitRates, 12 | AudioChannels, 13 | AudioCodecs, 14 | AudioFormats, 15 | AudioSampleRates, 16 | Provider, 17 | SpeechMetadata, 18 | SpeechResult, 19 | SpeechResultState, 20 | ) 21 | from homeassistant.core import HomeAssistant 22 | import homeassistant.helpers.config_validation as cv 23 | import wave 24 | import io 25 | 26 | 27 | CONF_API_KEY = 'api_key' 28 | DEFAULT_LANG = 'en-US' 29 | OPENAI_STT_URL = "https://api.openai.com/v1/audio/transcriptions" 30 | CONF_MODEL = 'model' 31 | CONF_URL = 'url' 32 | CONF_PROMPT = 'prompt' 33 | CONF_TEMPERATURE = 'temperature' 34 | 35 | PLATFORM_SCHEMA = cv.PLATFORM_SCHEMA.extend({ 36 | vol.Required(CONF_API_KEY): cv.string, 37 | vol.Optional(CONF_LANG, default=DEFAULT_LANG): cv.string, 38 | vol.Optional(CONF_MODEL, default='whisper-1'): cv.string, 39 | vol.Optional(CONF_URL, default=None): cv.string, 40 | vol.Optional(CONF_PROMPT, default=None): cv.string, 41 | vol.Optional(CONF_TEMPERATURE, default=0): cv.positive_int, 42 | }) 43 | 44 | 45 | async def async_get_engine(hass, config, discovery_info=None): 46 | """Set up Whisper API STT speech component.""" 47 | api_key = config[CONF_API_KEY] 48 | language = config.get(CONF_LANG, DEFAULT_LANG) 49 | model = config.get(CONF_MODEL) 50 | url = config.get('url') 51 | prompt = config.get('prompt') 52 | temperature = config.get('temperature') 53 | return OpenAISTTProvider(hass, api_key, language, model, url, prompt, temperature) 54 | 55 | 56 | class OpenAISTTProvider(Provider): 57 | """The Whisper API STT provider.""" 58 | 59 | def __init__(self, hass, api_key, lang, model, url, prompt, temperature): 60 | """Initialize Whisper API STT provider.""" 61 | self.hass = hass 62 | self._api_key = api_key 63 | self._language = lang 64 | self._model = model 65 | self._url = url 66 | self._prompt = prompt 67 | self._temperature = temperature 68 | 69 | @property 70 | def default_language(self) -> str: 71 | """Return the default language.""" 72 | return self._language.split(',')[0] 73 | 74 | @property 75 | def supported_languages(self) -> list[str]: 76 | """Return the list of supported languages.""" 77 | return self._language.split(',') 78 | 79 | @property 80 | def supported_formats(self) -> list[AudioFormats]: 81 | """Return a list of supported formats.""" 82 | return [AudioFormats.WAV] 83 | 84 | @property 85 | def supported_codecs(self) -> list[AudioCodecs]: 86 | """Return a list of supported codecs.""" 87 | return [AudioCodecs.PCM] 88 | 89 | @property 90 | def supported_bit_rates(self) -> list[AudioBitRates]: 91 | """Return a list of supported bitrates.""" 92 | return [AudioBitRates.BITRATE_16] 93 | 94 | @property 95 | def supported_sample_rates(self) -> list[AudioSampleRates]: 96 | """Return a list of supported samplerates.""" 97 | return [AudioSampleRates.SAMPLERATE_16000] 98 | 99 | @property 100 | def supported_channels(self) -> list[AudioChannels]: 101 | """Return a list of supported channels.""" 102 | return [AudioChannels.CHANNEL_MONO] 103 | 104 | async def async_process_audio_stream(self, metadata: SpeechMetadata, stream: AsyncIterable[bytes]) -> SpeechResult: 105 | data = b'' 106 | async for chunk in stream: 107 | data += chunk 108 | 109 | if not data: 110 | return SpeechResult("", SpeechResultState.ERROR) 111 | 112 | try: 113 | with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file: 114 | with wave.open(temp_file, 'wb') as wav_file: 115 | wav_file.setnchannels(metadata.channel) 116 | wav_file.setsampwidth(2) # 2 bytes per sample 117 | wav_file.setframerate(metadata.sample_rate) 118 | wav_file.writeframes(data) 119 | temp_file_path = temp_file.name 120 | 121 | 122 | url = self._url or OPENAI_STT_URL 123 | 124 | headers = { 125 | 'Authorization': f'Bearer {self._api_key}', 126 | } 127 | 128 | file_to_send = open(temp_file_path, 'rb') 129 | form = aiohttp.FormData() 130 | form.add_field('file', file_to_send, filename='audio.wav', content_type='audio/wav') 131 | form.add_field('language', self._language) 132 | form.add_field('model', self._model) 133 | 134 | async with aiohttp.ClientSession() as session: 135 | async with session.post(url, data=form, headers=headers) as response: 136 | if response.status == 200: 137 | json_response = await response.json() 138 | return SpeechResult(json_response["text"], SpeechResultState.SUCCESS) 139 | else: 140 | text = await response.text() 141 | return SpeechResult("", SpeechResultState.ERROR) 142 | except Exception as e: 143 | return SpeechResult("", SpeechResultState.ERROR) 144 | finally: 145 | if 'file_to_send' in locals(): 146 | file_to_send.close() 147 | if temp_file_path: 148 | os.remove(temp_file_path) 149 | --------------------------------------------------------------------------------