├── Dockerfile ├── Procfile ├── README.md ├── asr ├── __init__.py └── model_map.py ├── cloudbuild.yaml ├── requirements.txt └── runtime.txt /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.13-alpine 2 | RUN apk add --no-cache speex speexdsp speex-dev speexdsp-dev git build-base 3 | RUN git clone https://github.com/pebble-dev/pyspeex.git && pip install cython setuptools && cd pyspeex && make && python setup.py install && cd .. && rm -rf pyspeex 4 | RUN apk del --no-cache speex-dev speexdsp-dev git 5 | COPY requirements.txt /requirements.txt 6 | RUN pip install -r requirements.txt 7 | RUN apk del --no-cache build-base 8 | ADD . /code 9 | WORKDIR /code 10 | CMD exec gunicorn -k gevent -b 0.0.0.0:$PORT asr:app 11 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: gunicorn -k gevent -b 0.0.0.0:$PORT asr:app 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rebble-asr 2 | asr.rebble.io: speech recognition for rebble 3 | -------------------------------------------------------------------------------- /asr/__init__.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | import gevent.monkey 4 | gevent.monkey.patch_all() 5 | from email.mime.multipart import MIMEMultipart 6 | from email.message import Message 7 | from .model_map import get_model_for_lang 8 | import json 9 | import os 10 | from speex import SpeexDecoder 11 | from google.cloud.speech_v2 import SpeechClient 12 | from google.cloud.speech_v2.types import cloud_speech 13 | from google.cloud import storage 14 | import time 15 | from google.api_core.exceptions import ServiceUnavailable 16 | import base64 17 | 18 | import 
import grpc.experimental.gevent as grpc_gevent
grpc_gevent.init_gevent()

import requests
from flask import Flask, request, Response, abort
import logging
import datetime

import wave

# Imported explicitly: recognise() calls model_map.get_real_lang(). Without
# this line the name is only available as a side effect of the package's
# `from .model_map import ...` statement binding the submodule on the package.
from . import model_map

logging.basicConfig(level=logging.INFO)

app = Flask(__name__)

AUTH_URL = os.environ.get("AUTH_URL", "https://auth.rebble.io")

# Seconds to wait for the auth service before giving up on the request.
AUTH_TIMEOUT = 10
# Total number of times a recognize call is tried when the service reports
# itself unavailable, and the pause (seconds) between tries.
ASR_MAX_ATTEMPTS = 3
ASR_RETRY_DELAY = 2

speech_client = SpeechClient(
    client_options={"api_endpoint": "us-central1-speech.googleapis.com"}
)

storage_client = storage.Client(project=os.environ.get("GCP_PROJECT", 'pebble-rebirth'))
bucket = storage_client.bucket(os.environ.get("BUCKET_NAME", "rebble-audio-debug"))


# We know gunicorn does this, but it doesn't *say* it does this, so we must signal it manually.
@app.before_request
def handle_chunking():
    """Flag the WSGI input stream as terminated before every request."""
    request.environ['wsgi.input_terminated'] = 1


def _split_parts(stream, boundary):
    """Yield the body of every multipart part found in *stream*.

    *stream* is read in 4096-byte blocks. Each span of bytes that precedes an
    occurrence of *boundary* is treated as one part: everything up to the
    first blank line (``\\r\\n\\r\\n``) is discarded as headers, and the
    remainder is yielded with its last two bytes (the ``\\r\\n`` that precedes
    the next boundary) removed. Spans without a blank line are skipped.
    """
    pending = b''
    while True:
        data = stream.read(4096)
        pending += data
        # Drain every complete part that is already buffered; a single read
        # can contain several boundaries.
        while True:
            end = pending.find(boundary)
            if end == -1:
                break
            frame = pending[:end]
            pending = pending[end + len(boundary):]
            if not frame:
                # Nothing precedes the very first boundary.
                continue
            try:
                _headers, body = frame.split(b'\r\n\r\n', 1)
            except ValueError:
                # No header/body separator: not a real part.
                continue
            yield body[:-2]
        # Stop on end-of-stream only. The read result is kept in its own
        # variable so that a part with an empty body cannot be mistaken for
        # end-of-stream.
        if not data:
            break


def parse_chunks(stream):
    """Yield the body of each part of the current multipart request.

    The boundary is taken from the second ``;``-separated field of the
    request's Content-Type header (the text after its first ``=``).

    Raises:
        IndexError: if the Content-Type header has no such field.
    """
    boundary = b'--' + request.headers['content-type'].split(';')[1].split('=')[1].encode('utf-8').strip()  # super lazy/brittle parsing.
    yield from _split_parts(stream, boundary)


@app.route('/heartbeat')
def heartbeat():
    """Liveness probe; always answers with the service name."""
    return 'asr'


@app.route('/NmspServlet/', methods=["POST"])
def recognise():
    """Transcribe one dictation request and answer in multipart form.

    The first label of the request host is expected to look like
    ``<access_token>-<lang>``. The token is checked against the auth service,
    the multipart body is split into a metadata part followed by Speex
    frames, the frames are decoded to PCM and sent to Cloud Speech, and the
    recognised words are returned as a ``QueryResult`` part (or a
    ``QueryRetry`` part when nothing was recognised).

    Aborts with 400 for a malformed host, Content-Type or body, 401 when the
    auth service rejects the token, 402 when the user is not subscribed and
    503 when the auth service cannot be reached.
    """
    stream = request.stream

    try:
        access_token, lang = request.host.split('.', 1)[0].split('-', 1)
    except ValueError:
        # Host label has no '-' separating token and language.
        abort(400)

    try:
        auth_req = requests.get(
            f"{AUTH_URL}/api/v1/me/token",
            headers={'Authorization': f"Bearer {access_token}"},
            timeout=AUTH_TIMEOUT,
        )
    except requests.RequestException:
        logging.exception("Auth request failed")
        abort(503)
    if not auth_req.ok:
        abort(401)

    result = auth_req.json()
    if not result.get('is_subscribed'):
        abort(402)

    user_id = result.get('uid', None)
    # Debug recordings are stored per user, so they need a user id.
    audio_debug_enabled = result.get('audio_debug_mode', False) and user_id is not None

    lang = model_map.get_real_lang(lang)

    req_start = datetime.datetime.now()
    logging.info("Received transcription request in language: %s", lang)
    try:
        frames = list(parse_chunks(stream))
    except IndexError:
        # Content-Type header without a usable boundary parameter.
        abort(400)
    logging.info("Audio received in %s", datetime.datetime.now() - req_start)
    if not frames:
        abort(400)
    # The first part is logged as metadata; the remaining parts are audio.
    metadata, *audio_frames = frames
    logging.info("Metadata: %s", metadata.decode('utf-8', errors='replace'))

    decode_start = datetime.datetime.now()
    decoder = SpeexDecoder(1)
    pcm = bytearray()
    for frame in audio_frames:
        pcm.extend(decoder.decode(frame))
    logging.info("Decoded speex in %s", datetime.datetime.now() - decode_start)

    blob = None
    if audio_debug_enabled:
        upload_start = datetime.datetime.now()
        buffer = io.BytesIO()
        with wave.open(buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(16000)
            wav_file.writeframes(pcm)
        buffer.seek(0)
        blob = bucket.blob(f"audio/users/{user_id}/recording-{datetime.datetime.now().isoformat()}.wav")
        blob.upload_from_file(buffer, rewind=True, content_type="audio/wav")
        logging.info("Uploaded audio in %s", datetime.datetime.now() - upload_start)

    asr_request_start = datetime.datetime.now()
    config = cloud_speech.RecognitionConfig(
        explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
            encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            audio_channel_count=1,
        ),
        language_codes=[lang],
        features=cloud_speech.RecognitionFeatures(
            profanity_filter=True,  # matches current behaviour, but do we really want it?
            enable_word_confidence=True,  # Pebble uses (ignores) this
            enable_automatic_punctuation=True,
            enable_spoken_punctuation=True,
            max_alternatives=1,
        ),
        # Looked up through the package's own table instead of repeating the
        # model name here.
        model=get_model_for_lang(lang),
    )

    asr_request = cloud_speech.RecognizeRequest(
        recognizer="projects/pebble-rebirth/locations/us-central1/recognizers/_",
        config=config,
        content=bytes(pcm),
    )
    for attempt in range(1, ASR_MAX_ATTEMPTS + 1):
        try:
            asr_response = speech_client.recognize(asr_request, timeout=10)
        except ServiceUnavailable as e:
            logging.error("ASR request failed: %s", e)
            if attempt == ASR_MAX_ATTEMPTS:
                raise
            time.sleep(ASR_RETRY_DELAY)
        else:
            break
    logging.info("ASR request completed in %s", datetime.datetime.now() - asr_request_start)

    # Collect the top alternative of every result once; results that carry no
    # alternative are skipped rather than indexed.
    transcripts = []
    words = []
    for asr_result in asr_response.results:
        if not asr_result.alternatives:
            continue
        best = asr_result.alternatives[0]
        transcripts.append(best.transcript)
        confidence = str(best.confidence)
        # split() without an argument drops the empty strings that leading or
        # repeated spaces would otherwise turn into blank "words".
        words.extend({
            'word': word,
            'confidence': confidence,
        } for word in best.transcript.split())

    if audio_debug_enabled:
        complete_response = ''.join(transcripts)
        blob.metadata = {
            'rebble-language': lang,
            'rebble-transcript': base64.b64encode(complete_response.encode('utf-8')).decode('utf-8')
        }
        blob.patch()

    # Now for some reason we also need to give back a mime/multipart message...
    parts = MIMEMultipart()
    response_part = Message()
    response_part.add_header('Content-Type', 'application/JSON; charset=utf-8')

    if words:
        response_part.add_header('Content-Disposition', 'form-data; name="QueryResult"')
        first = words[0]['word'] + '\\*no-space-before'
        words[0]['word'] = first[0].upper() + first[1:]
        response_part.set_payload(json.dumps({
            'words': [words],
        }))
    else:
        response_part.add_header('Content-Disposition', 'form-data; name="QueryRetry"')
        # Other errors probably exist, but I don't know what they are.
        # This is a Nuance error verbatim.
        response_part.set_payload(json.dumps({
            "Cause": 1,
            "Name": "AUDIO_INFO",
            "Prompt": "Sorry, speech not recognized. Please try again."
        }))
    parts.attach(response_part)

    parts.set_boundary('--Nuance_NMSP_vutc5w1XobDdefsYG3wq')

    # The first three lines of the serialised message are dropped (presumably
    # the outer MIME headers and their blank separator -- confirm against the
    # email package output) and line endings are converted to CRLF.
    http_response = Response('\r\n' + parts.as_string().split("\n", 3)[3].replace('\n', '\r\n'))
    http_response.headers['Content-Type'] = f'multipart/form-data; boundary={parts.get_boundary()}'
    logging.info("Request complete in %s", datetime.datetime.now() - req_start)
    return http_response
# Speech model to request for each supported (lower-case) language code.
MODEL_MAP = {
    'af-za': 'chirp_2',
    'cs-cz': 'chirp_2',
    'da-dk': 'chirp_2',
    'de-de': 'chirp_2',
    'en-au': 'chirp_2',
    'en-us': 'chirp_2',
    'en-gb': 'chirp_2',
    'en-in': 'chirp_2',
    'fi-fi': 'chirp_2',
    'fil-ph': 'chirp_2',
    'fr-ca': 'chirp_2',
    'fr-fr': 'chirp_2',
    'gl-es': 'chirp_2',
    'id-id': 'chirp_2',
    'is-is': 'chirp_2',
    'it-it': 'chirp_2',
    'ko-kr': 'chirp_2',
    'lv-lv': 'chirp_2',
    'lt-lt': 'chirp_2',
    'hr-hr': 'chirp_2',
    'hu-hu': 'chirp_2',
    'ms-my': 'chirp_2',
    'nl-nl': 'chirp_2',
    'no-no': 'chirp_2',
    'pt-pt': 'chirp_2',
    'pl-pl': 'chirp_2',
    'ro-ro': 'chirp_2',
    'ru-ru': 'chirp_2',
    'uk-ua': 'chirp_2',
    'es-es': 'chirp_2',
    'es-us': 'chirp_2',
    'sk-sk': 'chirp_2',
    'sl-si': 'chirp_2',
    'sv-se': 'chirp_2',
    'sw-ke': 'chirp_2',
    'tr-tr': 'chirp_2',
    'zu-za': 'chirp_2',
}

# Codes that are rewritten to a different code before use.
LANGUAGE_OVERRIDES = {
    'en-ca': 'en-us',  # Cloud Speech V2 dropped en-ca support. chirp_2 is universal, so this is probably close enough.
    'es-mx': 'es-us',  # also dropped es-mx, apparently
    'sw-tz': 'sw-ke',  # also dropped sw-tz. I don't know enough to know whether this makes sense, to be honest.
    'nb-no': 'no-no',  # I'm still pretty sure this one was a typo.
    'auto-auto': 'auto',  # this is a special case for the auto-detect language code
}

# Model returned for any code that has no MODEL_MAP entry.
DEFAULT_MODEL = 'chirp_2'


def _normalise(code: str) -> str:
    """Return *code* lower-cased, trimmed and with ``_`` replaced by ``-``."""
    return code.strip().replace('_', '-').lower()


def get_real_lang(code: str) -> str:
    """Return the language code to actually use for *code*.

    The code is normalised (case, surrounding whitespace, ``_`` separators)
    and then replaced by its LANGUAGE_OVERRIDES entry when one exists.
    """
    normalised = _normalise(code)
    return LANGUAGE_OVERRIDES.get(normalised, normalised)


def get_model_for_lang(code: str) -> str:
    """Return the model name for *code*, falling back to DEFAULT_MODEL.

    The lookup goes through get_real_lang() first so that an overridden code
    (for example ``en-ca``) gets the model of the code it is rewritten to
    rather than silently falling through to the default.
    """
    return MODEL_MAP.get(get_real_lang(code), DEFAULT_MODEL)