├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── docker-compose.yml
├── main.py
├── requirements.txt
└── tests
    ├── openai
    │   ├── test_audio_transcription.sh
    │   ├── test_chat_completions.py
    │   └── test_chat_completions_text_modality_only.py
    └── powershell
        └── Invoke-Whisper-Audio-Transcription.ps1

/.gitignore:
--------------------------------------------------------------------------------
.python-version
__pycache__/
tests/openai/*.log
tests/openai/*.wav

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.11.10-slim

# Run updates and install ffmpeg
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y ffmpeg && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy and install the requirements
COPY ./requirements.txt /requirements.txt

# Pip install the dependencies
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r /requirements.txt

# Copy the current directory contents into the container at /app
COPY main.py /app/main.py

# Set the working directory to /app
WORKDIR /app

# Expose port 8000
EXPOSE 8000

# Run the app
CMD uvicorn main:app --host 0.0.0.0

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Yasuhiro Morioka

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# tiny-openai-whisper-api

OpenAI Whisper API-style local server, running on FastAPI. It is intended for companies behind proxies or security firewalls.

This API is compatible with the [OpenAI Whisper (speech to text) API](https://openai.com/blog/introducing-chatgpt-and-whisper-apis). See also [Create transcription - API Reference - OpenAI API](https://platform.openai.com/docs/api-reference/audio/create).

Some of the code has been copied from [whisper-ui](https://github.com/hayabhay/whisper-ui).


Now, this server emulates the following OpenAI APIs.

- (whisper) [Speech to text - OpenAI API](https://platform.openai.com/docs/guides/speech-to-text)
- (chat) [Audio generation - OpenAI API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)

## Running Environment

This was built & tested on Python 3.10.9, Ubuntu 22.04/WSL2.

- openai=1.55.0
- openai-whisper=20240930

## Setup

```bash
sudo apt install ffmpeg
pip install fastapi python-multipart pydantic uvicorn openai-whisper httpx
# or pip install -r requirements.txt
```

or

```bash
docker compose build
```

## Usage

### server
```bash
export WHISPER_MODEL=turbo   # 'turbo' if not specified
export PYTHONPATH=.
uvicorn main:app --host 0.0.0.0
```

or

```bash
docker compose up
```

### client

note: the Authorization header is ignored.

example 1: typical use case, identical to the OpenAI Whisper API example

```bash
curl --request POST \
  http://127.0.0.1:8000/v1/audio/transcriptions \
  -H "Authorization: Bearer $OPENAI_API_KEY" \
  -H "Content-Type: multipart/form-data" \
  -F model="whisper-1" \
  -F file="@/path/to/file/openai.mp3"
```

example 2: set the output format to text, as described in the quickstart

```bash
curl --request POST \
  http://127.0.0.1:8000/v1/audio/transcriptions \
  -H "Content-Type: multipart/form-data" \
  -F model="whisper-1" \
  -F file="@/path/to/file/openai.mp3" \
  -F response_format=text
```

example 3: Windows PowerShell 5

```powershell
Set-ExecutionPolicy -Scope Process -ExecutionPolicy RemoteSigned
$env:OPENAI_API_KEY="dummy"
$env:OPENAI_BASE_URL="http://localhost:8000/v1"

.\tests\powershell\Invoke-Whisper-Audio-Transcription.ps1 "C:\temp\alloy.wav"
```

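example 4: Python, using the official `openai` SDK. This is a minimal sketch (not part of the repository's test suite); it assumes the server is reachable at `http://localhost:8000` and uses a dummy API key, since the Authorization header is ignored.

```python
from openai import OpenAI

# Any API key works here because this server ignores the Authorization header.
client = OpenAI(api_key="dummy", base_url="http://localhost:8000/v1")

# POST a local audio file to the /v1/audio/transcriptions endpoint.
with open("/path/to/file/openai.mp3", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
    )

print(transcript.text)
```
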
## experimental: gpt-4o-audio-preview, chat-completions

Currently only the "output text only" mode is supported.
If "output text and audio" is specified, the server effectively returns an "output text only" response (the transcription is placed in `audio.transcript`; no audio data is generated).

- output text and audio
  - modalities=["text", "audio"], audio={"voice": "alloy", "format": "wav"}

```python
completion = client.chat.completions.create(
    model="gpt-4o-audio-preview-2024-10-01",
    modalities=["text", "audio"],
    audio={"voice": "alloy", "format": "wav"},
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What is in this recording?"
                },
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": encoded_string,
                        "format": "wav"
                    }
                }
            ]
        },
    ]
)

print(completion.choices[0].message.audio.transcript)
```

```python
ChatCompletion(id='chatcmpl-AXQt8BTMW4Gh1OcJ5qStVDNZGdzSq', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', audio=ChatCompletionAudio(id='audio_6744555cc6d48190b67e70798ab606c3', data='{{base64-wav}}', expires_at=1732535148, transcript='The recording contains a voice stating that the sun rises in the east and sets in the west, a fact that has been observed by humans for thousands of years.'), function_call=None, tool_calls=None))], created=1732531546, model='gpt-4o-audio-preview-2024-10-01', object='chat.completion', service_tier=None, system_fingerprint='fp_130ac2f073', usage=CompletionUsage(completion_tokens=236, prompt_tokens=86, total_tokens=322, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=188, reasoning_tokens=0, rejected_prediction_tokens=0, text_tokens=48), prompt_tokens_details=PromptTokensDetails(audio_tokens=69, cached_tokens=0, text_tokens=17, image_tokens=0)))
```

- output text only
  - modalities=["text"]

```python
completion = client.chat.completions.create(
    model="gpt-4o-audio-preview-2024-10-01",
    modalities=["text"],
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What is in this recording?"
                },
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": encoded_string,
                        "format": "wav"
                    }
                }
            ]
        },
    ]
)

print(completion.choices[0].message.content)
```

```python
ChatCompletion(id='chatcmpl-AXTBlZypmtf1CCWrR6X5uX55r4VHY', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="The recording contains a statement about the sun's movement, stating that the sun rises in the east and sets in the west, a fact that has been observed by humans for thousands of years.", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1732540389, model='gpt-4o-audio-preview-2024-10-01', object='chat.completion', service_tier=None, system_fingerprint='fp_130ac2f073', usage=CompletionUsage(completion_tokens=38, prompt_tokens=86, total_tokens=124, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0, text_tokens=38), prompt_tokens_details=PromptTokensDetails(audio_tokens=69, cached_tokens=0, text_tokens=17, image_tokens=0)))
```

### experimental: Dify integration

- LLM registration ... OK
- audio file transcription ... NG
  - dify-0.12.1 does not pass a user prompt that contains an audio file (data) to this tiny-openai-whisper-api server.
  - probably, on dify-0.12.1, "native (audio) file processing capabilities" are available only for openai:gpt-4o-audio-preview. How can we provide this feature for OpenAI-compatible LLMs?

## TODO

- more inference parameters should be supported; currently only `temperature` is supported.
- a text prompt (passed to the whisper module) should be supported; currently the text prompt is ignored.
- some of the response property values are dummy (static).
- 'speech-to-text' chat completion available on dify
  - discussed at https://discord.com/channels/1082486657678311454/1236911815695400960/1311646643581353984
  - patch is https://github.com/fujita-h/dify/commit/39cc3a38d1762da3d5534615580590441f1c9c9b
  - the patch works well with this code.

## License

Whisper is licensed under MIT. Everything else by [morioka](https://github.com/morioka) is licensed under MIT.

--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
services:
  whisper-api:
    image: morioka/tiny-openai-whisper-api
    build: .
    container_name: whisper-api
    restart: unless-stopped
    init: true
    ports:
      - 8000:8000
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request, Form, UploadFile, File
from fastapi import HTTPException, status
from fastapi.responses import JSONResponse

import os
import shutil
from pathlib import Path
from typing import Any, List, Union, Optional

from datetime import timedelta

import numpy as np
import whisper
import torch

import uvicorn
import json
import base64
import tempfile


# curl https://api.openai.com/v1/audio/transcriptions \
#   -H "Authorization: Bearer $OPENAI_API_KEY" \
#   -H "Content-Type: multipart/form-data" \
#   -F model="whisper-1" \
#   -F file="@/path/to/file/openai.mp3"

#{
#  "text": "Imagine the wildest idea that you've ever had, and you're curious about how it might scale to something that's a 100, a 1,000 times bigger..."
31 | #} 32 | 33 | WHISPER_MODEL = os.environ.get('WHISPER_MODEL', 'turbo') 34 | CHAT_MODEL = os.environ.get('CHAT_MODEL', None) 35 | 36 | whisper_model = None 37 | chat_model = None 38 | 39 | @asynccontextmanager 40 | async def lifespan(app: FastAPI): 41 | global whisper_model 42 | # Load the ML model 43 | device = "cuda" if torch.cuda.is_available() else "cpu" 44 | whisper_model = whisper.load_model(WHISPER_MODEL, device=device, in_memory=True) 45 | 46 | yield 47 | 48 | # Clean up the ML models and release the resources 49 | del whisper_model 50 | whisper_model = None 51 | 52 | app = FastAPI(lifespan=lifespan) 53 | 54 | # ----- 55 | # copied from https://github.com/hayabhay/whisper-ui 56 | # Whisper transcription functions 57 | def transcribe(audio_path: str, **whisper_args): 58 | """Transcribe the audio file using whisper""" 59 | global whisper_model 60 | 61 | # Set configs & transcribe 62 | if whisper_args["temperature_increment_on_fallback"] is not None: 63 | whisper_args["temperature"] = tuple( 64 | np.arange(whisper_args["temperature"], 1.0 + 1e-6, whisper_args["temperature_increment_on_fallback"]) 65 | ) 66 | else: 67 | whisper_args["temperature"] = [whisper_args["temperature"]] 68 | 69 | del whisper_args["temperature_increment_on_fallback"] 70 | 71 | transcript = whisper_model.transcribe( 72 | audio_path, 73 | **whisper_args, 74 | ) 75 | 76 | return transcript 77 | 78 | WHISPER_DEFAULT_SETTINGS = { 79 | # "whisper_model": "turbo", 80 | "temperature": 0.0, 81 | "temperature_increment_on_fallback": 0.2, 82 | "no_speech_threshold": 0.6, 83 | "logprob_threshold": -1.0, 84 | "compression_ratio_threshold": 2.4, 85 | "condition_on_previous_text": True, 86 | "verbose": False, 87 | "task": "transcribe", 88 | } 89 | 90 | UPLOAD_DIR="/tmp" 91 | 92 | @app.get('/v1/models') 93 | async def v1_models(request: Request): 94 | content = { 95 | "object": "list", 96 | "data": [ 97 | { 98 | "id": "whisper-1", 99 | "object": "model", 100 | "created": 17078881749, 101 | "owned_by": "tiny-whisper-api" 102 | }, 103 | { 104 | "id": "gpt-4o-audio-preview", 105 | "object": "model", 106 | "created": 17078881749, 107 | "owned_by": "tiny-whisper-api" 108 | }, 109 | { 110 | "id": "gpt-4o-audio-preview-2024-10-01", 111 | "object": "model", 112 | "created": 17078881749, 113 | "owned_by": "tiny-whisper-api" 114 | } 115 | ] 116 | } 117 | 118 | headers = { 119 | 'Content-Type': 'application/json' 120 | } 121 | 122 | response_status_code = 200 123 | 124 | resp = JSONResponse( 125 | content = content, 126 | headers = headers, 127 | status_code = response_status_code 128 | ) 129 | 130 | return resp 131 | 132 | # gpt-4o-audio-preview:OpenAI の Chat Completions API でオーディオを扱う新機能を軽く見てみる【2024/10/17リリース】 #ChatGPT - Qiita 133 | # https://qiita.com/youtoy/items/5a87fd22cc88d8c34d6d 134 | 135 | # https://platform.openai.com/docs/api-reference/chat/create 136 | ''' 137 | curl "https://api.openai.com/v1/chat/completions" \ 138 | -H "Content-Type: application/json" \ 139 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 140 | -d '{ 141 | "model": "gpt-4o-audio-preview", 142 | "modalities": ["text", "audio"], 143 | "audio": { "voice": "alloy", "format": "wav" }, 144 | "messages": [ 145 | { 146 | "role": "user", 147 | "content": [ 148 | { "type": "text", "text": "What is in this recording?" 
},
        {
          "type": "input_audio",
          "input_audio": {
            "data": "",
            "format": "wav"
          }
        }
      ]
    }
  ]
}'
'''

# modalities = ["text"]
CHAT_COMPLETIONS_RESPONSE_TEMPLATE='''
{
  "id": "chatcmpl-123",
  "object": "chat.completion",
  "created": 1677652288,
  "model": "gpt-4o-audio-preview-2024-10-01",
  "system_fingerprint": "fp_44709d6fcb",
  "service_tier": null,
  "choices": [{
    "index": 0,
    "message": {
      "role": "assistant",
      "content": "Hello there, how may I assist you today?",
      "refusal": null
    },
    "logprobs": null,
    "finish_reason": "stop"
  }],
  "usage": {
    "prompt_tokens": 86,
    "prompt_tokens_details": {
      "audio_tokens": 69,
      "cached_tokens": 0,
      "text_tokens": 17,
      "image_tokens": 0
    },
    "completion_tokens": 36,
    "total_tokens": 122,
    "completion_tokens_details": {
      "reasoning_tokens": 0,
      "accepted_prediction_tokens": 0,
      "rejected_prediction_tokens": 0,
      "audio_tokens": 0,
      "text_tokens": 17
    }
  }
}
'''

# modalities = ["text", "audio"]
CHAT_COMPLETIONS_RESPONSE_AUDIO_OUTPUT_TEMPLATE='''
{
  "id": "chatcmpl-123",
  "object": "chat.completion",
  "created": 1677652288,
  "model": "gpt-4o-audio-preview-2024-10-01",
  "system_fingerprint": "fp_44709d6fcb",
  "service_tier": null,
  "choices": [{
    "index": 0,
    "message": {
      "role": "assistant",
      "content": null,
      "refusal": null,
      "audio": {
        "id": "audio_6744555cc6d48190b67e70798ab606c3",
        "data": "response_audio_data_base64",
        "expires_at": 1732535148,
        "transcript": "response_transcript"
      }
    },
    "logprobs": null,
    "finish_reason": "stop",
    "function_call": null,
    "tool_calls": null
  }],
  "usage": {
    "prompt_tokens": 86,
    "prompt_tokens_details": {
      "audio_tokens": 69,
      "cached_tokens": 0,
      "text_tokens": 17,
      "image_tokens": 0
    },
    "completion_tokens": 236,
    "total_tokens": 322,
    "completion_tokens_details": {
      "reasoning_tokens": 0,
      "accepted_prediction_tokens": 0,
      "rejected_prediction_tokens": 0,
      "audio_tokens": 188,
      "text_tokens": 48
    }
  }
}
'''

CHAT_COMPLETIONS_RESPONSE_DIFY_PING_TEMPLATE='''
{
  "id": "chatcmpl-123",
  "object": "chat.completion",
  "created": 1677652288,
  "model": "gpt-4o-audio-preview-2024-10-01",
  "system_fingerprint": "fp_44709d6fcb",
  "service_tier": null,
  "choices": [{
    "index": 0,
    "message": {
      "role": "assistant",
      "content": "Pong",
      "refusal": null
    },
    "logprobs": null,
    "finish_reason": "stop"
  }],
  "usage": {
    "prompt_tokens": 5,
    "prompt_tokens_details": {
      "audio_tokens": 0,
      "cached_tokens": 0,
      "text_tokens": 5,
      "image_tokens": 0
    },
    "completion_tokens": 5,
    "total_tokens": 10,
    "completion_tokens_details": {
      "reasoning_tokens": 0,
      "accepted_prediction_tokens": 0,
      "rejected_prediction_tokens": 0,
      "audio_tokens": 0,
      "text_tokens": 5
    }
  }
}
'''

def is_base64_encoded(s: str) -> bool:
    try:
        # check padding (the length must be a multiple of 4)
        if len(s) % 4 != 0:
            return False

        # try to decode it
        base64.b64decode(s, validate=True)
        return True
    except Exception:
        return False

def save_base64_to_temp_file(base64_string: str) -> Optional[str]:
    try:
        # decode the base64 string
        binary_data = base64.b64decode(base64_string)

        # create a temporary file
        with tempfile.NamedTemporaryFile(delete=False, mode='wb') as temp_file:
            temp_file.write(binary_data)
            temp_file_path = temp_file.name  # path of the temporary file

        return temp_file_path

    except Exception:
        return None

@app.post('/v1/chat/completions')
async def v1_chat_completions(request: Request):

    global chat_model

    req_body = await request.json()

    model = req_body['model']
    try:
        modalities = req_body['modalities']
    except KeyError:
        modalities = ['text']
    try:
        audio = req_body['audio']
    except KeyError:
        audio = None

    if model not in ['gpt-4o-audio-preview', 'gpt-4o-audio-preview-2024-10-01']:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Bad Request, unsupported model"
        )

    if 'text' not in modalities:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Bad Request, 'text' is not in modalities"
        )

    if 'audio' in modalities:
        if audio is None:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Bad Request, 'audio' is in modalities, but its attributes are not specified."
            )

        if audio['voice'] not in ['ash', 'ballad', 'coral', 'sage', 'verse', 'alloy', 'echo', 'shimmer']:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Bad Request, unsupported voice"
            )
        if audio['format'] not in ['wav', 'mp3', 'flac', 'opus', 'pcm16']:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Bad Request, unsupported format"
            )

    messages = req_body['messages']
    content = None
    for m in messages:
        for c in m['content']:
            if 'input_audio' in c:
                assert 'data' in c['input_audio']
                assert 'format' in c['input_audio']
                content = c
                break

    if content is None:
        # dify ping
        for m in messages:
            if m['content'] in ['ping']:
                resp_body = json.loads(CHAT_COMPLETIONS_RESPONSE_DIFY_PING_TEMPLATE)
                resp_body['model'] = model
                resp = JSONResponse(
                    content = resp_body,
                    headers = {
                        'Content-Type': 'application/json'
                    },
                    status_code = 200
                )
                return resp
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Bad Request, missing content"
        )

    # is the content data base64-encoded?
    data = content['input_audio']['data']
    if not is_base64_encoded(data):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Bad Request, content is not base64-encoded."
        )

    settings = WHISPER_DEFAULT_SETTINGS.copy()
    #settings['temperature'] = temperature

    temp_content_path = save_base64_to_temp_file(data)
    if temp_content_path is None:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Bad Request, failed to decode and save the audio data."
        )

    transcript = transcribe(audio_path=temp_content_path, **settings)
    text = transcript['text']

    if temp_content_path:
        # TODO: delete this asynchronously
        os.remove(temp_content_path)

    #print(transcript)
    #print(text)

    if audio is not None:
        resp_body = json.loads(CHAT_COMPLETIONS_RESPONSE_AUDIO_OUTPUT_TEMPLATE)
        resp_body['choices'][0]['message']['audio']['transcript'] = text
    else:
        resp_body = json.loads(CHAT_COMPLETIONS_RESPONSE_TEMPLATE)
        resp_body['choices'][0]['message']['content'] = text
        resp_body['choices'][0]['delta'] = resp_body['choices'][0]['message'].copy()

    resp_body['model'] = model

    if chat_model:
        pass
        # Replace the input_audio parts of the request messages with the transcription,
        # then forward them to chat_model and use its response.
        # ? Which API key and endpoint should chat_model use?
        # ? How should usage statistics such as token counts be corrected?
        # ? What about the modalities info? Can it be ignored as long as no audio output is produced?

    resp = JSONResponse(
        content = resp_body,
        headers = {
            'Content-Type': 'application/json'
        },
        status_code = 200
    )

    return resp


@app.post('/v1/audio/transcriptions')
async def transcriptions(model: str = Form(...),
                         file: UploadFile = File(...),
                         response_format: Optional[str] = Form(None),
                         language: Optional[str] = Form(None),
                         prompt: Optional[str] = Form(None),
                         temperature: Optional[float] = Form(None)):

    assert model == "whisper-1"
    if file is None:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Bad Request, bad file"
        )
    if response_format is None:
        response_format = 'json'
    if response_format not in ['json',
                               'text',
                               'srt',
                               'verbose_json',
                               'vtt']:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Bad Request, bad response_format"
        )
    if temperature is None:
        temperature = 0.0
    if temperature < 0.0 or temperature > 1.0:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Bad Request, bad temperature"
        )

    filename = file.filename
    fileobj = file.file

    # TODO: use a temporary file that keeps the extension but avoids filename collisions
    upload_name = os.path.join(UPLOAD_DIR, filename)
    upload_file = open(upload_name, 'wb')
    shutil.copyfileobj(fileobj, upload_file)
    upload_file.close()

    settings = WHISPER_DEFAULT_SETTINGS.copy()
    settings['temperature'] = temperature
    if language is not None:
        settings['language'] = language # TODO: check ISO-639-1 format

    transcript = transcribe(audio_path=upload_name, **settings)

    if upload_name:
        os.remove(upload_name)

    if response_format in ['text']:
        return transcript['text']

    if response_format in ['srt']:
        ret = ""
        for seg in transcript['segments']:

            td_s = timedelta(milliseconds=seg["start"]*1000)
            td_e = timedelta(milliseconds=seg["end"]*1000)

            # SRT timestamps use a comma before the milliseconds (HH:MM:SS,mmm)
            t_s = f'{td_s.seconds//3600:02}:{(td_s.seconds//60)%60:02}:{td_s.seconds%60:02},{td_s.microseconds//1000:03}'
            t_e = f'{td_e.seconds//3600:02}:{(td_e.seconds//60)%60:02}:{td_e.seconds%60:02},{td_e.microseconds//1000:03}'

            # SRT cue numbers are 1-based, while whisper segment ids start at 0
            ret += '{}\n{} --> {}\n{}\n\n'.format(seg["id"] + 1, t_s, t_e, seg["text"])
        ret += '\n'
        return ret

    if response_format in ['vtt']:
        ret = "WEBVTT\n\n"
        for seg in transcript['segments']:
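            # WebVTT cue timings keep the '.' decimal separator (HH:MM:SS.mmm), unlike SRT which uses ','.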
524 | td_s = timedelta(milliseconds=seg["start"]*1000) 525 | td_e = timedelta(milliseconds=seg["end"]*1000) 526 | 527 | t_s = f'{td_s.seconds//3600:02}:{(td_s.seconds//60)%60:02}:{td_s.seconds%60:02}.{td_s.microseconds//1000:03}' 528 | t_e = f'{td_e.seconds//3600:02}:{(td_e.seconds//60)%60:02}:{td_e.seconds%60:02}.{td_e.microseconds//1000:03}' 529 | 530 | ret += "{} --> {}\n{}\n\n".format(t_s, t_e, seg["text"]) 531 | return ret 532 | 533 | if response_format in ['verbose_json']: 534 | transcript.setdefault('task', WHISPER_DEFAULT_SETTINGS['task']) 535 | transcript.setdefault('duration', transcript['segments'][-1]['end']) 536 | if transcript['language'] == 'ja': 537 | transcript['language'] = 'japanese' 538 | return transcript 539 | 540 | return {'text': transcript['text']} 541 | 542 | def main(): 543 | uvicorn.run("main:app", host="0.0.0.0", port=8000, log_level ="info") 544 | 545 | if __name__ == "__main__": 546 | # main() 547 | pass 548 | 549 | 550 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | pydantic 3 | python-multipart 4 | openai-whisper 5 | uvicorn 6 | -------------------------------------------------------------------------------- /tests/openai/test_audio_transcription.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ ! -e alloy.wav ]; then 3 | wget https://openaiassets.blob.core.windows.net/\$web/API/docs/audio/alloy.wav 4 | fi 5 | curl http://localhost:8000/v1/audio/transcriptions -H "Content-Type: multipart/form-data" -F model="whisper-1" -F file="@alloy.wav" -F response_format=text 6 | -------------------------------------------------------------------------------- /tests/openai/test_chat_completions.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import requests 3 | from openai import OpenAI 4 | 5 | client = OpenAI() 6 | 7 | # Fetch the audio file and convert it to a base64 encoded string 8 | url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav" 9 | response = requests.get(url) 10 | response.raise_for_status() 11 | wav_data = response.content 12 | encoded_string = base64.b64encode(wav_data).decode('utf-8') 13 | 14 | completion = client.chat.completions.create( 15 | model="gpt-4o-audio-preview-2024-10-01", 16 | modalities=["text", "audio"], 17 | audio={"voice": "alloy", "format": "wav"}, 18 | messages=[ 19 | { 20 | "role": "user", 21 | "content": [ 22 | { 23 | "type": "text", 24 | "text": "What is in this recording?" 
25 | }, 26 | { 27 | "type": "input_audio", 28 | "input_audio": { 29 | "data": encoded_string, 30 | "format": "wav" 31 | } 32 | } 33 | ] 34 | }, 35 | ] 36 | ) 37 | 38 | print(completion) 39 | print(completion.choices[0].message.audio.transcript) 40 | -------------------------------------------------------------------------------- /tests/openai/test_chat_completions_text_modality_only.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import requests 3 | from openai import OpenAI 4 | 5 | client = OpenAI() 6 | client = OpenAI(base_url='http://localhost:8000/v1') 7 | 8 | # Fetch the audio file and convert it to a base64 encoded string 9 | url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav" 10 | response = requests.get(url) 11 | response.raise_for_status() 12 | wav_data = response.content 13 | encoded_string = base64.b64encode(wav_data).decode('utf-8') 14 | 15 | completion = client.chat.completions.create( 16 | model="gpt-4o-audio-preview-2024-10-01", 17 | # modalities=["text", "audio"], 18 | # audio={"voice": "alloy", "format": "wav"}, 19 | modalities=["text"], 20 | messages=[ 21 | { 22 | "role": "user", 23 | "content": [ 24 | { 25 | "type": "text", 26 | "text": "What is in this recording?" 27 | }, 28 | { 29 | "type": "input_audio", 30 | "input_audio": { 31 | "data": encoded_string, 32 | "format": "wav" 33 | } 34 | } 35 | ] 36 | }, 37 | ] 38 | ) 39 | 40 | print(completion) 41 | print(completion.choices[0].message.content) 42 | -------------------------------------------------------------------------------- /tests/powershell/Invoke-Whisper-Audio-Transcription.ps1: -------------------------------------------------------------------------------- 1 | # 環境変数からAPIキーを取得 2 | $apiKey = $env:OPENAI_API_KEY 3 | if (-not $apiKey) { 4 | Write-Error "環境変数 'OPENAI_API_KEY' が設定されていません。APIキーを設定してください。" 5 | exit 1 6 | } 7 | 8 | # 環境変数からAPI URLを取得 9 | $apiBase = $env:OPENAI_BASE_URL 10 | if (-not $apiBase) { 11 | $apiBase = $env:OPENAI_API_BASE 12 | if (-not $apiBase) { 13 | $apiBase = "https://api.openai.com/v1" 14 | } 15 | } 16 | $openAiUrl = $apiBase + "/audio/transcriptions" 17 | #$openAiUrl = "http://localhost:8000/v1/audio/transcriptions" 18 | 19 | # コマンドライン引数で音声ファイルのパスを取得 20 | if ($args.Count -lt 1) { 21 | Write-Error "音声ファイルのパスを指定してください。" 22 | Write-Host "使用例: .\TranscribeAudio.ps1 path\to\audio.mp4" 23 | exit 1 24 | } 25 | $audioFilePath = $args[0] 26 | #$audioFilePath = "c:\temp\alloy.wav" # MP4ファイルのパスを指定 27 | 28 | # ファイルの存在を確認 29 | if (-Not (Test-Path -Path $audioFilePath)) { 30 | Write-Error "指定したファイルが見つかりません: $audioFilePath" 31 | exit 1 32 | } 33 | 34 | # ファイルの拡張子を確認し、MIME タイプを設定 35 | $extension = [System.IO.Path]::GetExtension($audioFilePath).ToLower() 36 | switch ($extension) { 37 | ".mp4" { $mimeType = "audio/mp4" } 38 | ".wav" { $mimeType = "audio/wav" } 39 | ".mp3" { $mimeType = "audio/mpeg" } 40 | default { 41 | Write-Error "サポートされていないファイル形式: $extension" 42 | exit 1 43 | } 44 | } 45 | 46 | # マルチパートリクエスト用データを作成 47 | $boundary = [System.Guid]::NewGuid().ToString() 48 | $LF = "`r`n" 49 | 50 | $fileBytes = [System.IO.File]::ReadAllBytes($audioFilePath) 51 | $fileContent = [System.Text.Encoding]::GetEncoding("ISO-8859-1").GetString($fileBytes) 52 | $fileName = [System.IO.Path]::GetFileName($audioFilePath) 53 | 54 | $body = ( 55 | "--$boundary$LF" + 56 | "Content-Disposition: form-data; name=`"file`"; filename=`"$fileName`"$LF" + 57 | "Content-Type: $mimeType$LF$LF" + 58 | $fileContent + $LF + 59 | "--$boundary$LF" 
+ 60 | "Content-Disposition: form-data; name=`"model`"$LF$LF" + 61 | "whisper-1$LF" + 62 | "--$boundary--$LF" 63 | ) 64 | 65 | $headers = @{ 66 | "Authorization" = "Bearer $apiKey" 67 | "Content-Type" = "multipart/form-data; boundary=$boundary" 68 | } 69 | 70 | # APIを呼び出す 71 | try { 72 | $response = Invoke-RestMethod -Uri $openAiUrl -Headers $headers -Method Post -Body $body 73 | # UTF-8でコンソールに出力する 74 | #[Console]::OutputEncoding = [System.Text.Encoding]::UTF8 75 | #Write-Output $response.text 76 | 77 | $garbledText = $response.text 78 | # 文字化けしている文字列をバイト配列に変換 79 | $byteArray = [System.Text.Encoding]::GetEncoding("ISO-8859-1").GetBytes($garbledText) 80 | # バイト配列をUTF-8としてデコード 81 | $decodedText = [System.Text.Encoding]::UTF8.GetString($byteArray) 82 | 83 | Write-Output $decodedText 84 | 85 | } catch { 86 | Write-Error "API呼び出し中にエラーが発生しました: $_" 87 | } 88 | --------------------------------------------------------------------------------