├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── docker-compose.yml
├── main.py
├── requirements.txt
└── tests
    ├── openai
    │   ├── test_audio_transcription.sh
    │   ├── test_chat_completions.py
    │   └── test_chat_completions_text_modality_only.py
    └── powershell
        └── Invoke-Whisper-Audio-Transcription.ps1

/.gitignore:
--------------------------------------------------------------------------------
.python-version
__pycache__/
tests/openai/*.log
tests/openai/*.wav

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.11.10-slim

# Run updates and install ffmpeg
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y ffmpeg && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy and install the requirements
COPY ./requirements.txt /requirements.txt

# Pip install the dependencies
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r /requirements.txt

# Copy the current directory contents into the container at /app
COPY main.py /app/main.py

# Set the working directory to /app
WORKDIR /app

# Expose port 8000
EXPOSE 8000

# Run the app
CMD uvicorn main:app --host 0.0.0.0

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Yasuhiro Morioka

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# tiny-openai-whisper-api

OpenAI Whisper API-style local server, running on FastAPI. It is intended for companies behind proxies or security firewalls.

This API is compatible with the [OpenAI Whisper (speech to text) API](https://openai.com/blog/introducing-chatgpt-and-whisper-apis). See also [Create transcription - API Reference - OpenAI API](https://platform.openai.com/docs/api-reference/audio/create).

Some of the code has been copied from [whisper-ui](https://github.com/hayabhay/whisper-ui).


Now, this server emulates the following OpenAI APIs.

- (whisper) [Speech to text - OpenAI API](https://platform.openai.com/docs/guides/speech-to-text)
- (chat) [Audio generation - OpenAI API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)

## Running Environment

This was built & tested on Python 3.10.9, Ubuntu 22.04/WSL2.

- openai=1.55.0
- openai-whisper=20240930

## Setup

```bash
sudo apt install ffmpeg
pip install fastapi python-multipart pydantic uvicorn openai-whisper httpx
# or pip install -r requirements.txt
```

or

```bash
docker compose build
```

## Usage

### server
```bash
export WHISPER_MODEL=turbo   # 'turbo' if not specified
export PYTHONPATH=.
uvicorn main:app --host 0.0.0.0
```

or

```bash
docker compose up
```

### client

note: the Authorization header is ignored.

example 1: typical use case, identical to the OpenAI Whisper API example

```bash
curl --request POST \
  http://127.0.0.1:8000/v1/audio/transcriptions \
  -H "Authorization: Bearer $OPENAI_API_KEY" \
  -H "Content-Type: multipart/form-data" \
  -F model="whisper-1" \
  -F file="@/path/to/file/openai.mp3"
```

example 2: set the output format to text, as described in the quickstart

```bash
curl --request POST \
  http://127.0.0.1:8000/v1/audio/transcriptions \
  -H "Content-Type: multipart/form-data" \
  -F model="whisper-1" \
  -F file="@/path/to/file/openai.mp3" \
  -F response_format=text
```

example 3: Windows PowerShell 5

```powershell
Set-ExecutionPolicy -Scope Process -ExecutionPolicy RemoteSigned
$env:OPENAI_API_KEY="dummy"
$env:OPENAI_BASE_URL="http://localhost:8000/v1"

.\tests\powershell\Invoke-Whisper-Audio-Transcription.ps1 "C:\temp\alloy.wav"
```

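example 4: Python, using the official `openai` SDK. This is a minimal sketch (not part of the repository's test suite); it assumes the server is reachable at `http://localhost:8000` and uses a dummy API key, since the Authorization header is ignored.

```python
from openai import OpenAI

# Any API key works here because this server ignores the Authorization header.
client = OpenAI(api_key="dummy", base_url="http://localhost:8000/v1")

# POST a local audio file to the /v1/audio/transcriptions endpoint.
with open("/path/to/file/openai.mp3", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
    )

print(transcript.text)
```
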
## experimental: gpt-4o-audio-preview, chat-completions

Currently only the "output text only" mode is supported.
If "output text and audio" is specified, the server effectively returns an "output text only" response (the transcription is placed in `audio.transcript`; no audio data is generated).

- output text and audio
  - modalities=["text", "audio"], audio={"voice": "alloy", "format": "wav"}

```python
completion = client.chat.completions.create(
    model="gpt-4o-audio-preview-2024-10-01",
    modalities=["text", "audio"],
    audio={"voice": "alloy", "format": "wav"},
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What is in this recording?"
                },
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": encoded_string,
                        "format": "wav"
                    }
                }
            ]
        },
    ]
)

print(completion.choices[0].message.audio.transcript)
```

```python
ChatCompletion(id='chatcmpl-AXQt8BTMW4Gh1OcJ5qStVDNZGdzSq', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', audio=ChatCompletionAudio(id='audio_6744555cc6d48190b67e70798ab606c3', data='{{base64-wav}}', expires_at=1732535148, transcript='The recording contains a voice stating that the sun rises in the east and sets in the west, a fact that has been observed by humans for thousands of years.'), function_call=None, tool_calls=None))], created=1732531546, model='gpt-4o-audio-preview-2024-10-01', object='chat.completion', service_tier=None, system_fingerprint='fp_130ac2f073', usage=CompletionUsage(completion_tokens=236, prompt_tokens=86, total_tokens=322, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=188, reasoning_tokens=0, rejected_prediction_tokens=0, text_tokens=48), prompt_tokens_details=PromptTokensDetails(audio_tokens=69, cached_tokens=0, text_tokens=17, image_tokens=0)))
```

- output text only
  - modalities=["text"]

```python
completion = client.chat.completions.create(
    model="gpt-4o-audio-preview-2024-10-01",
    modalities=["text"],
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What is in this recording?"
                },
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": encoded_string,
                        "format": "wav"
                    }
                }
            ]
        },
    ]
)

print(completion.choices[0].message.content)
```

```python
ChatCompletion(id='chatcmpl-AXTBlZypmtf1CCWrR6X5uX55r4VHY', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="The recording contains a statement about the sun's movement, stating that the sun rises in the east and sets in the west, a fact that has been observed by humans for thousands of years.", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1732540389, model='gpt-4o-audio-preview-2024-10-01', object='chat.completion', service_tier=None, system_fingerprint='fp_130ac2f073', usage=CompletionUsage(completion_tokens=38, prompt_tokens=86, total_tokens=124, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0, text_tokens=38), prompt_tokens_details=PromptTokensDetails(audio_tokens=69, cached_tokens=0, text_tokens=17, image_tokens=0)))
```

### experimental: Dify integration

- LLM registration ... OK
- audio file transcription ... NG
  - dify-0.12.1 does not pass a user prompt that contains an audio file (data) to this tiny-openai-whisper-api server.
  - probably, on dify-0.12.1, "native (audio) file processing capabilities" are available only for openai:gpt-4o-audio-preview. How can we provide this feature for OpenAI-compatible LLMs?

## TODO

- more inference parameters should be supported; currently only `temperature` is supported.
- a text prompt (passed to the whisper module) should be supported; currently the text prompt is ignored.
- some of the response property values are dummy (static).
- 'speech-to-text' chat completion available on dify
  - discussed at https://discord.com/channels/1082486657678311454/1236911815695400960/1311646643581353984
  - patch is https://github.com/fujita-h/dify/commit/39cc3a38d1762da3d5534615580590441f1c9c9b
  - the patch works well with this code.

## License

Whisper is licensed under MIT. Everything else by [morioka](https://github.com/morioka) is licensed under MIT.

--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
services:
  whisper-api:
    image: morioka/tiny-openai-whisper-api
    build: .
    container_name: whisper-api
    restart: unless-stopped
    init: true
    ports:
      - 8000:8000
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request, Form, UploadFile, File
from fastapi import HTTPException, status
from fastapi.responses import JSONResponse

import os
import shutil
from pathlib import Path
from typing import Any, List, Union, Optional

from datetime import timedelta

import numpy as np
import whisper
import torch

import uvicorn
import json
import base64
import tempfile


# curl https://api.openai.com/v1/audio/transcriptions \
#   -H "Authorization: Bearer $OPENAI_API_KEY" \
#   -H "Content-Type: multipart/form-data" \
#   -F model="whisper-1" \
#   -F file="@/path/to/file/openai.mp3"

#{
#  "text": "Imagine the wildest idea that you've ever had, and you're curious about how it might scale to something that's a 100, a 1,000 times bigger..."
31 | #} 32 | 33 | WHISPER_MODEL = os.environ.get('WHISPER_MODEL', 'turbo') 34 | CHAT_MODEL = os.environ.get('CHAT_MODEL', None) 35 | 36 | whisper_model = None 37 | chat_model = None 38 | 39 | @asynccontextmanager 40 | async def lifespan(app: FastAPI): 41 | global whisper_model 42 | # Load the ML model 43 | device = "cuda" if torch.cuda.is_available() else "cpu" 44 | whisper_model = whisper.load_model(WHISPER_MODEL, device=device, in_memory=True) 45 | 46 | yield 47 | 48 | # Clean up the ML models and release the resources 49 | del whisper_model 50 | whisper_model = None 51 | 52 | app = FastAPI(lifespan=lifespan) 53 | 54 | # ----- 55 | # copied from https://github.com/hayabhay/whisper-ui 56 | # Whisper transcription functions 57 | def transcribe(audio_path: str, **whisper_args): 58 | """Transcribe the audio file using whisper""" 59 | global whisper_model 60 | 61 | # Set configs & transcribe 62 | if whisper_args["temperature_increment_on_fallback"] is not None: 63 | whisper_args["temperature"] = tuple( 64 | np.arange(whisper_args["temperature"], 1.0 + 1e-6, whisper_args["temperature_increment_on_fallback"]) 65 | ) 66 | else: 67 | whisper_args["temperature"] = [whisper_args["temperature"]] 68 | 69 | del whisper_args["temperature_increment_on_fallback"] 70 | 71 | transcript = whisper_model.transcribe( 72 | audio_path, 73 | **whisper_args, 74 | ) 75 | 76 | return transcript 77 | 78 | WHISPER_DEFAULT_SETTINGS = { 79 | # "whisper_model": "turbo", 80 | "temperature": 0.0, 81 | "temperature_increment_on_fallback": 0.2, 82 | "no_speech_threshold": 0.6, 83 | "logprob_threshold": -1.0, 84 | "compression_ratio_threshold": 2.4, 85 | "condition_on_previous_text": True, 86 | "verbose": False, 87 | "task": "transcribe", 88 | } 89 | 90 | UPLOAD_DIR="/tmp" 91 | 92 | @app.get('/v1/models') 93 | async def v1_models(request: Request): 94 | content = { 95 | "object": "list", 96 | "data": [ 97 | { 98 | "id": "whisper-1", 99 | "object": "model", 100 | "created": 17078881749, 101 | "owned_by": "tiny-whisper-api" 102 | }, 103 | { 104 | "id": "gpt-4o-audio-preview", 105 | "object": "model", 106 | "created": 17078881749, 107 | "owned_by": "tiny-whisper-api" 108 | }, 109 | { 110 | "id": "gpt-4o-audio-preview-2024-10-01", 111 | "object": "model", 112 | "created": 17078881749, 113 | "owned_by": "tiny-whisper-api" 114 | } 115 | ] 116 | } 117 | 118 | headers = { 119 | 'Content-Type': 'application/json' 120 | } 121 | 122 | response_status_code = 200 123 | 124 | resp = JSONResponse( 125 | content = content, 126 | headers = headers, 127 | status_code = response_status_code 128 | ) 129 | 130 | return resp 131 | 132 | # gpt-4o-audio-preview:OpenAI の Chat Completions API でオーディオを扱う新機能を軽く見てみる【2024/10/17リリース】 #ChatGPT - Qiita 133 | # https://qiita.com/youtoy/items/5a87fd22cc88d8c34d6d 134 | 135 | # https://platform.openai.com/docs/api-reference/chat/create 136 | ''' 137 | curl "https://api.openai.com/v1/chat/completions" \ 138 | -H "Content-Type: application/json" \ 139 | -H "Authorization: Bearer $OPENAI_API_KEY" \ 140 | -d '{ 141 | "model": "gpt-4o-audio-preview", 142 | "modalities": ["text", "audio"], 143 | "audio": { "voice": "alloy", "format": "wav" }, 144 | "messages": [ 145 | { 146 | "role": "user", 147 | "content": [ 148 | { "type": "text", "text": "What is in this recording?" 
},
        {
          "type": "input_audio",
          "input_audio": {
            "data": "",
            "format": "wav"
          }
        }
      ]
    }
  ]
}'
'''

# modalities = ["text"]
CHAT_COMPLETIONS_RESPONSE_TEMPLATE='''
{
  "id": "chatcmpl-123",
  "object": "chat.completion",
  "created": 1677652288,
  "model": "gpt-4o-audio-preview-2024-10-01",
  "system_fingerprint": "fp_44709d6fcb",
  "service_tier": null,
  "choices": [{
    "index": 0,
    "message": {
      "role": "assistant",
      "content": "Hello there, how may I assist you today?",
      "refusal": null
    },
    "logprobs": null,
    "finish_reason": "stop"
  }],
  "usage": {
    "prompt_tokens": 86,
    "prompt_tokens_details": {
      "audio_tokens": 69,
      "cached_tokens": 0,
      "text_tokens": 17,
      "image_tokens": 0
    },
    "completion_tokens": 36,
    "total_tokens": 122,
    "completion_tokens_details": {
      "reasoning_tokens": 0,
      "accepted_prediction_tokens": 0,
      "rejected_prediction_tokens": 0,
      "audio_tokens": 0,
      "text_tokens": 17
    }
  }
}
'''

# modalities = ["text", "audio"]
CHAT_COMPLETIONS_RESPONSE_AUDIO_OUTPUT_TEMPLATE='''
{
  "id": "chatcmpl-123",
  "object": "chat.completion",
  "created": 1677652288,
  "model": "gpt-4o-audio-preview-2024-10-01",
  "system_fingerprint": "fp_44709d6fcb",
  "service_tier": null,
  "choices": [{
    "index": 0,
    "message": {
      "role": "assistant",
      "content": null,
      "refusal": null,
      "audio": {
        "id": "audio_6744555cc6d48190b67e70798ab606c3",
        "data": "response_audio_data_base64",
        "expires_at": 1732535148,
        "transcript": "response_transcript"
      }
    },
    "logprobs": null,
    "finish_reason": "stop",
    "function_call": null,
    "tool_calls": null
  }],
  "usage": {
    "prompt_tokens": 86,
    "prompt_tokens_details": {
      "audio_tokens": 69,
      "cached_tokens": 0,
      "text_tokens": 17,
      "image_tokens": 0
    },
    "completion_tokens": 236,
    "total_tokens": 322,
    "completion_tokens_details": {
      "reasoning_tokens": 0,
      "accepted_prediction_tokens": 0,
      "rejected_prediction_tokens": 0,
      "audio_tokens": 188,
      "text_tokens": 48
    }
  }
}
'''

CHAT_COMPLETIONS_RESPONSE_DIFY_PING_TEMPLATE='''
{
  "id": "chatcmpl-123",
  "object": "chat.completion",
  "created": 1677652288,
  "model": "gpt-4o-audio-preview-2024-10-01",
  "system_fingerprint": "fp_44709d6fcb",
  "service_tier": null,
  "choices": [{
    "index": 0,
    "message": {
      "role": "assistant",
      "content": "Pong",
      "refusal": null
    },
    "logprobs": null,
    "finish_reason": "stop"
  }],
  "usage": {
    "prompt_tokens": 5,
    "prompt_tokens_details": {
      "audio_tokens": 0,
      "cached_tokens": 0,
      "text_tokens": 5,
      "image_tokens": 0
    },
    "completion_tokens": 5,
    "total_tokens": 10,
    "completion_tokens_details": {
      "reasoning_tokens": 0,
      "accepted_prediction_tokens": 0,
      "rejected_prediction_tokens": 0,
      "audio_tokens": 0,
      "text_tokens": 5
    }
  }
}
'''

def is_base64_encoded(s: str) -> bool:
    try:
        # check padding (the length must be a multiple of 4)
        if len(s) % 4 != 0:
            return False

        # try to decode it
        base64.b64decode(s, validate=True)
        return True
    except Exception:
        return False

def save_base64_to_temp_file(base64_string: str) -> Optional[str]:
    try:
        # decode the base64 string
        binary_data = base64.b64decode(base64_string)

        # create a temporary file
        with tempfile.NamedTemporaryFile(delete=False, mode='wb') as temp_file:
            temp_file.write(binary_data)
            temp_file_path = temp_file.name  # path of the temporary file

        return temp_file_path

    except Exception:
        return None

@app.post('/v1/chat/completions')
async def v1_chat_completions(request: Request):

    global chat_model

    req_body = await request.json()

    model = req_body['model']
    try:
        modalities = req_body['modalities']
    except KeyError:
        modalities = ['text']
    try:
        audio = req_body['audio']
    except KeyError:
        audio = None

    if model not in ['gpt-4o-audio-preview', 'gpt-4o-audio-preview-2024-10-01']:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Bad Request, unsupported model"
        )

    if 'text' not in modalities:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Bad Request, 'text' is not in modalities"
        )

    if 'audio' in modalities:
        if audio is None:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Bad Request, 'audio' is in modalities, but its attributes are not specified."
            )

        if audio['voice'] not in ['ash', 'ballad', 'coral', 'sage', 'verse', 'alloy', 'echo', 'shimmer']:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Bad Request, unsupported voice"
            )
        if audio['format'] not in ['wav', 'mp3', 'flac', 'opus', 'pcm16']:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Bad Request, unsupported format"
            )

    messages = req_body['messages']
    content = None
    for m in messages:
        for c in m['content']:
            if 'input_audio' in c:
                assert 'data' in c['input_audio']
                assert 'format' in c['input_audio']
                content = c
                break

    if content is None:
        # dify ping
        for m in messages:
            if m['content'] in ['ping']:
                resp_body = json.loads(CHAT_COMPLETIONS_RESPONSE_DIFY_PING_TEMPLATE)
                resp_body['model'] = model
                resp = JSONResponse(
                    content = resp_body,
                    headers = {
                        'Content-Type': 'application/json'
                    },
                    status_code = 200
                )
                return resp
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Bad Request, missing content"
        )

    # is the content data base64-encoded?
    data = content['input_audio']['data']
    if not is_base64_encoded(data):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Bad Request, content is not base64-encoded."
        )

    settings = WHISPER_DEFAULT_SETTINGS.copy()
    #settings['temperature'] = temperature

    temp_content_path = save_base64_to_temp_file(data)
    if temp_content_path is None:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Bad Request, failed to decode and save the audio data."
        )

    transcript = transcribe(audio_path=temp_content_path, **settings)
    text = transcript['text']

    if temp_content_path:
        # TODO: delete this asynchronously
        os.remove(temp_content_path)

    #print(transcript)
    #print(text)

    if audio is not None:
        resp_body = json.loads(CHAT_COMPLETIONS_RESPONSE_AUDIO_OUTPUT_TEMPLATE)
        resp_body['choices'][0]['message']['audio']['transcript'] = text
    else:
        resp_body = json.loads(CHAT_COMPLETIONS_RESPONSE_TEMPLATE)
        resp_body['choices'][0]['message']['content'] = text
        resp_body['choices'][0]['delta'] = resp_body['choices'][0]['message'].copy()

    resp_body['model'] = model

    if chat_model:
        pass
        # Replace the input_audio parts of the request messages with the transcription,
        # then forward them to chat_model and use its response.
        # ? Which API key and endpoint should chat_model use?
        # ? How should usage statistics such as token counts be corrected?
        # ? What about the modalities info? Can it be ignored as long as no audio output is produced?

    resp = JSONResponse(
        content = resp_body,
        headers = {
            'Content-Type': 'application/json'
        },
        status_code = 200
    )

    return resp


@app.post('/v1/audio/transcriptions')
async def transcriptions(model: str = Form(...),
                         file: UploadFile = File(...),
                         response_format: Optional[str] = Form(None),
                         language: Optional[str] = Form(None),
                         prompt: Optional[str] = Form(None),
                         temperature: Optional[float] = Form(None)):

    assert model == "whisper-1"
    if file is None:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Bad Request, bad file"
        )
    if response_format is None:
        response_format = 'json'
    if response_format not in ['json',
                               'text',
                               'srt',
                               'verbose_json',
                               'vtt']:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Bad Request, bad response_format"
        )
    if temperature is None:
        temperature = 0.0
    if temperature < 0.0 or temperature > 1.0:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Bad Request, bad temperature"
        )

    filename = file.filename
    fileobj = file.file

    # TODO: use a temporary file that keeps the extension but avoids filename collisions
    upload_name = os.path.join(UPLOAD_DIR, filename)
    upload_file = open(upload_name, 'wb')
    shutil.copyfileobj(fileobj, upload_file)
    upload_file.close()

    settings = WHISPER_DEFAULT_SETTINGS.copy()
    settings['temperature'] = temperature
    if language is not None:
        settings['language'] = language # TODO: check ISO-639-1 format

    transcript = transcribe(audio_path=upload_name, **settings)

    if upload_name:
        os.remove(upload_name)

    if response_format in ['text']:
        return transcript['text']

    if response_format in ['srt']:
        ret = ""
        for seg in transcript['segments']:

            td_s = timedelta(milliseconds=seg["start"]*1000)
            td_e = timedelta(milliseconds=seg["end"]*1000)

            # SRT timestamps use a comma before the milliseconds (HH:MM:SS,mmm)
            t_s = f'{td_s.seconds//3600:02}:{(td_s.seconds//60)%60:02}:{td_s.seconds%60:02},{td_s.microseconds//1000:03}'
            t_e = f'{td_e.seconds//3600:02}:{(td_e.seconds//60)%60:02}:{td_e.seconds%60:02},{td_e.microseconds//1000:03}'

            # SRT cue numbers are 1-based, while whisper segment ids start at 0
            ret += '{}\n{} --> {}\n{}\n\n'.format(seg["id"] + 1, t_s, t_e, seg["text"])
        ret += '\n'
        return ret

    if response_format in ['vtt']:
        ret = "WEBVTT\n\n"
        for seg in transcript['segments']:
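            # WebVTT cue timings keep the '.' decimal separator (HH:MM:SS.mmm), unlike SRT which uses ','.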
524 | td_s = timedelta(milliseconds=seg["start"]*1000) 525 | td_e = timedelta(milliseconds=seg["end"]*1000) 526 | 527 | t_s = f'{td_s.seconds//3600:02}:{(td_s.seconds//60)%60:02}:{td_s.seconds%60:02}.{td_s.microseconds//1000:03}' 528 | t_e = f'{td_e.seconds//3600:02}:{(td_e.seconds//60)%60:02}:{td_e.seconds%60:02}.{td_e.microseconds//1000:03}' 529 | 530 | ret += "{} --> {}\n{}\n\n".format(t_s, t_e, seg["text"]) 531 | return ret 532 | 533 | if response_format in ['verbose_json']: 534 | transcript.setdefault('task', WHISPER_DEFAULT_SETTINGS['task']) 535 | transcript.setdefault('duration', transcript['segments'][-1]['end']) 536 | if transcript['language'] == 'ja': 537 | transcript['language'] = 'japanese' 538 | return transcript 539 | 540 | return {'text': transcript['text']} 541 | 542 | def main(): 543 | uvicorn.run("main:app", host="0.0.0.0", port=8000, log_level ="info") 544 | 545 | if __name__ == "__main__": 546 | # main() 547 | pass 548 | 549 | 550 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | pydantic 3 | python-multipart 4 | openai-whisper 5 | uvicorn 6 | -------------------------------------------------------------------------------- /tests/openai/test_audio_transcription.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ ! -e alloy.wav ]; then 3 | wget https://openaiassets.blob.core.windows.net/\$web/API/docs/audio/alloy.wav 4 | fi 5 | curl http://localhost:8000/v1/audio/transcriptions -H "Content-Type: multipart/form-data" -F model="whisper-1" -F file="@alloy.wav" -F response_format=text 6 | -------------------------------------------------------------------------------- /tests/openai/test_chat_completions.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import requests 3 | from openai import OpenAI 4 | 5 | client = OpenAI() 6 | 7 | # Fetch the audio file and convert it to a base64 encoded string 8 | url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav" 9 | response = requests.get(url) 10 | response.raise_for_status() 11 | wav_data = response.content 12 | encoded_string = base64.b64encode(wav_data).decode('utf-8') 13 | 14 | completion = client.chat.completions.create( 15 | model="gpt-4o-audio-preview-2024-10-01", 16 | modalities=["text", "audio"], 17 | audio={"voice": "alloy", "format": "wav"}, 18 | messages=[ 19 | { 20 | "role": "user", 21 | "content": [ 22 | { 23 | "type": "text", 24 | "text": "What is in this recording?" 
25 | }, 26 | { 27 | "type": "input_audio", 28 | "input_audio": { 29 | "data": encoded_string, 30 | "format": "wav" 31 | } 32 | } 33 | ] 34 | }, 35 | ] 36 | ) 37 | 38 | print(completion) 39 | print(completion.choices[0].message.audio.transcript) 40 | -------------------------------------------------------------------------------- /tests/openai/test_chat_completions_text_modality_only.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import requests 3 | from openai import OpenAI 4 | 5 | client = OpenAI() 6 | client = OpenAI(base_url='http://localhost:8000/v1') 7 | 8 | # Fetch the audio file and convert it to a base64 encoded string 9 | url = "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav" 10 | response = requests.get(url) 11 | response.raise_for_status() 12 | wav_data = response.content 13 | encoded_string = base64.b64encode(wav_data).decode('utf-8') 14 | 15 | completion = client.chat.completions.create( 16 | model="gpt-4o-audio-preview-2024-10-01", 17 | # modalities=["text", "audio"], 18 | # audio={"voice": "alloy", "format": "wav"}, 19 | modalities=["text"], 20 | messages=[ 21 | { 22 | "role": "user", 23 | "content": [ 24 | { 25 | "type": "text", 26 | "text": "What is in this recording?" 27 | }, 28 | { 29 | "type": "input_audio", 30 | "input_audio": { 31 | "data": encoded_string, 32 | "format": "wav" 33 | } 34 | } 35 | ] 36 | }, 37 | ] 38 | ) 39 | 40 | print(completion) 41 | print(completion.choices[0].message.content) 42 | -------------------------------------------------------------------------------- /tests/powershell/Invoke-Whisper-Audio-Transcription.ps1: -------------------------------------------------------------------------------- 1 | # 環境変数からAPIキーを取得 2 | $apiKey = $env:OPENAI_API_KEY 3 | if (-not $apiKey) { 4 | Write-Error "環境変数 'OPENAI_API_KEY' が設定されていません。APIキーを設定してください。" 5 | exit 1 6 | } 7 | 8 | # 環境変数からAPI URLを取得 9 | $apiBase = $env:OPENAI_BASE_URL 10 | if (-not $apiBase) { 11 | $apiBase = $env:OPENAI_API_BASE 12 | if (-not $apiBase) { 13 | $apiBase = "https://api.openai.com/v1" 14 | } 15 | } 16 | $openAiUrl = $apiBase + "/audio/transcriptions" 17 | #$openAiUrl = "http://localhost:8000/v1/audio/transcriptions" 18 | 19 | # コマンドライン引数で音声ファイルのパスを取得 20 | if ($args.Count -lt 1) { 21 | Write-Error "音声ファイルのパスを指定してください。" 22 | Write-Host "使用例: .\TranscribeAudio.ps1 path\to\audio.mp4" 23 | exit 1 24 | } 25 | $audioFilePath = $args[0] 26 | #$audioFilePath = "c:\temp\alloy.wav" # MP4ファイルのパスを指定 27 | 28 | # ファイルの存在を確認 29 | if (-Not (Test-Path -Path $audioFilePath)) { 30 | Write-Error "指定したファイルが見つかりません: $audioFilePath" 31 | exit 1 32 | } 33 | 34 | # ファイルの拡張子を確認し、MIME タイプを設定 35 | $extension = [System.IO.Path]::GetExtension($audioFilePath).ToLower() 36 | switch ($extension) { 37 | ".mp4" { $mimeType = "audio/mp4" } 38 | ".wav" { $mimeType = "audio/wav" } 39 | ".mp3" { $mimeType = "audio/mpeg" } 40 | default { 41 | Write-Error "サポートされていないファイル形式: $extension" 42 | exit 1 43 | } 44 | } 45 | 46 | # マルチパートリクエスト用データを作成 47 | $boundary = [System.Guid]::NewGuid().ToString() 48 | $LF = "`r`n" 49 | 50 | $fileBytes = [System.IO.File]::ReadAllBytes($audioFilePath) 51 | $fileContent = [System.Text.Encoding]::GetEncoding("ISO-8859-1").GetString($fileBytes) 52 | $fileName = [System.IO.Path]::GetFileName($audioFilePath) 53 | 54 | $body = ( 55 | "--$boundary$LF" + 56 | "Content-Disposition: form-data; name=`"file`"; filename=`"$fileName`"$LF" + 57 | "Content-Type: $mimeType$LF$LF" + 58 | $fileContent + $LF + 59 | "--$boundary$LF" 
+ 60 | "Content-Disposition: form-data; name=`"model`"$LF$LF" + 61 | "whisper-1$LF" + 62 | "--$boundary--$LF" 63 | ) 64 | 65 | $headers = @{ 66 | "Authorization" = "Bearer $apiKey" 67 | "Content-Type" = "multipart/form-data; boundary=$boundary" 68 | } 69 | 70 | # APIを呼び出す 71 | try { 72 | $response = Invoke-RestMethod -Uri $openAiUrl -Headers $headers -Method Post -Body $body 73 | # UTF-8でコンソールに出力する 74 | #[Console]::OutputEncoding = [System.Text.Encoding]::UTF8 75 | #Write-Output $response.text 76 | 77 | $garbledText = $response.text 78 | # 文字化けしている文字列をバイト配列に変換 79 | $byteArray = [System.Text.Encoding]::GetEncoding("ISO-8859-1").GetBytes($garbledText) 80 | # バイト配列をUTF-8としてデコード 81 | $decodedText = [System.Text.Encoding]::UTF8.GetString($byteArray) 82 | 83 | Write-Output $decodedText 84 | 85 | } catch { 86 | Write-Error "API呼び出し中にエラーが発生しました: $_" 87 | } 88 | --------------------------------------------------------------------------------