├── speech_gateway ├── __init__.py ├── converter │ ├── __init__.py │ ├── pcm.py │ ├── mp3.py │ ├── wave.py │ └── mulaw.py ├── source │ ├── sbv2.py │ ├── voicevox.py │ ├── nijivoice_encoded.py │ ├── azure.py │ ├── openai_speech.py │ ├── nijivoice.py │ └── __init__.py ├── cache │ ├── __init__.py │ └── file.py ├── performance_recorder │ ├── __init__.py │ ├── sqlite.py │ └── postgres.py └── gateway │ ├── unified.py │ ├── voicevox.py │ ├── openai_speech.py │ ├── sbv2.py │ ├── azure.py │ ├── nijivoice_encoded.py │ ├── nijivoice.py │ └── __init__.py ├── requirements.txt ├── .gitattributes ├── tests ├── data │ └── test.wav ├── converter │ ├── test_mp3.py │ └── test_wave.py ├── conftest.py ├── source │ ├── test_sbv2_source.py │ ├── test_azure_source.py │ ├── test_nijivoice_encoded_source.py │ ├── test_nijivoice_source.py │ ├── test_voicevox_source.py │ └── test_openai_speech_source.py ├── gateway │ ├── test_sbv2.py │ ├── test_voicevox.py │ ├── test_unified.py │ ├── test_azure.py │ ├── test_openai_speech.py │ ├── test_azure_openai_speech.py │ ├── test_nijivoice.py │ └── test_nijivoice_encoded.py ├── cache │ └── test_file.py └── performance_recorder │ ├── test_sqlite.py │ └── test_postgres.py ├── docker ├── requirements.txt ├── pgadmin-servers.json ├── init-db.sh ├── README.md ├── setup-volumes.sh ├── Dockerfile.app ├── .env.sample ├── docker-compose.yaml └── run.py ├── setup.py ├── run.py ├── .gitignore ├── README.md └── LICENSE /speech_gateway/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiofiles==24.1.0 2 | fastapi==0.115.6 3 | httpx==0.28.1 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /tests/data/test.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uezo/speech-gateway/HEAD/tests/data/test.wav -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | psycopg2-binary==2.9.9 2 | python-dotenv==1.0.0 3 | git+https://github.com/uezo/speech-gateway 4 | -------------------------------------------------------------------------------- /docker/pgadmin-servers.json: -------------------------------------------------------------------------------- 1 | { 2 | "Servers": { 3 | "1": { 4 | "Name": "speech-gateway", 5 | "Group": "Servers", 6 | "Host": "spgw-db", 7 | "Port": 5432, 8 | "MaintenanceDB": "postgres", 9 | "Username": "postgres", 10 | "SSLMode": "prefer" 11 | } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /speech_gateway/converter/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import AsyncIterator 3 | 4 | 5 | class FormatConverter(ABC): 6 | @abstractmethod 7 | async def convert(self, input_stream: AsyncIterator[bytes]) -> AsyncIterator[bytes]: 8 | pass 9 | 10 | 11 | class FormatConverterError(Exception): 12 | def 
__init__(self, message: str): 13 | super().__init__(message) 14 | 15 | 16 | from .mp3 import MP3Converter 17 | -------------------------------------------------------------------------------- /docker/init-db.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" <<-EOSQL 6 | CREATE DATABASE $SPGW_DB_NAME; 7 | CREATE USER "$SPGW_DB_USER" WITH PASSWORD '$SPGW_DB_PASSWORD'; 8 | GRANT ALL PRIVILEGES ON DATABASE $SPGW_DB_NAME TO "$SPGW_DB_USER"; 9 | EOSQL 10 | 11 | psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$SPGW_DB_NAME" <<-EOSQL 12 | GRANT ALL ON SCHEMA public TO "$SPGW_DB_USER"; 13 | ALTER SCHEMA public OWNER TO "$SPGW_DB_USER"; 14 | EOSQL 15 | -------------------------------------------------------------------------------- /speech_gateway/source/sbv2.py: -------------------------------------------------------------------------------- 1 | from . import StreamSource 2 | 3 | 4 | class StyleBertVits2StreamSource(StreamSource): 5 | def get_cache_key(self, audio_format: str, query_params: dict, **kwargs) -> str: 6 | return f"{hash(str(query_params))}.{audio_format or 'wav'}" 7 | 8 | def parse_text(self, query_params: dict, **kwargs) -> str: 9 | return query_params.get("text") 10 | 11 | def make_stream_request(self, query_params: dict, **kwargs): 12 | return { 13 | "method": "GET", 14 | "url": self.base_url + "/voice", 15 | "params": query_params, 16 | } 17 | -------------------------------------------------------------------------------- /speech_gateway/cache/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import AsyncIterator 3 | 4 | 5 | class CacheStorage(ABC): 6 | @abstractmethod 7 | async def has_cache(self, cache_key: str) -> bool: 8 | pass 9 | 10 | @abstractmethod 11 | async def fetch_cache_stream(self, cache_key: str) -> AsyncIterator[bytes]: 12 | pass 13 | 14 | @abstractmethod 15 | async def write_cache(self, input_stream: AsyncIterator[bytes], cache_key: str) -> AsyncIterator[bytes]: 16 | pass 17 | 18 | 19 | class CacheStorageError(Exception): 20 | def __init__(self, message: str): 21 | super().__init__(message) 22 | 23 | 24 | from .file import FileCacheStorage 25 | -------------------------------------------------------------------------------- /speech_gateway/converter/pcm.py: -------------------------------------------------------------------------------- 1 | import io 2 | import wave 3 | import soundfile as sf 4 | import numpy as np 5 | 6 | 7 | def convert_float32bit_to_int16bit(input_data: bytes) -> bytes: 8 | data, samplerate = sf.read(io.BytesIO(input_data)) 9 | pcm16_data = (data * 32767).astype(np.int16) 10 | channels = pcm16_data.shape[1] if pcm16_data.ndim > 1 else 1 11 | 12 | wav_bytes_io = io.BytesIO() 13 | with wave.open(wav_bytes_io, "wb") as wav_file: 14 | wav_file.setnchannels(channels) 15 | wav_file.setsampwidth(2) 16 | wav_file.setframerate(samplerate) 17 | wav_file.writeframes(pcm16_data.tobytes()) 18 | 19 | wav_bytes = wav_bytes_io.getvalue() 20 | return wav_bytes 21 | --------------------------------------------------------------------------------
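The PCM helper above is a standalone function rather than a `FormatConverter` subclass. A usage sketch, assuming a float32 WAV payload (the file names here are hypothetical):

```python
# Sketch: normalize a float32 WAV (as some engines emit) to 16-bit PCM WAV.
# "float32.wav" / "pcm16.wav" are hypothetical file names.
from speech_gateway.converter.pcm import convert_float32bit_to_int16bit

with open("float32.wav", "rb") as f:
    float32_wav = f.read()

pcm16_wav = convert_float32bit_to_int16bit(float32_wav)

with open("pcm16.wav", "wb") as f:
    f.write(pcm16_wav)
```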
/docker/README.md: -------------------------------------------------------------------------------- 1 | # Speech Gateway Docker Setup 2 | 3 | ## Quick Start 4 | 5 | ### 1. Setup Environment 6 | ```bash 7 | cp .env.sample .env 8 | # Edit .env and set your API keys 9 | ``` 10 | 11 | ### 2. Create Volume Directories 12 | ```bash 13 | ./setup-volumes.sh 14 | ``` 15 | 16 | ### 3. Start Services 17 | ```bash 18 | docker compose up -d 19 | ``` 20 | 21 | ## Access 22 | 23 | - Application: http://localhost:18000 24 | - PgAdmin: http://localhost:18001 25 | 26 | ## Configuration 27 | 28 | Edit `.env` file to: 29 | - Set API keys (AZURE_API_KEY, OPENAI_API_KEY, etc.) 30 | - Enable/disable services (AZURE_ENABLED=true/false) 31 | - Change ports if needed 32 | 33 | ## Stop Services 34 | 35 | ```bash 36 | docker compose down 37 | ``` 38 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="speech_gateway", 5 | version="0.1.7", 6 | url="https://github.com/uezo/speech-gateway", 7 | author="uezo", 8 | author_email="uezo@uezo.net", 9 | maintainer="uezo", 10 | maintainer_email="uezo@uezo.net", 11 | description="A reverse proxy server that enhances speech synthesis with essential, extensible features. 🦉💬", 12 | long_description=open("README.md").read(), 13 | long_description_content_type="text/markdown", 14 | packages=find_packages(exclude=["tests*"]), 15 | install_requires=["aiofiles==24.1.0", "fastapi==0.115.6", "httpx==0.28.1", "uvicorn==0.34.0"], 16 | license="Apache v2", 17 | classifiers=[ 18 | "Programming Language :: Python :: 3" 19 | ] 20 | ) 21 | -------------------------------------------------------------------------------- /speech_gateway/performance_recorder/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import dataclass 3 | 4 | 5 | class PerformanceRecorder(ABC): 6 | @abstractmethod 7 | def record( 8 | self, 9 | *, 10 | process_id: str, 11 | source: str = None, 12 | text: str = None, 13 | audio_format: str = None, 14 | cached: int = 0, 15 | elapsed: float = None, 16 | ): 17 | pass 18 | 19 | @abstractmethod 20 | def close(self): 21 | pass 22 | 23 | 24 | @dataclass 25 | class PerformanceRecord: 26 | process_id: str 27 | source: str = None 28 | text: str = None 29 | audio_format: str = None 30 | cached: int = 0 31 | elapsed: float = None 32 | 33 | 34 | from .sqlite import SQLitePerformanceRecorder 35 | --------------------------------------------------------------------------------
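For reference, implementing the interface above only requires `record()` and `close()`. A minimal sketch of a hypothetical recorder that logs instead of persisting (the shipped implementations are the SQLite and Postgres recorders):

```python
# Hypothetical LoggingPerformanceRecorder: a minimal PerformanceRecorder
# that writes records to a logger instead of a database.
import logging
from speech_gateway.performance_recorder import PerformanceRecorder

class LoggingPerformanceRecorder(PerformanceRecorder):
    def __init__(self):
        self.logger = logging.getLogger("speech_gateway.performance")

    def record(self, *, process_id: str, source: str = None, text: str = None,
               audio_format: str = None, cached: int = 0, elapsed: float = None):
        # Emit the record instead of inserting it into a table
        self.logger.info(
            "process_id=%s source=%s format=%s cached=%s elapsed=%s",
            process_id, source, audio_format, cached, elapsed,
        )

    def close(self):
        pass
```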
/docker/setup-volumes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Setup script for Docker volumes 4 | # This script creates necessary directories for Docker named volumes 5 | 6 | set -e 7 | 8 | # Load environment variables 9 | if [ -f .env ]; then 10 | export $(grep -v '^#' .env | xargs) 11 | fi 12 | 13 | # Default to ./data if DATA_PATH is not set 14 | DATA_PATH=${DATA_PATH:-./data} 15 | 16 | echo "Setting up volumes at: $DATA_PATH" 17 | 18 | # Create directories if they don't exist 19 | mkdir -p "$DATA_PATH/postgres" 20 | mkdir -p "$DATA_PATH/pgadmin" 21 | mkdir -p "$DATA_PATH/cache" 22 | 23 | # Set appropriate permissions 24 | # PostgreSQL needs UID 999 (in most PostgreSQL Docker images) 25 | # PgAdmin needs UID 5050 26 | if [ "$(uname)" = "Linux" ]; then 27 | sudo chown -R 999:999 "$DATA_PATH/postgres" 2>/dev/null || true 28 | sudo chown -R 5050:5050 "$DATA_PATH/pgadmin" 2>/dev/null || true 29 | fi 30 | 31 | echo "Volume directories created successfully:" 32 | echo " - $DATA_PATH/postgres" 33 | echo " - $DATA_PATH/pgadmin" 34 | echo " - $DATA_PATH/cache" 35 | echo "" 36 | echo "You can now run: docker compose up -d" 37 | -------------------------------------------------------------------------------- /docker/Dockerfile.app: -------------------------------------------------------------------------------- 1 | # Multi-stage build for optimized image 2 | FROM python:3.11-slim AS builder 3 | 4 | # Install build dependencies (git needed for GitHub installation) 5 | RUN apt-get update && apt-get install -y \ 6 | gcc \ 7 | git \ 8 | && rm -rf /var/lib/apt/lists/* 9 | 10 | # Create virtual environment 11 | RUN python -m venv /opt/venv 12 | ENV PATH="/opt/venv/bin:$PATH" 13 | 14 | # Copy requirements and install dependencies 15 | COPY requirements.txt /tmp/ 16 | RUN pip install --upgrade pip && \ 17 | pip install --no-cache-dir -r /tmp/requirements.txt 18 | 19 | # Runtime stage 20 | FROM python:3.11-slim 21 | 22 | # Install runtime dependencies 23 | RUN apt-get update && apt-get install -y \ 24 | curl \ 25 | && rm -rf /var/lib/apt/lists/* \ 26 | && useradd -m -u 1000 app 27 | 28 | # Copy virtual environment from builder 29 | COPY --from=builder /opt/venv /opt/venv 30 | ENV PATH="/opt/venv/bin:$PATH" 31 | 32 | WORKDIR /app 33 | 34 | # Copy application 35 | COPY --chown=app:app run.py /app/ 36 | 37 | # Switch to non-root user 38 | USER app 39 | 40 | EXPOSE 8000 41 | 42 | CMD ["uvicorn", "run:app", "--host", "0.0.0.0", "--port", "8000"] 43 | -------------------------------------------------------------------------------- /speech_gateway/source/voicevox.py: -------------------------------------------------------------------------------- 1 | import urllib.parse 2 | import httpx 3 | from . 
import StreamSource, StreamSourceError 4 | 5 | 6 | class VoicevoxStreamSource(StreamSource): 7 | def get_cache_key(self, audio_format: str, speaker: str, audio_query: dict, **kwargs) -> str: 8 | return f"{speaker}_{hash(str(audio_query))}.{audio_format or 'wav'}" 9 | 10 | def parse_text(self, audio_query: dict, **kwargs) -> str: 11 | return audio_query.get("kana") 12 | 13 | def make_stream_request(self, speaker: str, audio_query: dict, **kwargs): 14 | return { 15 | "method": "POST", 16 | "url": self.base_url + "/synthesis", 17 | "params": {"speaker": speaker}, 18 | "json": audio_query 19 | } 20 | 21 | async def get_audio_query(self, speaker: str, text: str, **kwargs): 22 | try: 23 | url = f"{self.base_url}/audio_query" 24 | 25 | response = await self.http_client.post(url, params={"speaker": speaker, "text": text}) 26 | response.raise_for_status() 27 | 28 | return response.json() 29 | 30 | except httpx.RequestError as ex: 31 | raise StreamSourceError(f"HTTP request failed: {ex}") from ex 32 | -------------------------------------------------------------------------------- /docker/.env.sample: -------------------------------------------------------------------------------- 1 | COMPOSE_PROJECT_NAME=speech-gateway 2 | 3 | # Database settings 4 | POSTGRES_USER=postgres 5 | POSTGRES_PASSWORD=postgres 6 | 7 | SPGW_DB_NAME=speech_gateway 8 | SPGW_DB_USER=spgw-runtime 9 | SPGW_DB_PASSWORD=spgw-runtime-password 10 | 11 | # Port settings 12 | PORT_SPGW=18000 13 | PORT_DB=5432 14 | PORT_PGADMIN=18001 15 | 16 | # PgAdmin settings 17 | PGADMIN_USER=pgadmin@example.com 18 | PGADMIN_PASSWORD=pgadmin-password 19 | 20 | 21 | # Application settings 22 | DEBUG=true 23 | 24 | # Service enable/disable flags 25 | AZURE_ENABLED=true 26 | OPENAI_ENABLED=true 27 | VOICEVOX_ENABLED=false 28 | SBV2_ENABLED=false 29 | NIJIVOICE_ENABLED=false 30 | 31 | # Azure TTS 32 | AZURE_API_KEY= 33 | AZURE_REGION= 34 | # AZURE_LANGUAGES=en-US,zh-CN,fr-FR 35 | 36 | # OpenAI TTS 37 | OPENAI_API_KEY= 38 | # OPENAI_LANGUAGES=ja-JP,es-ES 39 | 40 | # VOICEVOX 41 | VOICEVOX_URL=http://voicevox-host:50021 42 | # VOICEVOX_LANGUAGES=ja-JP 43 | 44 | # Style-Bert-VITS2 45 | SBV2_URL=http://sbv2-host:5000 46 | # SBV2_LANGUAGES=ja-JP 47 | 48 | # NIJIVOICE 49 | NIJIVOICE_API_KEY= 50 | # NIJIVOICE_LANGUAGES=ja-JP 51 | 52 | # Data storage path (for external disk mounting) 53 | # Examples: 54 | # DATA_PATH=./data # Local directory (default) 55 | # DATA_PATH=/mnt/external-disk/spgw # External disk 56 | DATA_PATH=./data 57 | -------------------------------------------------------------------------------- /tests/converter/test_mp3.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from typing import AsyncIterator 4 | from speech_gateway.converter import MP3Converter, FormatConverterError 5 | 6 | @pytest.fixture 7 | def mp3_converter(): 8 | # Create an instance of MP3Converter for testing 9 | return MP3Converter() 10 | 11 | @pytest.mark.asyncio 12 | async def test_mp3_conversion(mp3_converter): 13 | # Test the convert method using a real WAV file 14 | input_file = "tests/data/test.wav" 15 | 16 | async def input_stream() -> AsyncIterator[bytes]: 17 | with open(input_file, "rb") as f: 18 | while chunk := f.read(1024): 19 | yield chunk 20 | 21 | output = b"" 22 | try: 23 | async for chunk in mp3_converter.convert(input_stream()): 24 | output += chunk 25 | except FormatConverterError as e: 26 | pytest.fail(f"MP3 conversion failed with error: {e}") 27 | 28 | # Assert that the output is 
not empty (indicating conversion occurred) 29 | assert output != b"" 30 | 31 | @pytest.mark.asyncio 32 | async def test_mp3_conversion_error_handling(mp3_converter): 33 | # Test error handling in the convert method with invalid input 34 | 35 | async def input_stream() -> AsyncIterator[bytes]: 36 | yield b"Invalid input data" 37 | 38 | with pytest.raises(FormatConverterError): 39 | async for _ in mp3_converter.convert(input_stream()): 40 | pass 41 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | from contextlib import asynccontextmanager 2 | import logging 3 | from fastapi import FastAPI 4 | from speech_gateway.gateway.voicevox import VoicevoxGateway 5 | from speech_gateway.gateway.nijivoice import NijiVoiceGateway 6 | from speech_gateway.gateway.sbv2 import StyleBertVits2Gateway 7 | from speech_gateway.gateway.unified import UnifiedGateway 8 | 9 | # Configure root logger 10 | logger = logging.getLogger("speech_gateway") 11 | logger.setLevel(logging.INFO) 12 | log_format = logging.Formatter("[%(levelname)s] %(asctime)s : %(message)s") 13 | streamHandler = logging.StreamHandler() 14 | streamHandler.setFormatter(log_format) 15 | logger.addHandler(streamHandler) 16 | 17 | NIJIVOICE_API_KEY = "YOUR_API_KEY" 18 | 19 | # Create gateways 20 | aivisspeech_gateway = VoicevoxGateway(base_url="http://127.0.0.1:10101", debug=True) 21 | sbv2_gateway = StyleBertVits2Gateway(base_url="http://127.0.0.1:5000", debug=True) 22 | nijivoice_gateway = NijiVoiceGateway(api_key=NIJIVOICE_API_KEY, prefix="/nijivoice", debug=True) 23 | 24 | # Unified gateway ("default" is keyword-only in add_gateway) 25 | unified_gateway = UnifiedGateway(debug=True) 26 | unified_gateway.add_gateway("aivisspeech", aivisspeech_gateway, default=True) 27 | unified_gateway.add_gateway("sbv2", sbv2_gateway) 28 | unified_gateway.add_gateway("nijivoice", nijivoice_gateway) 29 | 30 | # Shut down gateway resources when the app stops 31 | @asynccontextmanager 32 | async def lifespan(app: FastAPI): 33 | yield 34 | await aivisspeech_gateway.shutdown() 35 | await sbv2_gateway.shutdown() 36 | await nijivoice_gateway.shutdown() 37 | 38 | # Create app with the lifespan handler registered 39 | app = FastAPI(lifespan=lifespan) 40 | 41 | # Add gateways to app 42 | app.include_router(aivisspeech_gateway.get_router(), prefix="/aivisspeech") 43 | app.include_router(sbv2_gateway.get_router(), prefix="/sbv2") 44 | app.include_router(nijivoice_gateway.get_router(), prefix="/nijivoice") 45 | app.include_router(unified_gateway.get_router()) 46 | --------------------------------------------------------------------------------
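A quick smoke test for the app above once it is running under `uvicorn run:app` (sketch; `"1"` is a placeholder speaker ID — use one that your default engine actually provides):

```python
# Call the unified /tts endpoint wired up in run.py (sketch).
import httpx

resp = httpx.post(
    "http://127.0.0.1:8000/tts",
    json={"text": "こんにちは。これはテストです。", "speaker": "1"},
)
resp.raise_for_status()

# x_audio_format defaults to wav, so the response body is WAV audio
with open("out.wav", "wb") as f:
    f.write(resp.content)
```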
/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import pytest 4 | import httpx 5 | 6 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 7 | 8 | 9 | def make_random_text(): 10 | random_key = "{:,}".format(random.randint(100000, 999999)) 11 | return f"これは音声合成のテストです。ランダムキーは、{random_key}です。" 12 | 13 | 14 | def is_wave(data: bytes) -> bool: 15 | if len(data) < 12: 16 | return False 17 | return data[:4] == b"RIFF" and data[8:12] == b"WAVE" 18 | 19 | 20 | def is_mp3(data: bytes) -> bool: 21 | if data[:3] == b"ID3": 22 | id3_size = 10 23 | if len(data) >= 10: 24 | tag_size = ( 25 | (data[6] << 21) 26 | | (data[7] << 14) 27 | | (data[8] << 7) 28 | | data[9] 29 | ) 30 | id3_size += tag_size 31 | data = data[id3_size:] 32 | 33 | if len(data) < 2: 34 | return False 35 | return data[:2] in [b"\xFF\xFB", b"\xFF\xF3", b"\xFF\xF2"] 36 | 37 | 38 | def transcribe(data: bytes, audio_format: str) -> str: 39 | headers = {"Authorization": f"Bearer {OPENAI_API_KEY}"} 40 | form_data = {"model": "whisper-1"} 41 | files = {"file": (f"voice.{audio_format}", data, f"audio/{audio_format}")} 42 | resp = httpx.post( 43 | "https://api.openai.com/v1/audio/transcriptions", 44 | headers=headers, 45 | data=form_data, 46 | files=files 47 | ) 48 | return resp.json().get("text") 49 | 50 | 51 | @pytest.fixture 52 | def random_text(): 53 | # Reuse the module-level helper instead of duplicating it 54 | return make_random_text() 55 | 56 | @pytest.fixture 57 | def wave_checker(): 58 | return is_wave 59 | 60 | @pytest.fixture 61 | def mp3_checker(): 62 | return is_mp3 63 | 64 | @pytest.fixture 65 | def audio_transcriber(): 66 | return transcribe 67 | -------------------------------------------------------------------------------- /speech_gateway/source/nijivoice_encoded.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from . import StreamSource 3 | from ..cache import CacheStorage 4 | from ..cache.file import FileCacheStorage 5 | from ..converter import FormatConverter 6 | from ..performance_recorder import PerformanceRecorder 7 | 8 | 9 | class NijiVoiceEncodedStreamSource(StreamSource): 10 | def __init__(self, 11 | *, 12 | api_key: str = None, 13 | base_url: str = "https://api.nijivoice.com", 14 | cache_storage: CacheStorage = None, 15 | format_converters: Dict[str, FormatConverter] = None, 16 | max_connections: int = 100, 17 | max_keepalive_connections: int = 20, 18 | timeout: float = 10.0, 19 | performance_recorder: PerformanceRecorder = None, 20 | debug: bool = False 21 | ): 22 | super().__init__( 23 | base_url=base_url, 24 | cache_storage=cache_storage or FileCacheStorage(cache_dir="nijivoice_encoded_cache"), 25 | format_converters=format_converters, 26 | max_connections=max_connections, 27 | max_keepalive_connections=max_keepalive_connections, 28 | timeout=timeout, 29 | performance_recorder=performance_recorder, 30 | debug=debug 31 | ) 32 | self.base_url = base_url 33 | self.api_key = api_key 34 | 35 | def get_cache_key(self, audio_format: str, voice_actor_id: str, request_json: dict, **kwargs) -> str: 36 | if not audio_format: 37 | audio_format = request_json.get("format", "mp3") 38 | return f"{voice_actor_id}_{hash(str(request_json))}.{audio_format}.json" 39 | 40 | def parse_text(self, request_json: dict, **kwargs) -> str: 41 | return request_json.get("script") 42 | 43 | def make_stream_request(self, voice_actor_id: str, request_json: dict, **kwargs): 44 | return { 45 | "method": "POST", 46 | "url": self.base_url + f"/api/platform/v1/voice-actors/{voice_actor_id}/generate-encoded-voice", 47 | "headers": {"x-api-key": self.api_key}, 48 | "json": request_json 49 | } 50 | -------------------------------------------------------------------------------- /speech_gateway/converter/mp3.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import AsyncIterator 3 | from . 
import FormatConverter, FormatConverterError 4 | 5 | 6 | class MP3Converter(FormatConverter): 7 | def __init__(self, ffmpeg_path: str = "ffmpeg", bitrate: str = "64k", output_chunksize: int = 1024): 8 | self.ffmpeg_path = ffmpeg_path 9 | self.bitrate = bitrate 10 | self.output_chunksize = output_chunksize 11 | 12 | async def convert(self, input_stream: AsyncIterator[bytes]) -> AsyncIterator[bytes]: 13 | try: 14 | ffmpeg_proc = await asyncio.create_subprocess_exec( 15 | self.ffmpeg_path, 16 | "-y", 17 | "-i", "-", # Read from stdin 18 | "-f", "mp3", 19 | "-b:a", self.bitrate, 20 | "-", # Write to stdout 21 | stdin=asyncio.subprocess.PIPE, 22 | stdout=asyncio.subprocess.PIPE, 23 | stderr=asyncio.subprocess.PIPE 24 | ) 25 | 26 | async def feed_ffmpeg_stdin(): 27 | try: 28 | async for chunk in input_stream: 29 | ffmpeg_proc.stdin.write(chunk) 30 | await ffmpeg_proc.stdin.drain() 31 | ffmpeg_proc.stdin.close() 32 | 33 | except Exception as ex: 34 | ffmpeg_proc.stdin.close() 35 | raise FormatConverterError(f"Error feeding data to ffmpeg: {str(ex)}") 36 | 37 | asyncio.create_task(feed_ffmpeg_stdin()) 38 | 39 | while True: 40 | chunk = await ffmpeg_proc.stdout.read(self.output_chunksize) 41 | if not chunk: 42 | break 43 | yield chunk 44 | 45 | await ffmpeg_proc.wait() 46 | 47 | if ffmpeg_proc.returncode != 0: 48 | stderr = await ffmpeg_proc.stderr.read() 49 | raise FormatConverterError(f"FFmpeg conversion error: {stderr.decode('utf-8')}") 50 | 51 | except Exception as ex: 52 | raise FormatConverterError(f"Error during MP3 conversion: {str(ex)}") 53 | -------------------------------------------------------------------------------- /speech_gateway/source/azure.py: -------------------------------------------------------------------------------- 1 | from . import StreamSource 2 | from typing import Dict 3 | from ..cache import CacheStorage 4 | from ..cache.file import FileCacheStorage 5 | from ..converter import FormatConverter 6 | from ..performance_recorder import PerformanceRecorder 7 | 8 | 9 | class AzureStreamSource(StreamSource): 10 | def __init__(self, 11 | *, 12 | api_key: str = None, 13 | region: str = None, 14 | base_url: str = "https://{region}.tts.speech.microsoft.com/cognitiveservices/v1", 15 | cache_storage: CacheStorage = None, 16 | format_converters: Dict[str, FormatConverter] = None, 17 | max_connections: int = 100, 18 | max_keepalive_connections: int = 20, 19 | timeout: float = 10.0, 20 | performance_recorder: PerformanceRecorder = None, 21 | debug: bool = False 22 | ): 23 | super().__init__( 24 | base_url=base_url, 25 | cache_storage=cache_storage or FileCacheStorage(cache_dir="azure_cache"), 26 | format_converters=format_converters, 27 | max_connections=max_connections, 28 | max_keepalive_connections=max_keepalive_connections, 29 | timeout=timeout, 30 | performance_recorder=performance_recorder, 31 | debug=debug 32 | ) 33 | self.api_key = api_key 34 | self.region = region 35 | 36 | def get_cache_key(self, audio_format: str, encoded_ssml: bytes, **kwargs) -> str: 37 | return f"{hash(encoded_ssml)}.{audio_format or 'wav'}" 38 | 39 | def parse_text(self, encoded_ssml: bytes, **kwargs) -> str: 40 | return encoded_ssml.decode("utf-8") 41 | 42 | def make_stream_request(self, encoded_ssml: bytes, azure_audio_format: str, **kwargs): 43 | return { 44 | "method": "POST", 45 | "url": self.base_url.format(region=self.region), 46 | "headers": { 47 | "X-Microsoft-OutputFormat": azure_audio_format, 48 | "Content-Type": "application/ssml+xml", 49 | "Ocp-Apim-Subscription-Key": self.api_key 50 | }, 
51 | "data": encoded_ssml 52 | } 53 | -------------------------------------------------------------------------------- /speech_gateway/source/openai_speech.py: -------------------------------------------------------------------------------- 1 | from . import StreamSource 2 | from typing import Dict 3 | from ..cache import CacheStorage 4 | from ..cache.file import FileCacheStorage 5 | from ..converter import FormatConverter 6 | from ..performance_recorder import PerformanceRecorder 7 | 8 | 9 | class OpenAIStreamSource(StreamSource): 10 | def __init__(self, 11 | *, 12 | api_key: str = None, 13 | base_url: str = "https://api.openai.com/v1", 14 | cache_storage: CacheStorage = None, 15 | format_converters: Dict[str, FormatConverter] = None, 16 | max_connections: int = 100, 17 | max_keepalive_connections: int = 20, 18 | timeout: float = 10.0, 19 | performance_recorder: PerformanceRecorder = None, 20 | debug: bool = False 21 | ): 22 | super().__init__( 23 | base_url=base_url, 24 | cache_storage=cache_storage or FileCacheStorage(cache_dir="openai_cache"), 25 | format_converters=format_converters, 26 | max_connections=max_connections, 27 | max_keepalive_connections=max_keepalive_connections, 28 | timeout=timeout, 29 | performance_recorder=performance_recorder, 30 | debug=debug 31 | ) 32 | self.base_url = base_url 33 | self.api_key = api_key 34 | 35 | def get_cache_key(self, audio_format: str, request_json: dict, **kwargs) -> str: 36 | if not audio_format: 37 | audio_format = request_json.get("response_format", "mp3") 38 | return f"{hash(str(request_json))}.{audio_format}" 39 | 40 | def parse_text(self, request_json: dict, **kwargs) -> str: 41 | return request_json.get("input") 42 | 43 | def make_stream_request(self, request_json: dict, **kwargs): 44 | if "azure" in self.base_url: 45 | url = self.base_url 46 | headers = {"api-key": self.api_key} 47 | else: 48 | url = f"{self.base_url}/audio/speech" 49 | headers = {"Authorization": f"Bearer {self.api_key}"} 50 | 51 | return { 52 | "method": "POST", 53 | "url": url, 54 | "headers": headers, 55 | "json": request_json 56 | } 57 | -------------------------------------------------------------------------------- /tests/source/test_sbv2_source.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from speech_gateway.source.sbv2 import StyleBertVits2StreamSource 4 | 5 | SBV2_URL = os.getenv("SBV2_URL") 6 | 7 | 8 | @pytest.fixture 9 | def source(): 10 | # Create an instance of StyleBertVits2StreamSource 11 | return StyleBertVits2StreamSource(base_url=SBV2_URL) 12 | 13 | @pytest.mark.asyncio 14 | async def test_get_cache_key(source): 15 | # Test get_cache_key method 16 | query_params = {"text": "こんにちは。これはテストです。", "voice": "test"} 17 | cache_key = source.get_cache_key("mp3", query_params) 18 | assert cache_key.endswith(".mp3") 19 | 20 | cache_key = source.get_cache_key("wav", query_params) 21 | assert cache_key.endswith(".wav") 22 | 23 | @pytest.mark.asyncio 24 | async def test_parse_text(source): 25 | # Test parse_text method 26 | query_params = {"text": "こんにちは。これはテストです。", "voice": "test"} 27 | text = source.parse_text(query_params) 28 | assert text == "こんにちは。これはテストです。" 29 | 30 | @pytest.mark.asyncio 31 | async def test_make_stream_request(source): 32 | # Test make_stream_request method 33 | query_params = {"text": "こんにちは。これはテストです。", "voice": "test"} 34 | request = source.make_stream_request(query_params) 35 | assert request["method"] == "GET" 36 | assert request["url"] == f"{SBV2_URL}/voice" 
37 | assert request["params"] == query_params 38 | 39 | @pytest.mark.asyncio 40 | async def test_fetch_stream_raw(source): 41 | # Test fetch_stream_raw with a real request (ensure server is running locally) 42 | query_params = {"text": "こんにちは。これはテストです。", "voice": "test"} 43 | http_request = source.make_stream_request(query_params) 44 | 45 | try: 46 | # Replace this part with a live test against the actual service 47 | async for chunk in source.fetch_stream_raw(http_request): 48 | assert isinstance(chunk, bytes) 49 | except Exception as e: 50 | pytest.fail(f"fetch_stream_raw failed: {e}") 51 | 52 | @pytest.mark.asyncio 53 | async def test_fetch_stream(source): 54 | # Test fetch_stream method with conversion and caching 55 | query_params = {"text": "こんにちは。", "voice": "test"} 56 | audio_format = "mp3" 57 | 58 | try: 59 | async for chunk in await source.fetch_stream(audio_format, query_params=query_params): 60 | assert isinstance(chunk, bytes) 61 | except Exception as e: 62 | pytest.fail(f"fetch_stream failed: {e}") 63 | -------------------------------------------------------------------------------- /docker/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | app: 3 | container_name: spgw-app 4 | build: 5 | context: . 6 | dockerfile: Dockerfile.app 7 | env_file: 8 | - .env 9 | environment: 10 | - DATABASE_URL=postgresql://${SPGW_DB_USER}:${SPGW_DB_PASSWORD}@db:5432/${SPGW_DB_NAME} 11 | ports: 12 | - "${PORT_SPGW}:8000" 13 | volumes: 14 | - spgw-app-cache:/app/cache 15 | depends_on: 16 | db: 17 | condition: service_healthy 18 | healthcheck: 19 | test: ["CMD", "curl", "-f", "http://localhost:8000/docs"] 20 | interval: 30s 21 | timeout: 10s 22 | retries: 3 23 | start_period: 40s 24 | restart: unless-stopped 25 | 26 | db: 27 | container_name: spgw-db 28 | image: postgres:16 29 | environment: 30 | - POSTGRES_USER=${POSTGRES_USER} 31 | - POSTGRES_PASSWORD=${POSTGRES_PASSWORD} 32 | - SPGW_DB_NAME=${SPGW_DB_NAME} 33 | - SPGW_DB_USER=${SPGW_DB_USER} 34 | - SPGW_DB_PASSWORD=${SPGW_DB_PASSWORD} 35 | ports: 36 | - "${PORT_DB}:5432" 37 | volumes: 38 | - spgw-postgres-data:/var/lib/postgresql/data 39 | - ./init-db.sh:/docker-entrypoint-initdb.d/init-db.sh:ro 40 | healthcheck: 41 | test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${SPGW_DB_NAME}"] 42 | interval: 10s 43 | timeout: 5s 44 | retries: 5 45 | start_period: 30s 46 | restart: unless-stopped 47 | 48 | pgadmin4: 49 | container_name: spgw-pgadmin4 50 | image: dpage/pgadmin4:8.14 51 | environment: 52 | PGADMIN_DEFAULT_EMAIL: ${PGADMIN_USER} 53 | PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_PASSWORD} 54 | PGADMIN_CONFIG_SERVER_MODE: "True" 55 | ports: 56 | - "${PORT_PGADMIN}:80" 57 | volumes: 58 | - spgw-pgadmin-data:/var/lib/pgadmin 59 | - ./pgadmin-servers.json:/pgadmin4/servers.json 60 | depends_on: 61 | db: 62 | condition: service_healthy 63 | restart: unless-stopped 64 | 65 | volumes: 66 | spgw-postgres-data: 67 | driver: local 68 | driver_opts: 69 | type: none 70 | o: bind 71 | device: ${DATA_PATH:-./data}/postgres 72 | spgw-pgadmin-data: 73 | driver: local 74 | driver_opts: 75 | type: none 76 | o: bind 77 | device: ${DATA_PATH:-./data}/pgadmin 78 | spgw-app-cache: 79 | driver: local 80 | driver_opts: 81 | type: none 82 | o: bind 83 | device: ${DATA_PATH:-./data}/cache 84 | -------------------------------------------------------------------------------- /speech_gateway/gateway/unified.py: -------------------------------------------------------------------------------- 1 | from typing 
import Dict, List 2 | from fastapi import HTTPException 3 | from fastapi import Request, APIRouter 4 | from . import SpeechGateway, UnifiedTTSRequest 5 | 6 | 7 | class UnifiedGateway(SpeechGateway): 8 | def __init__(self, *, default_gateway: SpeechGateway = None, default_language: str = "ja-JP", debug = False): 9 | super().__init__(stream_source=None, debug=debug) 10 | self.service_map: Dict[str, SpeechGateway] = {} 11 | self.language_map: Dict[str, SpeechGateway] = {} 12 | self.default_speakers: Dict[SpeechGateway, str] = {} 13 | self.default_gateway: SpeechGateway = default_gateway 14 | self.default_language = default_language 15 | 16 | def add_gateway(self, service_name: str, gateway: SpeechGateway, *, languages: List[str] = None, default_speaker: str = None, default: bool = False): 17 | self.service_map[service_name] = gateway 18 | if languages: 19 | for lang in languages: 20 | self.language_map[lang] = gateway 21 | if default: 22 | self.default_gateway = gateway 23 | self.language_map[self.default_language] = gateway 24 | self.default_speakers[gateway] = default_speaker 25 | 26 | def get_gateway(self, tts_request: UnifiedTTSRequest): 27 | if tts_request.service_name: 28 | return self.service_map.get(tts_request.service_name) 29 | elif tts_request.language: 30 | return self.language_map.get(tts_request.language) 31 | elif self.default_gateway: 32 | return self.default_gateway 33 | return None 34 | 35 | def get_router(self) -> APIRouter: 36 | router = APIRouter() 37 | self.register_endpoint(router) 38 | return router 39 | 40 | def register_endpoint(self, router: APIRouter): 41 | @router.post("/tts") 42 | async def post_tts(request: Request, tts_request: UnifiedTTSRequest, x_audio_format: str = "wav"): 43 | gateway = self.get_gateway(tts_request) 44 | 45 | if not gateway: 46 | raise HTTPException(status_code=404, detail="No gateway found.") 47 | 48 | if not tts_request.speaker: 49 | tts_request.speaker = self.default_speakers.get(gateway) 50 | 51 | return await gateway.unified_tts_handler(request, tts_request, x_audio_format) 52 | 53 | async def shutdown(self): 54 | pass 55 | --------------------------------------------------------------------------------
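The `/tts` endpoint above resolves a gateway by `service_name` first, then by `language`, then falls back to the default. A sketch of both request shapes (speaker IDs are placeholders; the language route assumes a gateway registered with `languages=["en-US"]`):

```python
# Two routing modes of UnifiedGateway's /tts endpoint (sketch).
import httpx

# 1) Explicit service selection (same shape as tests/gateway/test_sbv2.py)
httpx.post(
    "http://127.0.0.1:8000/tts",
    json={"text": "こんにちは。", "speaker": "0-0", "service_name": "sbv2"},
)

# 2) Language-based routing, assuming e.g.
#    unified_gateway.add_gateway("openai", openai_gateway, languages=["en-US"])
httpx.post(
    "http://127.0.0.1:8000/tts",
    json={"text": "Hello there.", "language": "en-US"},
)
```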
/speech_gateway/converter/wave.py: -------------------------------------------------------------------------------- 1 | import audioop 2 | import io 3 | import wave 4 | from typing import AsyncIterator 5 | from . import FormatConverter, FormatConverterError 6 | 7 | 8 | class WaveConverter(FormatConverter): 9 | def __init__(self, output_sample_rate: int = 16000, output_sample_width: int = 2): 10 | self.output_sample_rate = output_sample_rate 11 | self.output_sample_width = output_sample_width 12 | 13 | def convert_wave_bytes(self, input_bytes, output_sample_rate, output_sample_width): 14 | input_io = io.BytesIO(input_bytes) 15 | with wave.open(input_io, 'rb') as wf: 16 | input_sample_rate = wf.getframerate() 17 | input_sample_width = wf.getsampwidth() 18 | channels = wf.getnchannels() 19 | frames = wf.readframes(wf.getnframes()) 20 | 21 | # Convert sample rate 22 | if input_sample_rate != output_sample_rate: 23 | frames, _ = audioop.ratecv(frames, input_sample_width, channels, input_sample_rate, output_sample_rate, None) 24 | 25 | # Convert sample width 26 | if input_sample_width != output_sample_width: 27 | # 16 -> 8 28 | if input_sample_width == 2 and output_sample_width == 1: 29 | frames = audioop.lin2lin(frames, 2, 1) 30 | frames = audioop.bias(frames, 1, 128) 31 | # 8 -> 16 32 | elif input_sample_width == 1 and output_sample_width == 2: 33 | frames = audioop.bias(frames, 1, -128) 34 | frames = audioop.lin2lin(frames, 1, 2) 35 | else: 36 | frames = audioop.lin2lin(frames, input_sample_width, output_sample_width) 37 | 38 | output_io = io.BytesIO() 39 | with wave.open(output_io, "wb") as wf_out: 40 | wf_out.setframerate(output_sample_rate) 41 | wf_out.setsampwidth(output_sample_width) 42 | wf_out.setnchannels(channels) 43 | wf_out.writeframes(frames) 44 | 45 | return output_io.getvalue() 46 | 47 | async def convert(self, input_stream: AsyncIterator[bytes]) -> AsyncIterator[bytes]: 48 | try: 49 | wav_data = b"" 50 | async for chunk in input_stream: 51 | wav_data += chunk 52 | 53 | yield self.convert_wave_bytes(wav_data, self.output_sample_rate, self.output_sample_width) 54 | 55 | except Exception as ex: 56 | raise FormatConverterError(f"Error during WAV conversion: {str(ex)}") 57 | --------------------------------------------------------------------------------
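A usage sketch for the converter above, downsampling the bundled test file to 8 kHz / 8-bit. (Note that the stdlib `audioop` module it relies on was removed in Python 3.13; the Docker image pins Python 3.11.)

```python
# Downsample tests/data/test.wav to 8 kHz / 8-bit (usage sketch).
import asyncio
from speech_gateway.converter.wave import WaveConverter

async def main():
    converter = WaveConverter(output_sample_rate=8000, output_sample_width=1)

    async def input_stream():
        with open("tests/data/test.wav", "rb") as f:
            yield f.read()

    async for converted in converter.convert(input_stream()):
        with open("test_8k.wav", "wb") as f:
            f.write(converted)

asyncio.run(main())
```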
/speech_gateway/cache/file.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import AsyncIterator 3 | import aiofiles 4 | from . import CacheStorage, CacheStorageError 5 | 6 | 7 | class FileCacheStorage(CacheStorage): 8 | def __init__(self, cache_dir: str = "voice_cache"): 9 | self.cache_dir = Path(cache_dir) 10 | if not self.cache_dir.exists(): 11 | self.cache_dir.mkdir(parents=True) 12 | 13 | async def has_cache(self, cache_key: str) -> bool: 14 | file_path = self.cache_dir / cache_key 15 | if not file_path.exists(): 16 | return False 17 | 18 | if file_path.stat().st_size == 0: 19 | await self.delete_cache(cache_key) 20 | return False 21 | 22 | return True 23 | 24 | async def fetch_cache_stream(self, cache_key: str) -> AsyncIterator[bytes]: 25 | try: 26 | file_path = self.cache_dir / cache_key 27 | async with aiofiles.open(file_path, mode="rb") as file: 28 | while chunk := await file.read(1024): 29 | yield chunk 30 | 31 | except Exception as ex: 32 | raise CacheStorageError(f"Error reading file {file_path}: {str(ex)}") 33 | 34 | async def write_cache(self, input_stream: AsyncIterator[bytes], cache_key: str) -> AsyncIterator[bytes]: 35 | file_path = self.cache_dir / cache_key 36 | try: 37 | async with aiofiles.open(file_path, "wb") as file: 38 | async for chunk in input_stream: 39 | await file.write(chunk) 40 | await file.flush() 41 | yield chunk 42 | 43 | except Exception as ex: 44 | # Clean up partial file if it was created 45 | if file_path.exists(): 46 | try: 47 | file_path.unlink() 48 | except Exception: 49 | pass 50 | raise CacheStorageError(f"Error during file save operation: {str(ex)}") 51 | 52 | async def delete_cache(self, cache_key: str) -> None: 53 | file_path = self.cache_dir / cache_key 54 | try: 55 | if file_path.exists(): 56 | file_path.unlink() 57 | 58 | except Exception as ex: 59 | raise CacheStorageError(f"Error deleting cache file {file_path}: {str(ex)}") 60 | 61 | async def clear_all_cache(self) -> None: 62 | try: 63 | for file_path in self.cache_dir.iterdir(): 64 | if file_path.is_file(): 65 | file_path.unlink() 66 | 67 | except Exception as ex: 68 | raise CacheStorageError(f"Error clearing cache directory {self.cache_dir}: {str(ex)}") 69 | --------------------------------------------------------------------------------
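How the storage above fits the streaming path (sketch): `write_cache` tees chunks to disk while yielding them onward, so a later request with the same key can be served from `fetch_cache_stream` without touching the TTS engine.

```python
# Write-through caching with FileCacheStorage (sketch; fake audio bytes).
import asyncio
from speech_gateway.cache.file import FileCacheStorage

async def main():
    storage = FileCacheStorage(cache_dir="voice_cache")

    async def synthesized_stream():
        yield b"fake audio chunk 1"
        yield b"fake audio chunk 2"

    # Chunks are persisted and passed through at the same time
    async for chunk in storage.write_cache(synthesized_stream(), "demo.wav"):
        pass

    if await storage.has_cache("demo.wav"):
        data = b""
        async for chunk in storage.fetch_cache_stream("demo.wav"):
            data += chunk
        print(len(data))

asyncio.run(main())
```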
request["headers"]["Ocp-Apim-Subscription-Key"] == source.api_key 36 | assert request["data"] == b"dummy" 37 | 38 | @pytest.mark.asyncio 39 | async def test_fetch_stream_raw(source): 40 | # Test fetch_stream_raw with a real request (ensure server is running locally) 41 | ssml_text = f"こんにちは。これは音声合成のテストです。" 42 | http_request = source.make_stream_request(ssml_text.encode("utf-8"), "riff-16khz-16bit-mono-pcm") 43 | 44 | try: 45 | # Replace this part with a live test against the actual service 46 | async for chunk in source.fetch_stream_raw(http_request): 47 | assert isinstance(chunk, bytes) 48 | except Exception as e: 49 | pytest.fail(f"fetch_stream_raw failed: {e}") 50 | 51 | @pytest.mark.asyncio 52 | async def test_fetch_stream(source): 53 | # Test fetch_stream method with conversion and caching 54 | ssml_text = f"こんにちは。これは音声合成のテストです。" 55 | audio_format = "mp3" 56 | 57 | try: 58 | async for chunk in await source.fetch_stream(audio_format, azure_audio_format="audio-16khz-32kbitrate-mono-mp3", encoded_ssml=ssml_text.encode("utf-8")): 59 | assert isinstance(chunk, bytes) 60 | except Exception as e: 61 | pytest.fail(f"fetch_stream failed: {e}") 62 | -------------------------------------------------------------------------------- /tests/source/test_nijivoice_encoded_source.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from speech_gateway.source.nijivoice_encoded import NijiVoiceEncodedStreamSource 4 | 5 | BASE_URL = "https://api.nijivoice.com" 6 | GATEWAY_BASE_URL = "http://127.0.0.1:8000/nijivoice" 7 | NIJIVOICE_API_KEY = os.getenv("NIJIVOICE_API_KEY") 8 | VOICE_ACTOR_ID = "a192db5f-bd8b-4fc7-bc08-af5ca5957c12" 9 | PAYLOAD = { 10 | "script": "こんにちは。これはテストです。", 11 | "speed": "1.0", 12 | "emotionalLevel": "0.1", 13 | "soundDuration": "0.1", 14 | "format": "mp3", 15 | } 16 | 17 | 18 | @pytest.fixture 19 | def source(): 20 | # Create an instance of NijiVoiceEncodedStreamSource 21 | return NijiVoiceEncodedStreamSource(base_url=BASE_URL, api_key=NIJIVOICE_API_KEY, debug=True) 22 | 23 | @pytest.mark.asyncio 24 | async def test_get_cache_key(source): 25 | # Test get_cache_key method 26 | cache_key = source.get_cache_key("mp3", VOICE_ACTOR_ID, PAYLOAD) 27 | assert cache_key.endswith(".mp3.json") 28 | assert VOICE_ACTOR_ID in cache_key 29 | 30 | cache_key = source.get_cache_key("wav", VOICE_ACTOR_ID, PAYLOAD) 31 | assert cache_key.endswith(".wav.json") 32 | assert VOICE_ACTOR_ID in cache_key 33 | 34 | @pytest.mark.asyncio 35 | async def test_parse_text(source): 36 | # Test parse_text method 37 | text = source.parse_text(request_json=PAYLOAD) 38 | assert text == PAYLOAD["script"] 39 | 40 | @pytest.mark.asyncio 41 | async def test_make_stream_request(source): 42 | # Test make_stream_request method 43 | request = source.make_stream_request(VOICE_ACTOR_ID, PAYLOAD) 44 | assert request["method"] == "POST" 45 | assert request["url"] == f"{BASE_URL}/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-encoded-voice" 46 | assert request["headers"]["x-api-key"] == NIJIVOICE_API_KEY 47 | assert request["json"] == PAYLOAD 48 | 49 | @pytest.mark.asyncio 50 | async def test_fetch_stream_raw(source): 51 | # Test fetch_stream_raw with a real request (ensure server is running locally) 52 | http_request = source.make_stream_request(VOICE_ACTOR_ID, PAYLOAD) 53 | 54 | try: 55 | # Replace this part with a live test against the actual service 56 | async for chunk in source.fetch_stream_raw(http_request): 57 | assert isinstance(chunk, bytes) 58 | 
except Exception as e: 59 | pytest.fail(f"fetch_stream_raw failed: {e}") 60 | 61 | @pytest.mark.asyncio 62 | async def test_fetch_stream(source): 63 | # Test fetch_stream method with conversion and caching 64 | try: 65 | async for chunk in await source.fetch_stream( 66 | audio_format="mp3", 67 | voice_actor_id=VOICE_ACTOR_ID, 68 | request_json=PAYLOAD, 69 | ): 70 | assert isinstance(chunk, bytes) 71 | except Exception as e: 72 | pytest.fail(f"fetch_stream failed: {e}") 73 | -------------------------------------------------------------------------------- /speech_gateway/gateway/voicevox.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from fastapi import APIRouter, Request 3 | from fastapi.responses import StreamingResponse 4 | from . import SpeechGateway, UnifiedTTSRequest 5 | from ..cache.file import FileCacheStorage 6 | from ..converter.mp3 import MP3Converter 7 | from ..performance_recorder import SQLitePerformanceRecorder 8 | from ..source.voicevox import VoicevoxStreamSource 9 | 10 | 11 | class VoicevoxGateway(SpeechGateway): 12 | def __init__(self, *, stream_source: VoicevoxStreamSource = None, base_url: str = None, cache_dir: str = None, style_mapper: Dict[str, Dict[str, str]] = None, debug = False): 13 | self.stream_source: VoicevoxStreamSource = None 14 | if stream_source: 15 | super().__init__(stream_source=stream_source, debug=debug) 16 | else: 17 | super().__init__( 18 | stream_source=VoicevoxStreamSource( 19 | base_url=base_url or "http://127.0.0.1:50021", 20 | cache_storage=FileCacheStorage(cache_dir=cache_dir or "voicevox_cache"), 21 | format_converters={"mp3": MP3Converter(bitrate="64k")}, 22 | performance_recorder=SQLitePerformanceRecorder(), 23 | debug=debug 24 | ), 25 | debug=debug 26 | ) 27 | self.style_mapper = style_mapper or {} 28 | 29 | def register_endpoint(self, router: APIRouter): 30 | @router.post("/synthesis") 31 | async def synthesis_handler(speaker: str, request: Request, x_audio_format: str = "wav"): 32 | audio_format = "mp3" if x_audio_format == "mp3" else "wav" 33 | stream_resp = await self.stream_source.fetch_stream( 34 | audio_format=audio_format, 35 | speaker=speaker, 36 | audio_query=await request.json(), 37 | ) 38 | return StreamingResponse(stream_resp, media_type=f"audio/{audio_format}") 39 | 40 | async def unified_tts_handler(self, request: Request, tts_request: UnifiedTTSRequest, x_audio_format: str = "wav"): 41 | speaker = tts_request.speaker 42 | 43 | # Apply style 44 | if tts_request.style is not None and (styles_for_speaker := self.style_mapper.get(tts_request.speaker)): 45 | for k, v in styles_for_speaker.items(): 46 | if k.lower() == tts_request.style.lower(): 47 | speaker = v 48 | break 49 | 50 | audio_query = await self.stream_source.get_audio_query(speaker, tts_request.text) 51 | 52 | if tts_request.speed: 53 | audio_query["speedScale"] = tts_request.speed 54 | 55 | stream_resp = await self.stream_source.fetch_stream( 56 | audio_format=x_audio_format, 57 | speaker=speaker, 58 | audio_query=audio_query, 59 | ) 60 | return StreamingResponse(stream_resp, media_type=f"audio/{x_audio_format}") 61 | -------------------------------------------------------------------------------- /tests/gateway/test_sbv2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import httpx 4 | 5 | 6 | @pytest.mark.asyncio 7 | async def test_sbv2(random_text, wave_checker, audio_transcriber): 8 | query_params = { 9 | "text": 
random_text, 10 | "model_id": "0", 11 | "speaker_id": "0" 12 | } 13 | resp = httpx.get("http://127.0.0.1:8000/sbv2/voice", params=query_params) 14 | audio_data = resp.content 15 | assert wave_checker(audio_data) 16 | assert "音声合成" in audio_transcriber(audio_data, "wav") 17 | 18 | 19 | @pytest.mark.asyncio 20 | async def test_sbv2_wav(random_text, wave_checker, audio_transcriber): 21 | query_params = { 22 | "text": random_text, 23 | "model_id": "0", 24 | "speaker_id": "0", 25 | "x_audio_format": "wav" 26 | } 27 | resp = httpx.get("http://127.0.0.1:8000/sbv2/voice", params=query_params) 28 | audio_data = resp.content 29 | assert wave_checker(audio_data) 30 | assert "音声合成" in audio_transcriber(audio_data, "wav") 31 | 32 | 33 | @pytest.mark.asyncio 34 | async def test_sbv2_mp3(random_text, mp3_checker, audio_transcriber): 35 | query_params = { 36 | "text": random_text, 37 | "model_id": "0", 38 | "speaker_id": "0", 39 | "x_audio_format": "mp3" 40 | } 41 | resp = httpx.get("http://127.0.0.1:8000/sbv2/voice", params=query_params) 42 | audio_data = resp.content 43 | assert mp3_checker(audio_data) 44 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 45 | 46 | 47 | @pytest.mark.asyncio 48 | async def test_sbv2_unified(random_text, wave_checker, audio_transcriber): 49 | req = { 50 | "text": random_text, 51 | "speaker": "0-0", 52 | "service_name": "sbv2" 53 | } 54 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req) 55 | audio_data = resp.content 56 | assert wave_checker(audio_data) 57 | assert "音声合成" in audio_transcriber(audio_data, "wav") 58 | 59 | 60 | @pytest.mark.asyncio 61 | async def test_sbv2_unified_wav(random_text, wave_checker, audio_transcriber): 62 | req = { 63 | "text": random_text, 64 | "speaker": "0-0", 65 | "service_name": "sbv2" 66 | } 67 | query_params = { 68 | "x_audio_format": "wav" 69 | } 70 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 71 | audio_data = resp.content 72 | assert wave_checker(audio_data) 73 | assert "音声合成" in audio_transcriber(audio_data, "wav") 74 | 75 | 76 | @pytest.mark.asyncio 77 | async def test_sbv2_unified_mp3(random_text, mp3_checker, audio_transcriber): 78 | req = { 79 | "text": random_text, 80 | "speaker": "0-0", 81 | "service_name": "sbv2" 82 | } 83 | query_params = { 84 | "x_audio_format": "mp3" 85 | } 86 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 87 | audio_data = resp.content 88 | assert mp3_checker(audio_data) 89 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 90 | -------------------------------------------------------------------------------- /tests/cache/test_file.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from speech_gateway.cache import FileCacheStorage 3 | 4 | 5 | @pytest.fixture 6 | def temp_cache_dir(tmp_path): 7 | # Create a temporary cache directory for testing 8 | cache_dir = tmp_path / "test_cache" 9 | cache_dir.mkdir() 10 | return cache_dir 11 | 12 | 13 | @pytest.fixture 14 | def file_cache_storage(temp_cache_dir): 15 | # Create a FileCacheStorage instance using the temporary directory 16 | return FileCacheStorage(cache_dir=str(temp_cache_dir)) 17 | 18 | 19 | @pytest.mark.asyncio 20 | async def test_has_cache(file_cache_storage, temp_cache_dir): 21 | # Test has_cache method 22 | cache_key = "test_file" 23 | file_path = temp_cache_dir / cache_key 24 | 25 | # Case 1: File does not exist 26 | assert not await file_cache_storage.has_cache(cache_key) 27 | 28 | # Case 2: File exists and has 
content 29 | file_path.write_text("test content") 30 | assert await file_cache_storage.has_cache(cache_key) 31 | 32 | # Case 3: File exists but is empty 33 | file_path.write_text("") 34 | assert not await file_cache_storage.has_cache(cache_key) 35 | assert not file_path.exists() # Should be deleted 36 | 37 | 38 | @pytest.mark.asyncio 39 | async def test_fetch_cache_stream(file_cache_storage, temp_cache_dir): 40 | # Test fetch_cache_stream method 41 | cache_key = "test_file" 42 | file_path = temp_cache_dir / cache_key 43 | content = b"This is test content." 44 | file_path.write_bytes(content) 45 | 46 | result = b"" 47 | async for chunk in file_cache_storage.fetch_cache_stream(cache_key): 48 | result += chunk 49 | 50 | assert result == content 51 | 52 | 53 | @pytest.mark.asyncio 54 | async def test_write_cache(file_cache_storage, temp_cache_dir): 55 | # Test write_cache method 56 | cache_key = "test_file" 57 | file_path = temp_cache_dir / cache_key 58 | 59 | async def input_stream(): 60 | yield b"Part 1 " 61 | yield b"Part 2" 62 | 63 | result = b"" 64 | async for chunk in file_cache_storage.write_cache(input_stream(), cache_key): 65 | result += chunk 66 | 67 | assert file_path.exists() 68 | assert file_path.read_bytes() == b"Part 1 Part 2" 69 | assert result == b"Part 1 Part 2" 70 | 71 | 72 | @pytest.mark.asyncio 73 | async def test_delete_cache(file_cache_storage, temp_cache_dir): 74 | # Test delete_cache method 75 | cache_key = "test_file" 76 | file_path = temp_cache_dir / cache_key 77 | file_path.write_text("test content") 78 | 79 | await file_cache_storage.delete_cache(cache_key) 80 | assert not file_path.exists() 81 | 82 | 83 | @pytest.mark.asyncio 84 | async def test_clear_all_cache(file_cache_storage, temp_cache_dir): 85 | # Test clear_all_cache method 86 | (temp_cache_dir / "file1").write_text("content1") 87 | (temp_cache_dir / "file2").write_text("content2") 88 | 89 | await file_cache_storage.clear_all_cache() 90 | 91 | assert len(list(temp_cache_dir.iterdir())) == 0 92 | -------------------------------------------------------------------------------- /tests/converter/test_wave.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import wave 4 | import io 5 | from typing import AsyncIterator 6 | from speech_gateway.converter.wave import WaveConverter, FormatConverterError 7 | 8 | 9 | @pytest.fixture 10 | def wave_converter(): 11 | return WaveConverter() 12 | 13 | 14 | @pytest.fixture 15 | def wave_converter_custom(): 16 | return WaveConverter(output_sample_rate=8000, output_sample_width=1) 17 | 18 | 19 | @pytest.mark.asyncio 20 | async def test_wave_conversion(wave_converter): 21 | input_file = "tests/data/test.wav" 22 | 23 | async def input_stream() -> AsyncIterator[bytes]: 24 | with open(input_file, "rb") as f: 25 | while chunk := f.read(1024): 26 | yield chunk 27 | 28 | output = b"" 29 | try: 30 | async for chunk in wave_converter.convert(input_stream()): 31 | output += chunk 32 | except FormatConverterError as e: 33 | pytest.fail(f"Wave conversion failed with error: {e}") 34 | 35 | assert output != b"" 36 | 37 | with wave.open(io.BytesIO(output), 'rb') as wf: 38 | assert wf.getframerate() == 16000 39 | assert wf.getsampwidth() == 2 40 | 41 | 42 | @pytest.mark.asyncio 43 | async def test_wave_conversion_custom_params(wave_converter_custom): 44 | input_file = "tests/data/test.wav" 45 | 46 | async def input_stream() -> AsyncIterator[bytes]: 47 | with open(input_file, "rb") as f: 48 | while chunk := f.read(1024): 49 
| yield chunk 50 | 51 | output = b"" 52 | try: 53 | async for chunk in wave_converter_custom.convert(input_stream()): 54 | output += chunk 55 | except FormatConverterError as e: 56 | pytest.fail(f"Wave conversion failed with error: {e}") 57 | 58 | assert output != b"" 59 | 60 | with wave.open(io.BytesIO(output), 'rb') as wf: 61 | assert wf.getframerate() == 8000 62 | assert wf.getsampwidth() == 1 63 | 64 | 65 | @pytest.mark.asyncio 66 | async def test_wave_conversion_error_handling(wave_converter): 67 | async def input_stream() -> AsyncIterator[bytes]: 68 | yield b"Invalid wave data" 69 | 70 | with pytest.raises(FormatConverterError) as exc_info: 71 | async for _ in wave_converter.convert(input_stream()): 72 | pass 73 | 74 | assert "Error during WAV conversion" in str(exc_info.value) 75 | 76 | 77 | @pytest.mark.asyncio 78 | async def test_convert_wave_bytes(): 79 | converter = WaveConverter(output_sample_rate=8000, output_sample_width=1) 80 | 81 | input_io = io.BytesIO() 82 | with wave.open(input_io, 'wb') as wf: 83 | wf.setframerate(16000) 84 | wf.setsampwidth(2) 85 | wf.setnchannels(1) 86 | wf.writeframes(b'\x00\x00' * 1000) 87 | 88 | input_bytes = input_io.getvalue() 89 | output_bytes = converter.convert_wave_bytes(input_bytes, 8000, 1) 90 | 91 | assert output_bytes != b"" 92 | 93 | with wave.open(io.BytesIO(output_bytes), 'rb') as wf: 94 | assert wf.getframerate() == 8000 95 | assert wf.getsampwidth() == 1 96 | assert wf.getnchannels() == 1 97 | -------------------------------------------------------------------------------- /speech_gateway/performance_recorder/sqlite.py: -------------------------------------------------------------------------------- 1 | from dataclasses import fields 2 | from datetime import datetime, timezone 3 | import queue 4 | import sqlite3 5 | import threading 6 | from . 
import PerformanceRecorder, PerformanceRecord 7 | 8 | 9 | class SQLitePerformanceRecorder(PerformanceRecorder): 10 | def __init__(self, db_path="performance.db"): 11 | self.db_path = db_path 12 | self.record_queue = queue.Queue() 13 | self.stop_event = threading.Event() 14 | 15 | self.init_db() 16 | 17 | self.worker_thread = threading.Thread(target=self.start_worker, daemon=True) 18 | self.worker_thread.start() 19 | 20 | def init_db(self): 21 | conn = sqlite3.connect(self.db_path) 22 | try: 23 | with conn: 24 | conn.execute( 25 | """ 26 | CREATE TABLE IF NOT EXISTS performance_records ( 27 | id INTEGER PRIMARY KEY AUTOINCREMENT, 28 | process_id TEXT NOT NULL, 29 | created_at TEXT NOT NULL, 30 | source TEXT, 31 | text TEXT, 32 | audio_format TEXT, 33 | cached INTEGER, 34 | elapsed REAL 35 | ) 36 | """ 37 | ) 38 | finally: 39 | conn.close() 40 | 41 | def start_worker(self): 42 | conn = sqlite3.connect(self.db_path) 43 | try: 44 | while not self.stop_event.is_set() or not self.record_queue.empty(): 45 | try: 46 | record = self.record_queue.get(timeout=0.5) 47 | except queue.Empty: 48 | continue 49 | 50 | self.insert_record(conn, record) 51 | self.record_queue.task_done() 52 | finally: 53 | conn.close() 54 | 55 | def insert_record(self, conn: sqlite3.Connection, record: PerformanceRecord): 56 | columns = [field.name for field in fields(PerformanceRecord)] + ["created_at"] 57 | placeholders = ["?"] * len(columns) 58 | values = [getattr(record, field.name) for field in fields(PerformanceRecord)] + [datetime.now(timezone.utc).isoformat()] 59 | sql = f"INSERT INTO performance_records ({', '.join(columns)}) VALUES ({', '.join(placeholders)})" 60 | conn.execute(sql, values) 61 | conn.commit() 62 | 63 | def record( 64 | self, 65 | *, 66 | process_id: str, 67 | source: str = None, 68 | text: str = None, 69 | audio_format: str = None, 70 | cached: int = 0, 71 | elapsed: float = None, 72 | ): 73 | performance_record = PerformanceRecord( 74 | process_id=process_id, 75 | source=source, 76 | text=text, 77 | audio_format=audio_format, 78 | cached=cached, 79 | elapsed=elapsed 80 | ) 81 | 82 | self.record_queue.put(performance_record) 83 | 84 | def close(self): 85 | self.stop_event.set() 86 | self.record_queue.join() 87 | self.worker_thread.join() 88 | -------------------------------------------------------------------------------- /speech_gateway/gateway/openai_speech.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Request 2 | from fastapi.responses import StreamingResponse 3 | from . 
import SpeechGateway, UnifiedTTSRequest 4 | from ..cache.file import FileCacheStorage 5 | from ..performance_recorder import PerformanceRecorder, SQLitePerformanceRecorder 6 | from ..source.openai_speech import OpenAIStreamSource 7 | 8 | 9 | class OpenAIGateway(SpeechGateway): 10 | def __init__(self, *, stream_source: OpenAIStreamSource = None, api_key: str = None, model: str = "tts-1", speed: float = 1.0, instructions: str = None, base_url: str = None, cache_dir: str = None, performance_recorder: PerformanceRecorder = None, debug = False): 11 | self.stream_source: OpenAIStreamSource = None 12 | if stream_source: 13 | super().__init__(stream_source=stream_source, debug=debug) 14 | else: 15 | super().__init__( 16 | stream_source=OpenAIStreamSource( 17 | api_key=api_key, 18 | base_url=base_url or "https://api.openai.com/v1", 19 | cache_storage=FileCacheStorage(cache_dir=cache_dir or "openai_cache"), 20 | format_converters={}, 21 | performance_recorder=performance_recorder or SQLitePerformanceRecorder(), 22 | debug=debug 23 | ), 24 | debug=debug 25 | ) 26 | self.model = model 27 | self.speed = speed 28 | self.instructions = instructions 29 | 30 | def register_endpoint(self, router: APIRouter): 31 | @router.post("/audio/speech") 32 | async def synthesis_handler(request: Request, x_audio_format: str = None): 33 | request_json = await request.json() 34 | 35 | if x_audio_format: 36 | if x_audio_format in ["mp3", "opus", "aac", "flac", "wav", "pcm"]: 37 | request_json["response_format"] = x_audio_format 38 | else: 39 | # Set wave to convert to other format later 40 | request_json["response_format"] = "wav" 41 | else: 42 | x_audio_format = request_json.get("response_format", "mp3") 43 | 44 | stream_resp = await self.stream_source.fetch_stream( 45 | request_json=request_json, 46 | audio_format=x_audio_format 47 | ) 48 | return StreamingResponse(stream_resp, media_type=f"audio/{x_audio_format}") 49 | 50 | async def unified_tts_handler(self, request: Request, tts_request: UnifiedTTSRequest, x_audio_format: str = "wav"): 51 | request_json = { 52 | "model": self.model, 53 | "voice": tts_request.speaker, 54 | "input": tts_request.text, 55 | "speed": tts_request.speed or self.speed, 56 | "instructions": self.instructions, 57 | "response_format": x_audio_format 58 | } 59 | 60 | stream_resp = await self.stream_source.fetch_stream( 61 | audio_format=x_audio_format, 62 | request_json=request_json, 63 | ) 64 | return StreamingResponse(stream_resp, media_type=f"audio/{x_audio_format}") 65 | -------------------------------------------------------------------------------- /speech_gateway/converter/mulaw.py: -------------------------------------------------------------------------------- 1 | import audioop 2 | import io 3 | import struct 4 | from typing import AsyncIterator 5 | import wave 6 | from . 
import FormatConverter, FormatConverterError 7 | 8 | 9 | class MuLawConverter(FormatConverter): 10 | def __init__(self, rate: int = 8000, include_header: bool = False, to_linear16: callable = None): 11 | self.rate = rate 12 | self.include_header = include_header 13 | self.to_linear16 = to_linear16 14 | 15 | def create_au_header(self, data_size: int, sample_rate: int, channels: int) -> bytes: 16 | magic_number = b".snd" # Magic number 17 | header_size = 28 # Data offset: 24-byte standard .au header plus the 4-byte reserved field below 18 | encoding = 1 # Mu-Law encoding 19 | reserved = 0 # Reserved field (counted in the data offset), must be 0 20 | 21 | # Create header 22 | header = struct.pack( 23 | ">4sIIIIII", # Big-endian: 4-char string, 6 unsigned integers 24 | magic_number, # Magic number 25 | header_size, # Data offset 26 | data_size, # Data size 27 | encoding, # Encoding format 28 | sample_rate, # Sample rate 29 | channels, # Number of channels 30 | reserved # Reserved field 31 | ) 32 | return header 33 | 34 | async def convert(self, input_stream: AsyncIterator[bytes]) -> AsyncIterator[bytes]: 35 | try: 36 | # Load whole wave data 37 | wav_data = b"" 38 | async for chunk in input_stream: 39 | wav_data += chunk 40 | 41 | if self.to_linear16: 42 | wav_data = self.to_linear16(wav_data) 43 | 44 | # Parse wave info 45 | with wave.open(io.BytesIO(wav_data), "rb") as wf: 46 | nchannels = wf.getnchannels() 47 | sampwidth = wf.getsampwidth() 48 | framerate = wf.getframerate() 49 | nframes = wf.getnframes() 50 | raw_frames = wf.readframes(nframes) 51 | 52 | # Convert channel 53 | if nchannels > 1: 54 | mono_frames = audioop.tomono(raw_frames, sampwidth, 0.5, 0.5) 55 | else: 56 | mono_frames = raw_frames 57 | 58 | # Convert sample rate 59 | if framerate != self.rate: 60 | converted_frames, _ = audioop.ratecv( 61 | mono_frames, 62 | sampwidth, 63 | 1, 64 | framerate, 65 | self.rate, 66 | None 67 | ) 68 | else: 69 | converted_frames = mono_frames 70 | 71 | # Convert format 72 | mulaw_data = audioop.lin2ulaw(converted_frames, sampwidth) 73 | 74 | if self.include_header: 75 | # Create .au header 76 | header = self.create_au_header(len(mulaw_data), self.rate, 1) 77 | mulaw_data = header + mulaw_data 78 | 79 | # Return whole data at once 80 | yield mulaw_data 81 | 82 | except Exception as ex: 83 | raise FormatConverterError(f"Error during Mu-Law conversion: {str(ex)}") from ex 84 | -------------------------------------------------------------------------------- /speech_gateway/gateway/sbv2.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from fastapi import APIRouter, Request 3 | from fastapi.responses import StreamingResponse 4 | from . 
import SpeechGateway, UnifiedTTSRequest 5 | from ..cache.file import FileCacheStorage 6 | from ..converter.mp3 import MP3Converter 7 | from ..performance_recorder import PerformanceRecorder, SQLitePerformanceRecorder 8 | from ..source.sbv2 import StyleBertVits2StreamSource 9 | 10 | 11 | class StyleBertVits2Gateway(SpeechGateway): 12 | def __init__(self, *, stream_source: StyleBertVits2StreamSource = None, base_url: str = None, cache_dir: str = None, performance_recorder: PerformanceRecorder = None, style_mapper: Dict[str, Dict[str, str]] = None, debug = False): 13 | self.stream_source: StyleBertVits2StreamSource = None 14 | if stream_source: 15 | super().__init__(stream_source=stream_source, debug=debug) 16 | else: 17 | super().__init__( 18 | stream_source=StyleBertVits2StreamSource( 19 | base_url=base_url or "http://127.0.0.1:5000", 20 | cache_storage=FileCacheStorage(cache_dir=cache_dir or "sbv2_cache"), 21 | format_converters={"mp3": MP3Converter(bitrate="64k")}, 22 | performance_recorder=performance_recorder or SQLitePerformanceRecorder(), 23 | debug=debug 24 | ), 25 | debug=debug 26 | ) 27 | self.style_mapper = style_mapper or {} 28 | 29 | def register_endpoint(self, router: APIRouter): 30 | @router.get("/voice") 31 | async def get_voice_handler(request: Request): 32 | query_params = dict(request.query_params) 33 | filtered_params = { 34 | k: v for k, v in query_params.items() if v is not None and k not in {"x_audio_format"} 35 | } 36 | audio_format = query_params.get("x_audio_format", "wav") 37 | 38 | stream_resp = await self.stream_source.fetch_stream( 39 | audio_format=audio_format, 40 | query_params=filtered_params, 41 | ) 42 | return StreamingResponse(stream_resp, media_type=f"audio/{audio_format}") 43 | 44 | async def unified_tts_handler(self, request: Request, tts_request: UnifiedTTSRequest, x_audio_format: str = "wav"): 45 | # Basic params 46 | model_id, speaker_id = tts_request.speaker.split("-") 47 | query_params = { 48 | "text": tts_request.text, 49 | "model_id": model_id, 50 | "speaker_id": speaker_id 51 | } 52 | 53 | if tts_request.speed: 54 | query_params["length"] = 1 / tts_request.speed 55 | 56 | # Apply style 57 | if tts_request.style is not None and (styles_for_speaker := self.style_mapper.get(tts_request.speaker)): 58 | for k, v in styles_for_speaker.items(): 59 | if k.lower() == tts_request.style.lower(): 60 | query_params["style"] = v 61 | break 62 | 63 | # Additional params 64 | for k, v in dict(request.query_params).items(): 65 | if v is not None and k not in {"x_audio_format"}: 66 | query_params[k] = v 67 | 68 | stream_resp = await self.stream_source.fetch_stream( 69 | audio_format=x_audio_format, 70 | query_params=query_params, 71 | ) 72 | 73 | return StreamingResponse(stream_resp, media_type=f"audio/{x_audio_format}") 74 | -------------------------------------------------------------------------------- /speech_gateway/gateway/azure.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Request 2 | from fastapi.responses import StreamingResponse 3 | from . 
import SpeechGateway, UnifiedTTSRequest 4 | from ..cache.file import FileCacheStorage 5 | from ..performance_recorder import PerformanceRecorder, SQLitePerformanceRecorder 6 | from ..source.azure import AzureStreamSource 7 | 8 | 9 | class AzureGateway(SpeechGateway): 10 | def __init__(self, *, stream_source: AzureStreamSource = None, api_key: str = None, region: str = None, base_url: str = None, language: str = "ja-JP", cache_dir: str = None, performance_recorder: PerformanceRecorder = None, debug = False): 11 | self.stream_source: AzureStreamSource = None 12 | if stream_source: 13 | super().__init__(stream_source=stream_source, debug=debug) 14 | else: 15 | super().__init__( 16 | stream_source=AzureStreamSource( 17 | api_key=api_key, 18 | region=region, 19 | base_url=base_url or "https://{region}.tts.speech.microsoft.com/cognitiveservices/v1", 20 | cache_storage=FileCacheStorage(cache_dir=cache_dir or "azure_cache"), 21 | format_converters={}, 22 | performance_recorder=performance_recorder or SQLitePerformanceRecorder(), 23 | debug=debug 24 | ), 25 | debug=debug 26 | ) 27 | self.default_language = language 28 | 29 | def register_endpoint(self, router: APIRouter): 30 | @router.post("/cognitiveservices/v1") 31 | async def synthesis_handler(request: Request, x_audio_format: str = None): 32 | if x_audio_format == "wav": 33 | azure_audio_format = "riff-16khz-16bit-mono-pcm" 34 | elif x_audio_format == "mp3": 35 | azure_audio_format = "audio-16khz-32kbitrate-mono-mp3" 36 | else: 37 | azure_audio_format = request.headers["X-Microsoft-OutputFormat"] 38 | if "pcm" in azure_audio_format: 39 | x_audio_format = "wav" 40 | else: 41 | x_audio_format = "mp3" 42 | 43 | stream_resp = await self.stream_source.fetch_stream( 44 | encoded_ssml=await request.body(), 45 | azure_audio_format=azure_audio_format, 46 | audio_format=x_audio_format 47 | ) 48 | return StreamingResponse(stream_resp, media_type=f"audio/{x_audio_format}") 49 | 50 | async def unified_tts_handler(self, request: Request, tts_request: UnifiedTTSRequest, x_audio_format: str = "wav"): 51 | if x_audio_format == "mp3": 52 | azure_audio_format = "audio-16khz-32kbitrate-mono-mp3" 53 | else: # Fall back to 16 kHz PCM wav for any other requested format 54 | azure_audio_format = "riff-16khz-16bit-mono-pcm" 55 | 56 | if tts_request.speed: 57 | speed_percentage = (tts_request.speed - 1.0) * 100 58 | else: 59 | speed_percentage = 0 60 | ssml_text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{self.default_language}'><voice name='{tts_request.speaker}'><prosody rate='{speed_percentage:+.0f}%'>{tts_request.text}</prosody></voice></speak>" 61 | 62 | stream_resp = await self.stream_source.fetch_stream( 63 | encoded_ssml=ssml_text.encode("utf-8"), 64 | azure_audio_format=azure_audio_format, 65 | audio_format=x_audio_format 66 | ) 67 | return StreamingResponse(stream_resp, media_type=f"audio/{x_audio_format}") 68 | -------------------------------------------------------------------------------- /speech_gateway/gateway/nijivoice_encoded.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | from typing import Dict 5 | from fastapi import APIRouter, Request 6 | from fastapi.responses import StreamingResponse, Response 7 | from . 
import SpeechGateway, UnifiedTTSRequest 8 | from ..cache.file import FileCacheStorage 9 | from ..performance_recorder import PerformanceRecorder, SQLitePerformanceRecorder 10 | from ..source.nijivoice_encoded import NijiVoiceEncodedStreamSource 11 | 12 | 13 | class NijiVoiceEncodedGateway(SpeechGateway): 14 | def __init__(self, *, stream_source: NijiVoiceEncodedStreamSource = None, api_key: str = None, speeds: Dict[str, float] = None, base_url: str = None, cache_dir: str = None, performance_recorder: PerformanceRecorder = None, debug = False): 15 | self.stream_source: NijiVoiceEncodedStreamSource = None 16 | if stream_source: 17 | super().__init__(stream_source=stream_source, debug=debug) 18 | else: 19 | super().__init__( 20 | stream_source=NijiVoiceEncodedStreamSource( 21 | api_key=api_key, 22 | base_url=base_url or "https://api.nijivoice.com", 23 | cache_storage=FileCacheStorage(cache_dir=cache_dir or "nijivoice_encoded_cache"), 24 | format_converters={}, 25 | performance_recorder=performance_recorder or SQLitePerformanceRecorder(), 26 | debug=debug 27 | ), 28 | debug=debug 29 | ) 30 | self.speeds = speeds or {} 31 | 32 | def register_endpoint(self, router: APIRouter): 33 | @router.post("/api/platform/v1/voice-actors/{voice_actor_id}/generate-encoded-voice") 34 | async def get_voice_handler(voice_actor_id: str, request: Request, x_audio_format: str = None): 35 | request_json = await request.json() 36 | 37 | if x_audio_format: 38 | if x_audio_format in ["mp3", "wav"]: 39 | request_json["format"] = x_audio_format 40 | else: 41 | # Set wave to convert to other format later 42 | request_json["format"] = "wav" 43 | else: 44 | x_audio_format = request_json.get("format", "mp3") 45 | 46 | stream_resp = await self.stream_source.fetch_stream( 47 | voice_actor_id=voice_actor_id, 48 | audio_format=x_audio_format, 49 | request_json=request_json, 50 | ) 51 | 52 | json_bytes = b"" 53 | async for chunk in stream_resp: 54 | json_bytes += chunk 55 | 56 | return Response(content=json_bytes, media_type="application/json") 57 | 58 | async def unified_tts_handler(self, request: Request, tts_request: UnifiedTTSRequest, x_audio_format: str = "wav"): 59 | request_json = { 60 | "script": tts_request.text, 61 | "speed": str(tts_request.speed) if tts_request.speed else str(self.speeds.get(tts_request.speaker, "1.0")), 62 | "format": x_audio_format if x_audio_format == "mp3" else "wav" 63 | } 64 | 65 | stream_resp = await self.stream_source.fetch_stream( 66 | voice_actor_id=tts_request.speaker, 67 | audio_format=x_audio_format, 68 | request_json=request_json, 69 | ) 70 | 71 | json_bytes = b"" 72 | async for chunk in stream_resp: 73 | json_bytes += chunk 74 | response_json = json.loads(json_bytes) 75 | base64_audio = response_json["generatedVoice"]["base64Audio"] 76 | audio_bytes = base64.b64decode(base64_audio) 77 | 78 | return StreamingResponse(io.BytesIO(audio_bytes), media_type=f"audio/{x_audio_format}") 79 | -------------------------------------------------------------------------------- /tests/gateway/test_voicevox.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import httpx 4 | 5 | SPEAKER = 46 6 | 7 | 8 | @pytest.mark.asyncio 9 | async def test_voicevox(random_text, wave_checker, audio_transcriber): 10 | audio_query = httpx.post( 11 | "http://127.0.0.1:8000/voicevox/audio_query", 12 | params={"speaker": SPEAKER, "text": random_text} 13 | ).json() 14 | 15 | query_params = { 16 | "speaker": SPEAKER 17 | } 18 | resp = httpx.post( 19 | 
"http://127.0.0.1:8000/voicevox/synthesis", 20 | params=query_params, 21 | json=audio_query 22 | ) 23 | audio_data = resp.content 24 | assert wave_checker(audio_data) 25 | assert "音声合成" in audio_transcriber(audio_data, "wav") 26 | 27 | 28 | @pytest.mark.asyncio 29 | async def test_voicevox_wav(random_text, wave_checker, audio_transcriber): 30 | audio_query = httpx.post( 31 | "http://127.0.0.1:8000/voicevox/audio_query", 32 | params={"speaker": SPEAKER, "text": random_text} 33 | ).json() 34 | 35 | query_params = { 36 | "speaker": SPEAKER, 37 | "x_audio_format": "wav" 38 | } 39 | resp = httpx.post( 40 | "http://127.0.0.1:8000/voicevox/synthesis", 41 | params=query_params, 42 | json=audio_query 43 | ) 44 | audio_data = resp.content 45 | assert wave_checker(audio_data) 46 | assert "音声合成" in audio_transcriber(audio_data, "wav") 47 | 48 | 49 | @pytest.mark.asyncio 50 | async def test_voicevox_mp3(random_text, mp3_checker, audio_transcriber): 51 | audio_query = httpx.post( 52 | "http://127.0.0.1:8000/voicevox/audio_query", 53 | params={"speaker": SPEAKER, "text": random_text} 54 | ).json() 55 | 56 | query_params = { 57 | "speaker": SPEAKER, 58 | "x_audio_format": "mp3" 59 | } 60 | resp = httpx.post( 61 | "http://127.0.0.1:8000/voicevox/synthesis", 62 | params=query_params, 63 | json=audio_query 64 | ) 65 | audio_data = resp.content 66 | assert mp3_checker(audio_data) 67 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 68 | 69 | 70 | @pytest.mark.asyncio 71 | async def test_voicevox_unified(random_text, wave_checker, audio_transcriber): 72 | req = { 73 | "text": random_text, 74 | "speaker": str(SPEAKER), 75 | "service_name": "voicevox" 76 | } 77 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req) 78 | audio_data = resp.content 79 | assert wave_checker(audio_data) 80 | assert "音声合成" in audio_transcriber(audio_data, "wav") 81 | 82 | 83 | @pytest.mark.asyncio 84 | async def test_voicevox_unified_wav(random_text, wave_checker, audio_transcriber): 85 | req = { 86 | "text": random_text, 87 | "speaker": str(SPEAKER), 88 | "service_name": "voicevox" 89 | } 90 | query_params = { 91 | "x_audio_format": "wav" 92 | } 93 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 94 | audio_data = resp.content 95 | assert wave_checker(audio_data) 96 | assert "音声合成" in audio_transcriber(audio_data, "wav") 97 | 98 | 99 | @pytest.mark.asyncio 100 | async def test_voicevox_unified_mp3(random_text, mp3_checker, audio_transcriber): 101 | req = { 102 | "text": random_text, 103 | "speaker": str(SPEAKER), 104 | "service_name": "voicevox" 105 | } 106 | query_params = { 107 | "x_audio_format": "mp3" 108 | } 109 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 110 | audio_data = resp.content 111 | assert mp3_checker(audio_data) 112 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 113 | -------------------------------------------------------------------------------- /tests/gateway/test_unified.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import httpx 4 | from speech_gateway.gateway.voicevox import VoicevoxGateway 5 | from speech_gateway.gateway.nijivoice import NijiVoiceGateway 6 | from speech_gateway.gateway.sbv2 import StyleBertVits2Gateway 7 | from speech_gateway.gateway.openai_speech import OpenAIGateway 8 | from speech_gateway.gateway.unified import UnifiedGateway 9 | from speech_gateway.gateway import UnifiedTTSRequest 10 | 11 | VOICEVOX_URL = os.getenv("VOICEVOX_URL") 12 
| SBV2_URL = os.getenv("SBV2_URL") 13 | NIJIVOICE_API_KEY = os.getenv("NIJIVOICE_API_KEY") 14 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 15 | 16 | 17 | @pytest.mark.asyncio 18 | async def test_unified_gateway_default(): 19 | # Create gateways 20 | voicevox_gateway = VoicevoxGateway(base_url=VOICEVOX_URL, debug=True) 21 | sbv2_gateway = StyleBertVits2Gateway(base_url=SBV2_URL, debug=True) 22 | nijivoice_gateway = NijiVoiceGateway(api_key=NIJIVOICE_API_KEY, prefix="/nijivoice", debug=True) 23 | openai_gateway = OpenAIGateway(api_key=OPENAI_API_KEY, debug=True) 24 | 25 | # Unified gateway 26 | unified_gateway = UnifiedGateway(debug=True) 27 | unified_gateway.add_gateway("voicevox", voicevox_gateway, default_speaker="46", default=True) 28 | unified_gateway.add_gateway("sbv2", sbv2_gateway) 29 | unified_gateway.add_gateway("nijivoice", nijivoice_gateway) 30 | unified_gateway.add_gateway("openai", openai_gateway, languages=["en-US", "zh-CN"], default_speaker="alloy") 31 | 32 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello")) == voicevox_gateway 33 | 34 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", service_name="voicevox")) == voicevox_gateway 35 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", service_name="sbv2")) == sbv2_gateway 36 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", service_name="nijivoice")) == nijivoice_gateway 37 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", service_name="openai")) == openai_gateway 38 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", service_name="dummy")) is None 39 | 40 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", language="ja-JP")) == voicevox_gateway 41 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", language="en-US")) == openai_gateway 42 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", language="zh-CN")) == openai_gateway 43 | 44 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", service_name="sbv2", language="en-US")) == sbv2_gateway 45 | 46 | 47 | 48 | @pytest.mark.asyncio 49 | async def test_voicevox_unified(random_text, wave_checker, audio_transcriber): 50 | req = { 51 | "text": random_text 52 | } 53 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req) 54 | audio_data = resp.content 55 | assert wave_checker(audio_data) 56 | assert "音声合成" in audio_transcriber(audio_data, "wav") 57 | 58 | 59 | @pytest.mark.asyncio 60 | async def test_voicevox_unified_wav(random_text, wave_checker, audio_transcriber): 61 | req = { 62 | "text": random_text 63 | } 64 | query_params = { 65 | "x_audio_format": "wav" 66 | } 67 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 68 | audio_data = resp.content 69 | assert wave_checker(audio_data) 70 | assert "音声合成" in audio_transcriber(audio_data, "wav") 71 | 72 | 73 | @pytest.mark.asyncio 74 | async def test_voicevox_unified_mp3(random_text, mp3_checker, audio_transcriber): 75 | req = { 76 | "text": random_text 77 | } 78 | query_params = { 79 | "x_audio_format": "mp3" 80 | } 81 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 82 | audio_data = resp.content 83 | assert mp3_checker(audio_data) 84 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 85 | -------------------------------------------------------------------------------- /tests/gateway/test_azure.py: -------------------------------------------------------------------------------- 1 | import 
pytest 2 | import httpx 3 | 4 | 5 | @pytest.mark.asyncio 6 | async def test_azure(random_text, wave_checker, audio_transcriber): 7 | ssml_text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='ja-JP'><voice name='zh-CN-XiaoyuMultilingualNeural'>{random_text}</voice></speak>" 8 | resp = httpx.post( 9 | url="http://127.0.0.1:8000/azure/cognitiveservices/v1", 10 | headers={ 11 | "X-Microsoft-OutputFormat": "riff-16khz-16bit-mono-pcm", 12 | "Content-Type": "application/ssml+xml" 13 | }, 14 | data=ssml_text.encode("utf-8") 15 | ) 16 | audio_data = resp.content 17 | assert wave_checker(audio_data) 18 | assert "音声合成" in audio_transcriber(audio_data, "wav") 19 | 20 | 21 | @pytest.mark.asyncio 22 | async def test_azure_wav(random_text, wave_checker, audio_transcriber): 23 | ssml_text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='ja-JP'><voice name='zh-CN-XiaoyuMultilingualNeural'>{random_text}</voice></speak>" 24 | resp = httpx.post( 25 | url="http://127.0.0.1:8000/azure/cognitiveservices/v1", 26 | headers={ 27 | "X-Microsoft-OutputFormat": "audio-16khz-32kbitrate-mono-mp3", # <- set mp3 to header 28 | "Content-Type": "application/ssml+xml" 29 | }, 30 | params={"x_audio_format": "wav"}, # <- overwrite format to wav 31 | data=ssml_text.encode("utf-8") 32 | ) 33 | audio_data = resp.content 34 | assert wave_checker(audio_data) 35 | assert "音声合成" in audio_transcriber(audio_data, "wav") 36 | 37 | 38 | @pytest.mark.asyncio 39 | async def test_azure_mp3(random_text, mp3_checker, audio_transcriber): 40 | ssml_text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='ja-JP'><voice name='zh-CN-XiaoyuMultilingualNeural'>{random_text}</voice></speak>" 41 | resp = httpx.post( 42 | url="http://127.0.0.1:8000/azure/cognitiveservices/v1", 43 | headers={ 44 | "X-Microsoft-OutputFormat": "riff-16khz-16bit-mono-pcm", # <- set wav to header 45 | "Content-Type": "application/ssml+xml" 46 | }, 47 | params={"x_audio_format": "mp3"}, # <- overwrite format to mp3 48 | data=ssml_text.encode("utf-8") 49 | ) 50 | audio_data = resp.content 51 | assert mp3_checker(audio_data) 52 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 53 | 54 | 55 | @pytest.mark.asyncio 56 | async def test_azure_unified(random_text, wave_checker, audio_transcriber): 57 | req = { 58 | "text": random_text, 59 | "speaker": "zh-CN-XiaoyuMultilingualNeural", 60 | "service_name": "azure" 61 | } 62 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req) 63 | audio_data = resp.content 64 | assert wave_checker(audio_data) 65 | assert "音声合成" in audio_transcriber(audio_data, "wav") 66 | 67 | 68 | @pytest.mark.asyncio 69 | async def test_azure_unified_wav(random_text, wave_checker, audio_transcriber): 70 | req = { 71 | "text": random_text, 72 | "speaker": "zh-CN-XiaoyuMultilingualNeural", 73 | "service_name": "azure" 74 | } 75 | query_params = { 76 | "x_audio_format": "wav" 77 | } 78 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 79 | audio_data = resp.content 80 | assert wave_checker(audio_data) 81 | assert "音声合成" in audio_transcriber(audio_data, "wav") 82 | 83 | 84 | @pytest.mark.asyncio 85 | async def test_azure_unified_mp3(random_text, mp3_checker, audio_transcriber): 86 | req = { 87 | "text": random_text, 88 | "speaker": "zh-CN-XiaoyuMultilingualNeural", 89 | "service_name": "azure" 90 | } 91 | query_params = { 92 | "x_audio_format": "mp3" 93 | } 94 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 95 | audio_data = resp.content 96 | assert mp3_checker(audio_data) 97 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 98 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 
9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | voicevox_cache/ 163 | sbv2_cache/ 164 | nijivoice_cache/ 165 | openai_cache/ 166 | example.py 167 | testrun.py 168 | client.py 169 | pytest.ini 170 | performance.db 171 | -------------------------------------------------------------------------------- /tests/performance_recorder/test_sqlite.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import sqlite3 3 | import threading 4 | from time import sleep 5 | from speech_gateway.performance_recorder.sqlite import SQLitePerformanceRecorder 6 | 7 | 8 | @pytest.fixture 9 | def sqlite_recorder(tmp_path): 10 | """ 11 | Create a new database file in a temporary directory for each test. 12 | After the test finishes, call close() to release resources. 13 | """ 14 | db_path = tmp_path / "test_performance.db" 15 | recorder = SQLitePerformanceRecorder(str(db_path)) 16 | yield recorder 17 | # Ensure that we close the recorder after the test to release all resources 18 | recorder.close() 19 | 20 | 21 | def test_single_thread_record_and_close(sqlite_recorder): 22 | """ 23 | Verify that the record -> close flow finishes without deadlocks 24 | in a single-thread scenario, and confirm the correct number of rows is inserted. 25 | Also, check that the 'id' field is auto-incrementing correctly. 26 | """ 27 | # Insert 5 records 28 | for i in range(5): 29 | sqlite_recorder.record( 30 | process_id=f"process_{i}", 31 | source="test_source", 32 | text=f"test_text_{i}", 33 | audio_format="wav", 34 | cached=0, 35 | elapsed=0.01 * i, 36 | ) 37 | 38 | # Although close() will be called by the fixture teardown, 39 | # here we explicitly call it for clarity. 
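# (close() sets the stop event, waits for the queue to drain, and joins the worker thread, so every queued row is flushed before the assertions below read the database.)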
40 | sqlite_recorder.close() 41 | 42 | # Directly open the database to check how many records were inserted 43 | conn = sqlite3.connect(sqlite_recorder.db_path) 44 | try: 45 | cursor = conn.cursor() 46 | cursor.execute("SELECT COUNT(*) FROM performance_records;") 47 | count = cursor.fetchone()[0] 48 | assert count == 5, f"Expected 5 records, got {count}" 49 | 50 | # Retrieve all IDs in ascending order 51 | cursor.execute("SELECT id FROM performance_records ORDER BY id;") 52 | ids = [row[0] for row in cursor.fetchall()] 53 | 54 | # Confirm we have 5 IDs 55 | assert len(ids) == 5, f"Expected 5 IDs, got {len(ids)}" 56 | 57 | # Check they are strictly increasing by 1 58 | for i in range(1, len(ids)): 59 | assert ids[i] == ids[i - 1] + 1, "IDs are not incrementing as expected" 60 | finally: 61 | conn.close() 62 | 63 | 64 | def test_multi_thread_record_no_deadlock(sqlite_recorder): 65 | """ 66 | Verify that concurrent calls to record() do not cause deadlocks 67 | and that data is correctly committed to the database. 68 | """ 69 | NUM_THREADS = 5 70 | RECORDS_PER_THREAD = 100 71 | 72 | def worker(thread_id: int): 73 | for i in range(RECORDS_PER_THREAD): 74 | sqlite_recorder.record( 75 | process_id=f"thread_{thread_id}_process_{i}", 76 | source="test_source", 77 | text=f"test_text_{i}", 78 | audio_format="wav", 79 | cached=1, 80 | elapsed=0.1 * i, 81 | ) 82 | # Sleep a bit to make concurrency testing more likely to expose issues 83 | sleep(0.001) 84 | 85 | threads = [] 86 | for t_id in range(NUM_THREADS): 87 | t = threading.Thread(target=worker, args=(t_id,)) 88 | t.start() 89 | threads.append(t) 90 | 91 | # Wait for all threads to complete 92 | for t in threads: 93 | t.join() 94 | 95 | # Close the recorder to ensure the queue is fully processed 96 | sqlite_recorder.close() 97 | 98 | # Check that all records were indeed written to the database 99 | total_expected = NUM_THREADS * RECORDS_PER_THREAD 100 | conn = sqlite3.connect(sqlite_recorder.db_path) 101 | try: 102 | cursor = conn.cursor() 103 | cursor.execute("SELECT COUNT(*) FROM performance_records;") 104 | count = cursor.fetchone()[0] 105 | assert count == total_expected, f"Expected {total_expected} records, got {count}" 106 | finally: 107 | conn.close() 108 | -------------------------------------------------------------------------------- /tests/performance_recorder/test_postgres.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import threading 4 | from time import sleep 5 | from speech_gateway.performance_recorder.postgres import PostgreSQLPerformanceRecorder 6 | 7 | POSTGRES_USER = os.getenv("POSTGRES_USER") 8 | POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD") 9 | POSTGRES_DBNAME = os.getenv("POSTGRES_DBNAME") 10 | 11 | 12 | @pytest.fixture 13 | def postgres_recorder(tmp_path): 14 | recorder = PostgreSQLPerformanceRecorder( 15 | dbname=POSTGRES_DBNAME, 16 | user=POSTGRES_USER, 17 | password=POSTGRES_PASSWORD 18 | ) 19 | yield recorder 20 | conn = recorder.connect_db() 21 | cursor = conn.cursor() 22 | cursor.execute("TRUNCATE TABLE performance_records;") 23 | conn.commit() 24 | conn.close() 25 | recorder.close() 26 | 27 | def test_single_thread_record_and_close(postgres_recorder): 28 | """ 29 | Verify that the record -> close flow finishes without deadlocks 30 | in a single-thread scenario, and confirm the correct number of rows is inserted. 31 | Also, check that the 'id' field is auto-incrementing correctly. 
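Note that record() only enqueues the row; the background worker thread performs the actual INSERT, so close() must run before the table is inspected.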
32 | """ 33 | # Insert 5 records 34 | for i in range(5): 35 | postgres_recorder.record( 36 | process_id=f"process_{i}", 37 | source="test_source", 38 | text=f"test_text_{i}", 39 | audio_format="wav", 40 | cached=0, 41 | elapsed=0.01 * i, 42 | ) 43 | 44 | # Although close() will be called by the fixture teardown, 45 | # here we explicitly call it for clarity. 46 | postgres_recorder.close() 47 | 48 | # Directly open the database to check how many records were inserted 49 | conn = postgres_recorder.connect_db() 50 | try: 51 | cursor = conn.cursor() 52 | cursor.execute("SELECT COUNT(*) FROM performance_records;") 53 | count = cursor.fetchone()[0] 54 | assert count == 5, f"Expected 5 records, got {count}" 55 | 56 | # Retrieve all IDs in ascending order 57 | cursor.execute("SELECT id FROM performance_records ORDER BY id;") 58 | ids = [row[0] for row in cursor.fetchall()] 59 | 60 | # Confirm we have 5 IDs 61 | assert len(ids) == 5, f"Expected 5 IDs, got {len(ids)}" 62 | 63 | # Check they are strictly increasing by 1 64 | for i in range(1, len(ids)): 65 | assert ids[i] == ids[i - 1] + 1, "IDs are not incrementing as expected" 66 | finally: 67 | conn.close() 68 | 69 | 70 | def test_multi_thread_record_no_deadlock(postgres_recorder): 71 | """ 72 | Verify that concurrent calls to record() do not cause deadlocks 73 | and that data is correctly committed to the database. 74 | """ 75 | NUM_THREADS = 5 76 | RECORDS_PER_THREAD = 100 77 | 78 | def worker(thread_id: int): 79 | for i in range(RECORDS_PER_THREAD): 80 | postgres_recorder.record( 81 | process_id=f"thread_{thread_id}_process_{i}", 82 | source="test_source", 83 | text=f"test_text_{i}", 84 | audio_format="wav", 85 | cached=1, 86 | elapsed=0.1 * i, 87 | ) 88 | # Sleep a bit to make concurrency testing more likely to expose issues 89 | sleep(0.001) 90 | 91 | threads = [] 92 | for t_id in range(NUM_THREADS): 93 | t = threading.Thread(target=worker, args=(t_id,)) 94 | t.start() 95 | threads.append(t) 96 | 97 | # Wait for all threads to complete 98 | for t in threads: 99 | t.join() 100 | 101 | # Close the recorder to ensure the queue is fully processed 102 | postgres_recorder.close() 103 | 104 | # Check that all records were indeed written to the database 105 | total_expected = NUM_THREADS * RECORDS_PER_THREAD 106 | conn = postgres_recorder.connect_db() 107 | try: 108 | cursor = conn.cursor() 109 | cursor.execute("SELECT COUNT(*) FROM performance_records;") 110 | count = cursor.fetchone()[0] 111 | assert count == total_expected, f"Expected {total_expected} records, got {count}" 112 | finally: 113 | conn.close() 114 | -------------------------------------------------------------------------------- /speech_gateway/gateway/nijivoice.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from fastapi import APIRouter, Request 3 | from fastapi.responses import StreamingResponse, JSONResponse 4 | from . 
import SpeechGateway, UnifiedTTSRequest 5 | from ..cache.file import FileCacheStorage 6 | from ..performance_recorder import PerformanceRecorder, SQLitePerformanceRecorder 7 | from ..source.nijivoice import NijiVoiceStreamSource 8 | 9 | 10 | class NijiVoiceGateway(SpeechGateway): 11 | def __init__(self, *, stream_source: NijiVoiceStreamSource = None, api_key: str = None, speeds: Dict[str, float] = None, base_url: str = None, prefix: str = None, cache_dir: str = None, performance_recorder: PerformanceRecorder = None, debug = False): 12 | self.stream_source: NijiVoiceStreamSource = None 13 | if stream_source: 14 | super().__init__(stream_source=stream_source, debug=debug) 15 | else: 16 | super().__init__( 17 | stream_source=NijiVoiceStreamSource( 18 | api_key=api_key, 19 | base_url=base_url or "https://api.nijivoice.com", 20 | cache_storage=FileCacheStorage(cache_dir=cache_dir or "nijivoice_cache"), 21 | format_converters={}, 22 | performance_recorder=performance_recorder or SQLitePerformanceRecorder(), 23 | debug=debug 24 | ), 25 | debug=debug 26 | ) 27 | self.speeds = speeds or {} 28 | self.prefix = prefix 29 | 30 | def register_endpoint(self, router: APIRouter): 31 | @router.post("/api/platform/v1/voice-actors/{voice_actor_id}/generate-voice") 32 | async def generate_voice_handler(voice_actor_id: str, request: Request, x_audio_format: str = None): 33 | request_json = await request.json() 34 | 35 | if x_audio_format: 36 | if x_audio_format in ["mp3", "wav"]: 37 | request_json["format"] = x_audio_format 38 | else: 39 | # Set wave to convert to other format later 40 | request_json["format"] = "wav" 41 | else: 42 | x_audio_format = request_json.get("format", "mp3") 43 | 44 | gateway_base_url = f"{request.base_url.scheme}://{request.base_url.netloc}{self.prefix}" 45 | resp_json = await self.stream_source.generate_voice( 46 | voice_actor_id, 47 | request_json, 48 | gateway_base_url, 49 | x_audio_format 50 | ) 51 | 52 | return JSONResponse(resp_json) 53 | 54 | @router.get("/api/platform/v1/voice-actors/{voice_actor_id}/get-voice") 55 | async def get_voice_handler(voice_actor_id: str, x_audio_format: str, url: str = None, download: str = None, cache_key: str = None): 56 | nijivoice_resp = await self.stream_source.fetch_stream( 57 | voice_actor_id=voice_actor_id, 58 | url=url, 59 | download=download, 60 | cache_key=cache_key, 61 | audio_format=x_audio_format 62 | ) 63 | return StreamingResponse(nijivoice_resp, media_type=f"audio/{x_audio_format}") 64 | 65 | async def unified_tts_handler(self, request: Request, tts_request: UnifiedTTSRequest, x_audio_format: str = "wav"): 66 | gateway_base_url = f"{request.base_url.scheme}://{request.base_url.netloc}{self.prefix}" 67 | 68 | payload = { 69 | "script": tts_request.text, 70 | "speed": str(tts_request.speed) if tts_request.speed else str(self.speeds.get(tts_request.speaker, "1.0")), 71 | "format": x_audio_format if x_audio_format == "mp3" else "wav" 72 | } 73 | 74 | resp_json = await self.stream_source.generate_voice(tts_request.speaker, payload, gateway_base_url, x_audio_format, overwrite_download_urls=False) 75 | 76 | nijivoice_resp = await self.stream_source.fetch_stream( 77 | voice_actor_id=tts_request.speaker, 78 | url=resp_json["generatedVoice"]["audioFileUrl"], 79 | download=False, 80 | cache_key=self.stream_source.get_cache_key(x_audio_format, tts_request.speaker, payload), 81 | audio_format=x_audio_format 82 | ) 83 | 84 | return StreamingResponse(nijivoice_resp, media_type=f"audio/{x_audio_format}") 85 | 
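86 | # Usage sketch (illustrative; it mirrors tests/gateway/test_unified.py, and the app variable, API key placeholder, and mount prefix are assumptions, not part of this module): 87 | # from fastapi import FastAPI 88 | # app = FastAPI() 89 | # gateway = NijiVoiceGateway(api_key="YOUR_NIJIVOICE_API_KEY", prefix="/nijivoice") 90 | # app.include_router(gateway.get_router(), prefix="/nijivoice") 91 | # The prefix argument must match the mount prefix so that the audioFileUrl values rewritten by generate_voice point back to this gateway's get-voice endpoint.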
-------------------------------------------------------------------------------- /speech_gateway/performance_recorder/postgres.py: -------------------------------------------------------------------------------- 1 | from dataclasses import fields 2 | from datetime import datetime, timezone 3 | import logging 4 | import queue 5 | import threading 6 | import time 7 | import psycopg2 8 | from . import PerformanceRecorder, PerformanceRecord 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class PostgreSQLPerformanceRecorder(PerformanceRecorder): 14 | def __init__( 15 | self, 16 | *, 17 | host: str = "localhost", 18 | port: int = 5432, 19 | dbname: str = "speech_gateway", 20 | user: str = "postgres", 21 | password: str = None, 22 | ): 23 | self.connection_params = { 24 | "host": host, 25 | "port": port, 26 | "dbname": dbname, 27 | "user": user, 28 | "password": password, 29 | } 30 | self.record_queue = queue.Queue() 31 | self.stop_event = threading.Event() 32 | 33 | self.init_db() 34 | 35 | self.worker_thread = threading.Thread(target=self.start_worker, daemon=True) 36 | self.worker_thread.start() 37 | 38 | def connect_db(self): 39 | return psycopg2.connect(**self.connection_params) 40 | 41 | def init_db(self): 42 | conn = self.connect_db() 43 | try: 44 | with conn: 45 | with conn.cursor() as cur: 46 | cur.execute( 47 | """ 48 | CREATE TABLE IF NOT EXISTS performance_records ( 49 | id SERIAL PRIMARY KEY, 50 | process_id TEXT NOT NULL, 51 | created_at TIMESTAMPTZ NOT NULL, 52 | source TEXT, 53 | text TEXT, 54 | audio_format TEXT, 55 | cached INTEGER, 56 | elapsed REAL 57 | ) 58 | """ 59 | ) 60 | finally: 61 | conn.close() 62 | 63 | def start_worker(self): 64 | conn = self.connect_db() 65 | try: 66 | while not self.stop_event.is_set() or not self.record_queue.empty(): 67 | try: 68 | record = self.record_queue.get(timeout=0.5) 69 | except queue.Empty: 70 | continue 71 | 72 | try: 73 | self.insert_record(conn, record) 74 | except (psycopg2.InterfaceError, psycopg2.OperationalError): 75 | try: 76 | conn.close() 77 | except Exception: 78 | pass 79 | 80 | logger.warning("Connection is not available. 
Retrying insert_record with new connection...") 81 | time.sleep(0.5) 82 | conn = self.connect_db() 83 | self.insert_record(conn, record) 84 | 85 | self.record_queue.task_done() 86 | finally: 87 | try: 88 | conn.close() 89 | except Exception: 90 | pass 91 | 92 | 93 | def insert_record(self, conn, record: PerformanceRecord): 94 | columns = [field.name for field in fields(PerformanceRecord)] + ["created_at"] 95 | placeholders = ["%s"] * len(columns) 96 | values = [getattr(record, field.name) for field in fields(PerformanceRecord)] + [ 97 | datetime.now(timezone.utc) 98 | ] 99 | 100 | with conn.cursor() as cur: 101 | cur.execute( 102 | f"INSERT INTO performance_records ({', '.join(columns)}) VALUES ({', '.join(placeholders)})", 103 | values, 104 | ) 105 | conn.commit() 106 | 107 | def record( 108 | self, 109 | *, 110 | process_id: str, 111 | source: str = None, 112 | text: str = None, 113 | audio_format: str = None, 114 | cached: int = 0, 115 | elapsed: float = None, 116 | ): 117 | performance_record = PerformanceRecord( 118 | process_id=process_id, 119 | source=source, 120 | text=text, 121 | audio_format=audio_format, 122 | cached=cached, 123 | elapsed=elapsed, 124 | ) 125 | self.record_queue.put(performance_record) 126 | 127 | def close(self): 128 | self.stop_event.set() 129 | self.record_queue.join() 130 | self.worker_thread.join() 131 | -------------------------------------------------------------------------------- /speech_gateway/source/nijivoice.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | from typing import AsyncIterator, Dict 3 | import urllib.parse 4 | import httpx 5 | from . import StreamSource, StreamSourceError 6 | from ..cache import CacheStorage 7 | from ..cache.file import FileCacheStorage 8 | from ..converter import FormatConverter 9 | from ..performance_recorder import PerformanceRecorder 10 | 11 | 12 | class NijiVoiceStreamSource(StreamSource): 13 | def __init__(self, 14 | *, 15 | api_key: str = None, 16 | base_url: str = "https://api.nijivoice.com", 17 | cache_storage: CacheStorage = None, 18 | format_converters: Dict[str, FormatConverter] = None, 19 | max_connections: int = 100, 20 | max_keepalive_connections: int = 20, 21 | timeout: float = 10.0, 22 | performance_recorder: PerformanceRecorder = None, 23 | debug: bool = False 24 | ): 25 | super().__init__( 26 | base_url=base_url, 27 | cache_storage=cache_storage or FileCacheStorage(cache_dir="nijivoice_cache"), 28 | format_converters=format_converters, 29 | max_connections=max_connections, 30 | max_keepalive_connections=max_keepalive_connections, 31 | timeout=timeout, 32 | performance_recorder=performance_recorder, 33 | debug=debug 34 | ) 35 | self.base_url = base_url 36 | self.api_key = api_key 37 | 38 | def get_cache_key(self, audio_format: str, voice_actor_id: str = None, payload: dict = None, cache_key: str = None, **kwargs) -> str: 39 | if cache_key: 40 | return cache_key 41 | 42 | return f"{voice_actor_id}_{hash(str(payload))}.{audio_format or 'mp3'}" 43 | 44 | def parse_text(self, **kwargs) -> str: 45 | return None 46 | 47 | def make_stream_request(self, url: str, **kwargs): 48 | return { 49 | "method": "GET", 50 | "url": url, 51 | } 52 | 53 | async def generate_voice(self, voice_actor_id: str, payload: dict, gateway_base_url: str, x_audio_format: str = "mp3", overwrite_download_urls: bool = True): 54 | start_time = time() 55 | cache_key = self.get_cache_key(x_audio_format, voice_actor_id, payload) 56 | use_cache = self.cache_storage and await 
self.cache_storage.has_cache(cache_key) 57 | 58 | # Return cache info if cached 59 | if use_cache: 60 | gateway_voice_url = f"{gateway_base_url}/api/platform/v1/voice-actors/{voice_actor_id}/get-voice?cache_key={cache_key}&x_audio_format={x_audio_format}" 61 | data = {"generatedVoice": { 62 | "audioFileUrl": gateway_voice_url, 63 | "audioFileDownloadUrl": gateway_voice_url + "&download=true" 64 | }} 65 | 66 | else: 67 | try: 68 | # Generate voice 69 | url = f"{self.base_url}/api/platform/v1/voice-actors/{voice_actor_id}/generate-voice" 70 | headers = { 71 | "x-api-key": self.api_key, 72 | "content-type": "application/json" 73 | } 74 | url_resp = await self.http_client.post(url, headers=headers, json=payload) 75 | if url_resp.status_code != 200: 76 | raise StreamSourceError(f"NijiVoice generate voice failed: {url_resp.status_code}") 77 | 78 | # Get voice URL 79 | data = url_resp.json() 80 | audio_file_url = data.get("generatedVoice", {}).get("audioFileUrl") 81 | encoded_audio_file_url = urllib.parse.quote(audio_file_url, safe='') 82 | 83 | # Overwrite URLs 84 | if overwrite_download_urls: 85 | gateway_voice_url = ( 86 | f"{gateway_base_url}/api/platform/v1/voice-actors/{voice_actor_id}/get-voice" 87 | f"?url={encoded_audio_file_url}&cache_key={cache_key}&x_audio_format={x_audio_format}" 88 | ) 89 | data["generatedVoice"]["audioFileUrl"] = gateway_voice_url 90 | data["generatedVoice"]["audioFileDownloadUrl"] = gateway_voice_url + "&download=true" 91 | 92 | except httpx.RequestError as ex: 93 | raise StreamSourceError(f"HTTP request failed: {ex}") from ex 94 | 95 | # Performance record 96 | if self.performance_recorder: 97 | self.performance_recorder.record( 98 | process_id=cache_key, source=self.__class__.__name__, text=payload.get("script"), 99 | audio_format=x_audio_format, cached=use_cache, elapsed=time() - start_time 100 | ) 101 | 102 | return data 103 | -------------------------------------------------------------------------------- /tests/source/test_nijivoice_source.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import httpx 4 | from speech_gateway.source.nijivoice import NijiVoiceStreamSource 5 | from speech_gateway.source import StreamSourceError 6 | 7 | BASE_URL = "https://api.nijivoice.com" 8 | GATEWAY_BASE_URL = "http://127.0.0.1:8000/nijivoice" 9 | NIJIVOICE_API_KEY = os.getenv("NIJIVOICE_API_KEY") 10 | VOICE_ACTOR_ID = "a192db5f-bd8b-4fc7-bc08-af5ca5957c12" 11 | PAYLOAD = { 12 | "script": "こんにちは。これはテストです。", 13 | "speed": "1.0", 14 | "emotionalLevel": "0.1", 15 | "soundDuration": "0.1", 16 | "format": "mp3", 17 | } 18 | 19 | @pytest.fixture 20 | def source(): 21 | # Create an instance of NijiVoiceStreamSource 22 | return NijiVoiceStreamSource(base_url=BASE_URL, api_key=NIJIVOICE_API_KEY, debug=True) 23 | 24 | @pytest.mark.asyncio 25 | async def test_get_cache_key(source): 26 | # Test get_cache_key method 27 | cache_key = source.get_cache_key("mp3", VOICE_ACTOR_ID, PAYLOAD) 28 | assert cache_key.endswith(".mp3") 29 | assert VOICE_ACTOR_ID in cache_key 30 | 31 | cache_key = source.get_cache_key("wav", VOICE_ACTOR_ID, PAYLOAD) 32 | assert cache_key.endswith(".wav") 33 | assert VOICE_ACTOR_ID in cache_key 34 | 35 | @pytest.mark.asyncio 36 | async def test_parse_text(source): 37 | # Test parse_text method 38 | text = source.parse_text(payload=PAYLOAD) 39 | assert text is None # Since parse_text returns None in the current implementation 40 | 41 | @pytest.mark.asyncio 42 | async def 
test_make_stream_request(source): 43 | # Test make_stream_request method 44 | url = f"{BASE_URL}/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice" 45 | request = source.make_stream_request(url=url) 46 | assert request["method"] == "GET" 47 | assert request["url"] == url 48 | 49 | @pytest.mark.asyncio 50 | async def test_generate_voice_cached(source): 51 | # Test generate_voice method with cache 52 | cache_key = source.get_cache_key("mp3", VOICE_ACTOR_ID, PAYLOAD) 53 | 54 | # Create a dummy async generator for cached data 55 | async def dummy_cache_data(): 56 | yield b"cached data" 57 | 58 | # Write a dummy cache 59 | async for _ in source.cache_storage.write_cache(dummy_cache_data(), cache_key): 60 | pass # Consume the generator to simulate writing cache 61 | 62 | # Call generate_voice and verify it uses cache 63 | response = await source.generate_voice(VOICE_ACTOR_ID, PAYLOAD, GATEWAY_BASE_URL) 64 | assert "generatedVoice" in response 65 | assert response["generatedVoice"]["audioFileUrl"].startswith(GATEWAY_BASE_URL) 66 | 67 | @pytest.mark.asyncio 68 | async def test_generate_voice_fresh(source): 69 | # Test generate_voice method without cache (actual API call) 70 | try: 71 | response = await source.generate_voice(VOICE_ACTOR_ID, PAYLOAD, GATEWAY_BASE_URL) 72 | assert "generatedVoice" in response 73 | assert response["generatedVoice"]["audioFileUrl"].startswith(GATEWAY_BASE_URL) 74 | except Exception as e: 75 | pytest.fail(f"generate_voice failed: {e}") 76 | 77 | @pytest.mark.asyncio 78 | async def test_generate_voice_error(source): 79 | # Test generate_voice method with invalid payload 80 | invalid_payload = PAYLOAD.copy() 81 | invalid_payload["script"] = "" # Invalid script 82 | 83 | with pytest.raises(StreamSourceError): 84 | await source.generate_voice(VOICE_ACTOR_ID, invalid_payload, GATEWAY_BASE_URL) 85 | 86 | @pytest.mark.asyncio 87 | async def test_fetch_stream_raw(source): 88 | # Test fetch_stream_raw method (actual API call) 89 | url_resp = httpx.post( 90 | f"{GATEWAY_BASE_URL}/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice", 91 | json={"script": "こんにちは。これはテストです。", "speed": "1.0"} 92 | ) 93 | 94 | assert url_resp.status_code == 200 95 | url = url_resp.json()["generatedVoice"]["audioFileUrl"] 96 | assert GATEWAY_BASE_URL in url 97 | 98 | http_request = { 99 | "method": "GET", 100 | "url": url, 101 | } 102 | 103 | try: 104 | async for chunk in source.fetch_stream_raw(http_request): 105 | assert isinstance(chunk, bytes) 106 | except Exception as e: 107 | pytest.fail(f"fetch_stream_raw failed: {e}") 108 | 109 | @pytest.mark.asyncio 110 | async def test_fetch_stream(source): 111 | # Test fetch_stream method with a full pipeline 112 | try: 113 | async for chunk in await source.fetch_stream( 114 | audio_format="mp3", 115 | voice_actor_id=VOICE_ACTOR_ID, 116 | payload=PAYLOAD, 117 | gateway_base_url=GATEWAY_BASE_URL, 118 | ): 119 | assert isinstance(chunk, bytes) 120 | except Exception as e: 121 | pytest.fail(f"fetch_stream failed: {e}") 122 | -------------------------------------------------------------------------------- /speech_gateway/gateway/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import logging 3 | from fastapi import Request, APIRouter, HTTPException 4 | from fastapi.responses import Response 5 | from pydantic import BaseModel, Field 6 | import httpx 7 | from ..source import StreamSource 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class 
UnifiedTTSRequest(BaseModel): 13 | text: str = Field(..., description="The text to be synthesized into speech.", example="hello") 14 | speaker: str = Field( 15 | None, 16 | description="The unique identifier for the voice in each speech service. " 17 | "For Style-Bert-VITS2, specify as `{model_id}-{speaker_id}`. " 18 | "If omitted, the default speaker of the speech service will be used.", 19 | example="888753761" 20 | ) 21 | style: str = Field( 22 | None, 23 | description="A predefined set of voice styles that includes `neutral`, `joy`, `angry`, `sorrow`, `fun`, and `surprised`. " 24 | "These styles act as presets and must be mapped appropriately to the corresponding style identifiers in each speech service. " 25 | "If omitted, no style will be applied.", 26 | example="neutral" 27 | ) 28 | speed: float = Field( 29 | None, 30 | description="The speed of synthesized speech, where 1.0 is normal speed. " 31 | "Values greater than 1.0 increase the speed (e.g., 1.5 is 50% faster), " 32 | "and values less than 1.0 decrease the speed (e.g., 0.5 is 50% slower). " 33 | "The acceptable range depends on each speech service.", 34 | example=1.0 35 | ) 36 | service_name: str = Field( 37 | None, 38 | description="The name of the service as specified in `add_gateway`. " 39 | "If omitted, the default gateway will be used.", 40 | example="aivisspeech", 41 | ) 42 | language: str = Field( 43 | None, 44 | description="The language of the text. The corresponding text-to-speech service will be used. " 45 | "Specify the language code in ISO 639-1 format combined with the country code using a hyphen. " 46 | "If omitted, the default gateway will be used.", 47 | example="en-US", 48 | ) 49 | 50 | 51 | class SpeechGateway(ABC): 52 | HOP_BY_HOP_HEADERS = { 53 | "connection", 54 | "keep-alive", 55 | "proxy-authenticate", 56 | "proxy-authorization", 57 | "te", 58 | "trailers", 59 | "transfer-encoding", 60 | "upgrade", 61 | } 62 | 63 | def __init__( 64 | self, 65 | *, 66 | stream_source: StreamSource = None, 67 | debug: bool = False 68 | ): 69 | self.stream_source = stream_source 70 | self.debug = debug 71 | 72 | def filter_headers(self, headers: httpx.Headers) -> dict: 73 | filtered = {} 74 | for k, v in headers.items(): 75 | if k.lower() not in self.HOP_BY_HOP_HEADERS: 76 | filtered[k] = v 77 | return filtered 78 | 79 | @abstractmethod 80 | def register_endpoint(self, router: APIRouter): 81 | pass 82 | 83 | async def passthrough_handler(self, request: Request, path: str): 84 | url = f"{self.stream_source.base_url}/{path}" 85 | if request.query_params: 86 | url += f"?{request.query_params}" 87 | 88 | headers = dict(request.headers) 89 | headers.pop("host", None) 90 | body = await request.body() 91 | 92 | r = await self.stream_source.http_client.request( 93 | request.method, 94 | url, 95 | headers=headers, 96 | content=body 97 | ) 98 | 99 | resp_headers = self.filter_headers(r.headers) 100 | 101 | if self.debug: 102 | logger.info(f"Proxy: {request.method} /{path} -> {r.status_code}") 103 | 104 | return Response(content=r.content, status_code=r.status_code, headers=resp_headers) 105 | 106 | async def unified_tts_handler(self, request: Request, tts_request: UnifiedTTSRequest, x_audio_format: str = "wav"): 107 | raise HTTPException(status_code=400, detail=f"This speech service doesn't support the unified interface for now: {self.__class__.__name__}") 108 | 109 | def get_router(self) -> APIRouter: 110 | router = APIRouter() 111 | self.register_endpoint(router) 112 | router.add_api_route( 113 | "/{path:path}", 114 | self.passthrough_handler, 115 
| methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD"], 116 | include_in_schema=False 117 | ) 118 | 119 | return router 120 | 121 | async def shutdown(self): 122 | await self.stream_source.close() 123 | -------------------------------------------------------------------------------- /tests/source/test_voicevox_source.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from speech_gateway.source.voicevox import VoicevoxStreamSource 4 | 5 | VOICEVOX_URL = os.getenv("VOICEVOX_URL") 6 | SPEAKER = "2" 7 | 8 | @pytest.fixture 9 | def source(): 10 | # Create an instance of VoicevoxStreamSource 11 | return VoicevoxStreamSource(base_url=VOICEVOX_URL) 12 | 13 | @pytest.fixture 14 | def audio_query(): 15 | # Provide the audio_query data 16 | return { 17 | "accent_phrases": [ 18 | { 19 | "moras": [ 20 | {"text": "コ", "consonant": "k", "consonant_length": 0, "vowel": "o", "vowel_length": 0, "pitch": 0}, 21 | {"text": "ン", "consonant": None, "consonant_length": None, "vowel": "N", "vowel_length": 0, "pitch": 0}, 22 | {"text": "ニ", "consonant": "n", "consonant_length": 0, "vowel": "i", "vowel_length": 0, "pitch": 0}, 23 | {"text": "チ", "consonant": "ch", "consonant_length": 0, "vowel": "i", "vowel_length": 0, "pitch": 0}, 24 | {"text": "ワ", "consonant": "w", "consonant_length": 0, "vowel": "a", "vowel_length": 0, "pitch": 0}, 25 | {"text": ".", "consonant": None, "consonant_length": None, "vowel": "pau", "vowel_length": 0, "pitch": 0} 26 | ], 27 | "accent": 5, 28 | "pause_mora": None, 29 | "is_interrogative": False 30 | }, 31 | { 32 | "moras": [ 33 | {"text": "コ", "consonant": "k", "consonant_length": 0, "vowel": "o", "vowel_length": 0, "pitch": 0}, 34 | {"text": "レ", "consonant": "r", "consonant_length": 0, "vowel": "e", "vowel_length": 0, "pitch": 0}, 35 | {"text": "ワ", "consonant": "w", "consonant_length": 0, "vowel": "a", "vowel_length": 0, "pitch": 0}, 36 | {"text": "テ", "consonant": "t", "consonant_length": 0, "vowel": "e", "vowel_length": 0, "pitch": 0}, 37 | {"text": "ス", "consonant": "s", "consonant_length": 0, "vowel": "u", "vowel_length": 0, "pitch": 0}, 38 | {"text": "ト", "consonant": "t", "consonant_length": 0, "vowel": "o", "vowel_length": 0, "pitch": 0}, 39 | {"text": "デ", "consonant": "d", "consonant_length": 0, "vowel": "e", "vowel_length": 0, "pitch": 0}, 40 | {"text": "ス", "consonant": "s", "consonant_length": 0, "vowel": "u", "vowel_length": 0, "pitch": 0}, 41 | {"text": ".", "consonant": None, "consonant_length": None, "vowel": "pau", "vowel_length": 0, "pitch": 0} 42 | ], 43 | "accent": 4, 44 | "pause_mora": None, 45 | "is_interrogative": False 46 | } 47 | ], 48 | "speedScale": 1, 49 | "intonationScale": 1, 50 | "tempoDynamicsScale": 1, 51 | "pitchScale": 0, 52 | "volumeScale": 1, 53 | "prePhonemeLength": 0.1, 54 | "postPhonemeLength": 0.1, 55 | "pauseLength": None, 56 | "pauseLengthScale": 1, 57 | "outputSamplingRate": 44100, 58 | "outputStereo": False, 59 | "kana": "こんにちは。これはテストです。" 60 | } 61 | 62 | @pytest.mark.asyncio 63 | async def test_get_cache_key(source, audio_query): 64 | # Test get_cache_key method 65 | cache_key = source.get_cache_key("mp3", SPEAKER, audio_query) 66 | assert cache_key.endswith(".mp3") 67 | assert SPEAKER in cache_key 68 | 69 | cache_key = source.get_cache_key("wav", SPEAKER, audio_query) 70 | assert cache_key.endswith(".wav") 71 | assert SPEAKER in cache_key 72 | 73 | @pytest.mark.asyncio 74 | async def test_parse_text(source, audio_query): 75 | # Test parse_text method 
76 | text = source.parse_text(audio_query) 77 | assert text == "こんにちは。これはテストです。" 78 | 79 | @pytest.mark.asyncio 80 | async def test_make_stream_request(source, audio_query): 81 | # Test make_stream_request method 82 | request = source.make_stream_request(SPEAKER, audio_query) 83 | assert request["method"] == "POST" 84 | assert request["url"] == f"{VOICEVOX_URL}/synthesis" 85 | assert request["params"] == {"speaker": SPEAKER} 86 | assert request["json"] == audio_query 87 | 88 | @pytest.mark.asyncio 89 | async def test_fetch_stream_raw(source, audio_query): 90 | # Test fetch_stream_raw with a real request (ensure server is running locally) 91 | http_request = source.make_stream_request(SPEAKER, audio_query) 92 | 93 | try: 94 | async for chunk in source.fetch_stream_raw(http_request): 95 | assert isinstance(chunk, bytes) 96 | except Exception as e: 97 | pytest.fail(f"fetch_stream_raw failed: {e}") 98 | 99 | @pytest.mark.asyncio 100 | async def test_fetch_stream(source, audio_query): 101 | # Test fetch_stream method with conversion and caching 102 | audio_format = "mp3" 103 | 104 | try: 105 | async for chunk in await source.fetch_stream(audio_format, speaker=SPEAKER, audio_query=audio_query): 106 | assert isinstance(chunk, bytes) 107 | except Exception as e: 108 | pytest.fail(f"fetch_stream failed: {e}") 109 | -------------------------------------------------------------------------------- /speech_gateway/source/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import logging 3 | from time import time 4 | from typing import AsyncIterator, Any, Dict 5 | import httpx 6 | from ..cache import CacheStorage 7 | from ..converter import FormatConverter 8 | from ..performance_recorder import PerformanceRecorder 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class StreamSourceError(Exception): 14 | def __init__(self, message: str): 15 | super().__init__(message) 16 | 17 | 18 | class StreamSource(ABC): 19 | def __init__(self, 20 | *, 21 | base_url: str, 22 | cache_storage: CacheStorage = None, 23 | format_converters: Dict[str, FormatConverter] = None, 24 | max_connections: int = 100, 25 | max_keepalive_connections: int = 20, 26 | timeout: float = 10.0, 27 | performance_recorder: PerformanceRecorder = None, 28 | debug: bool = False 29 | ): 30 | self.base_url = base_url 31 | self.cache_storage = cache_storage 32 | self.format_converters = format_converters 33 | self.http_client = httpx.AsyncClient( 34 | follow_redirects=False, 35 | timeout=httpx.Timeout(timeout), 36 | limits=httpx.Limits( 37 | max_connections=max_connections, 38 | max_keepalive_connections=max_keepalive_connections 39 | ) 40 | ) 41 | self.performance_recorder = performance_recorder 42 | self.debug = debug 43 | 44 | @abstractmethod 45 | def get_cache_key(self, audio_format: str, **kwargs) -> str: 46 | pass 47 | 48 | @abstractmethod 49 | def parse_text(self, **kwargs) -> str: 50 | pass 51 | 52 | def get_converter(self, audio_format: str) -> FormatConverter: 53 | if self.format_converters: 54 | return self.format_converters.get(audio_format) 55 | 56 | @abstractmethod 57 | def make_stream_request(self, **kwargs) -> dict: 58 | pass 59 | 60 | async def fetch_stream_raw(self, http_request: Dict[str, Any]) -> AsyncIterator[bytes]: 61 | try: 62 | async with self.http_client.stream(**http_request) as audio_resp: 63 | if audio_resp.status_code != 200: 64 | resp_body = "" 65 | try: 66 | resp_body = await audio_resp.aread() 67 | except: 68 | pass 69 | 
raise StreamSourceError(f"Stream from voice service failed: {audio_resp.status_code}: {resp_body}") 70 | 71 | async for chunk in audio_resp.aiter_bytes(1024): 72 | yield chunk 73 | 74 | except httpx.RequestError as ex: 75 | raise StreamSourceError(f"HTTP request failed: {ex}") from ex 76 | 77 | async def fetch_stream(self, audio_format: str, **kwargs) -> AsyncIterator[bytes]: 78 | start_time = time() 79 | cache_key = self.get_cache_key(audio_format, **kwargs) 80 | use_cache = self.cache_storage and await self.cache_storage.has_cache(cache_key) 81 | 82 | if use_cache: 83 | if self.debug: 84 | logger.info(f"[cache]: {cache_key}") 85 | # Get cache stream 86 | stream = self.cache_storage.fetch_cache_stream(cache_key) 87 | 88 | else: 89 | # Get stream from TTS service 90 | if self.debug: 91 | logger.info(f"[generate]: {cache_key}") 92 | http_request = self.make_stream_request(**kwargs) 93 | 94 | if self.debug: 95 | logger.info(f"Request to speech service: {http_request}") 96 | 97 | stream = self.fetch_stream_raw(http_request) 98 | 99 | # Convert format 100 | converter = self.get_converter(audio_format) 101 | if converter: 102 | stream = converter.convert(stream) 103 | 104 | # Write cache 105 | if self.cache_storage: 106 | stream = self.cache_storage.write_cache(stream, cache_key) 107 | 108 | # Response time 109 | if self.performance_recorder: 110 | stream = self.record_time( 111 | stream, 112 | cache_key=cache_key, 113 | text=self.parse_text(**kwargs), 114 | audio_format=audio_format, 115 | cached=use_cache, 116 | start_time=start_time 117 | ) 118 | 119 | return stream 120 | 121 | async def record_time( 122 | self, 123 | input_stream: AsyncIterator[bytes], 124 | *, 125 | cache_key: str, 126 | text: str, 127 | audio_format: str, 128 | cached: bool, 129 | start_time: float 130 | ) -> AsyncIterator[bytes]: 131 | async for chunk in input_stream: 132 | yield chunk 133 | 134 | self.performance_recorder.record( 135 | process_id=cache_key, source=self.__class__.__name__, text=text, 136 | audio_format=audio_format, cached=1 if cached else 0, elapsed=time() - start_time 137 | ) 138 | 139 | async def close(self): 140 | await self.http_client.aclose() 141 | -------------------------------------------------------------------------------- /docker/run.py: -------------------------------------------------------------------------------- 1 | from contextlib import asynccontextmanager 2 | import logging 3 | import os 4 | from fastapi import FastAPI 5 | from dotenv import load_dotenv 6 | from speech_gateway.performance_recorder.postgres import PostgreSQLPerformanceRecorder 7 | from speech_gateway.gateway.azure import AzureGateway 8 | from speech_gateway.gateway.openai_speech import OpenAIGateway 9 | from speech_gateway.gateway.voicevox import VoicevoxGateway 10 | from speech_gateway.gateway.sbv2 import StyleBertVits2Gateway 11 | from speech_gateway.gateway.nijivoice_encoded import NijiVoiceEncodedGateway 12 | from speech_gateway.gateway.unified import UnifiedGateway 13 | 14 | # Configure root logger 15 | logger = logging.getLogger("speech_gateway") 16 | logger.setLevel(logging.INFO) 17 | log_format = logging.Formatter("[%(levelname)s] %(asctime)s : %(message)s") 18 | streamHandler = logging.StreamHandler() 19 | streamHandler.setFormatter(log_format) 20 | logger.addHandler(streamHandler) 21 | 22 | load_dotenv() 23 | DEBUG = os.getenv("DEBUG", "false").lower() in ("true", "1", "yes") 24 | 25 | # Azure 26 | AZURE_ENABLED = os.getenv("AZURE_ENABLED", "false").lower() in ("true", "1", "yes") 27 | AZURE_API_KEY = 
os.getenv("AZURE_API_KEY") 28 | AZURE_REGION = os.getenv("AZURE_REGION") 29 | AZURE_LANGUAGES = os.getenv("AZURE_LANGUAGES") 30 | # OpenAI 31 | OPENAI_ENABLED = os.getenv("OPENAI_ENABLED", "false").lower() in ("true", "1", "yes") 32 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 33 | OPENAI_LANGUAGES = os.getenv("OPENAI_LANGUAGES") 34 | # VOICEVOX 35 | VOICEVOX_ENABLED = os.getenv("VOICEVOX_ENABLED", "false").lower() in ("true", "1", "yes") 36 | VOICEVOX_URL = os.getenv("VOICEVOX_URL") 37 | VOICEVOX_LANGUAGES = os.getenv("VOICEVOX_LANGUAGES") 38 | # Style-Bert-VITS2 39 | SBV2_ENABLED = os.getenv("SBV2_ENABLED", "false").lower() in ("true", "1", "yes") 40 | SBV2_URL = os.getenv("SBV2_URL") 41 | SBV2_LANGUAGES = os.getenv("SBV2_LANGUAGES") 42 | # NIJIVOICE 43 | NIJIVOICE_ENABLED = os.getenv("NIJIVOICE_ENABLED", "false").lower() in ("true", "1", "yes") 44 | NIJIVOICE_API_KEY = os.getenv("NIJIVOICE_API_KEY") 45 | NIJIVOICE_LANGUAGES = os.getenv("NIJIVOICE_LANGUAGES") 46 | # Database 47 | DB_PORT = os.getenv("PORT_DB") 48 | DB_USER = os.getenv("SPGW_DB_USER") 49 | DB_PASSWORD = os.getenv("SPGW_DB_PASSWORD") 50 | 51 | # Performance recorder 52 | performance_recorder = PostgreSQLPerformanceRecorder(host="spgw-db", port=DB_PORT, user=DB_USER, password=DB_PASSWORD) 53 | 54 | # On app down 55 | @asynccontextmanager 56 | async def lifespan(app: FastAPI): 57 | yield 58 | # Shutdown enabled gateways 59 | if AZURE_ENABLED and 'azure_gateway' in globals(): 60 | await azure_gateway.shutdown() 61 | if OPENAI_ENABLED and 'openai_gateway' in globals(): 62 | await openai_gateway.shutdown() 63 | if VOICEVOX_ENABLED and 'voicevox_gateway' in globals(): 64 | await voicevox_gateway.shutdown() 65 | if SBV2_ENABLED and 'sbv2_gateway' in globals(): 66 | await sbv2_gateway.shutdown() 67 | if NIJIVOICE_ENABLED and 'nijivoice_gateway' in globals(): 68 | await nijivoice_gateway.shutdown() 69 | 70 | # Create API app 71 | app = FastAPI(lifespan=lifespan) 72 | 73 | # Unified gateway 74 | unified_gateway = UnifiedGateway(debug=True) 75 | app.include_router(unified_gateway.get_router()) 76 | 77 | # Create service gateways 78 | if AZURE_ENABLED: 79 | azure_gateway = AzureGateway(api_key=AZURE_API_KEY, cache_dir="cache/azure", performance_recorder=performance_recorder, region=AZURE_REGION, debug=DEBUG) 80 | unified_gateway.add_gateway( 81 | service_name="azure", 82 | gateway=azure_gateway, 83 | languages=AZURE_LANGUAGES.split(",") if AZURE_LANGUAGES else None, 84 | ) 85 | app.include_router(azure_gateway.get_router(), prefix="/azure") 86 | logger.info("[Gateway] Azure on /azure") 87 | 88 | if OPENAI_ENABLED: 89 | openai_gateway = OpenAIGateway(api_key=OPENAI_API_KEY, cache_dir="cache/openai", performance_recorder=performance_recorder, debug=DEBUG) 90 | unified_gateway.add_gateway( 91 | service_name="openai", 92 | gateway=openai_gateway, 93 | languages=OPENAI_LANGUAGES.split(",") if OPENAI_LANGUAGES else None, 94 | ) 95 | app.include_router(openai_gateway.get_router(), prefix="/openai") 96 | logger.info(f"[Gateway] OpenAI on /openai") 97 | 98 | if VOICEVOX_ENABLED: 99 | voicevox_gateway = VoicevoxGateway(base_url=VOICEVOX_URL, cache_dir="cache/voicevox", performance_recorder=performance_recorder, debug=DEBUG) 100 | unified_gateway.add_gateway( 101 | service_name="voicevox", 102 | gateway=voicevox_gateway, 103 | languages=VOICEVOX_LANGUAGES.split(",") if VOICEVOX_LANGUAGES else None, 104 | ) 105 | app.include_router(voicevox_gateway.get_router(), prefix="/voicevox") 106 | logger.info(f"[Gateway] VOICEVOX on /voicevox") 107 | 108 
| if SBV2_ENABLED: 109 | sbv2_gateway = StyleBertVits2Gateway(base_url=SBV2_URL, cache_dir="cache/sbv2", performance_recorder=performance_recorder, debug=DEBUG) 110 | unified_gateway.add_gateway( 111 | service_name="sbv2", 112 | gateway=sbv2_gateway, 113 | languages=SBV2_LANGUAGES.split(",") if SBV2_LANGUAGES else None, 114 | ) 115 | app.include_router(sbv2_gateway.get_router(), prefix="/sbv2") 116 | logger.info(f"[Gateway] Style-Bert-VITS2 on /sbv2") 117 | 118 | if NIJIVOICE_ENABLED: 119 | nijivoice_gateway = NijiVoiceEncodedGateway(api_key=NIJIVOICE_API_KEY, cache_dir="cache/nijivoice", performance_recorder=performance_recorder, debug=DEBUG) 120 | unified_gateway.add_gateway( 121 | service_name="nijivoice", 122 | gateway=nijivoice_gateway, 123 | languages=NIJIVOICE_LANGUAGES.split(",") if NIJIVOICE_LANGUAGES else None, 124 | ) 125 | app.include_router(nijivoice_gateway.get_router(), prefix="/nijivoice") 126 | logger.info(f"[Gateway] Nijivoice on /nijivoice") 127 | -------------------------------------------------------------------------------- /tests/gateway/test_openai_speech.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import httpx 4 | 5 | SPEAKER = "alloy" 6 | 7 | 8 | @pytest.mark.asyncio 9 | async def test_openai_speech(random_text, mp3_checker, audio_transcriber): 10 | resp = httpx.post( 11 | "http://127.0.0.1:8000/openai/audio/speech", 12 | json={ 13 | "model": "tts-1", 14 | "voice": "alloy", 15 | "input": random_text, 16 | "speed": 1.0, 17 | } 18 | ) 19 | audio_data = resp.content 20 | assert mp3_checker(audio_data) 21 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 22 | 23 | 24 | @pytest.mark.asyncio 25 | async def test_openai_speech_wav(random_text, wave_checker, audio_transcriber): 26 | resp = httpx.post( 27 | "http://127.0.0.1:8000/openai/audio/speech", 28 | json={ 29 | "model": "tts-1", 30 | "voice": "alloy", 31 | "input": random_text, 32 | "speed": 1.0, 33 | "response_format": "wav" 34 | } 35 | ) 36 | audio_data = resp.content 37 | assert wave_checker(audio_data) 38 | assert "音声合成" in audio_transcriber(audio_data, "wav") 39 | 40 | 41 | @pytest.mark.asyncio 42 | async def test_openai_speech_mp3(random_text, mp3_checker, audio_transcriber): 43 | resp = httpx.post( 44 | "http://127.0.0.1:8000/openai/audio/speech", 45 | json={ 46 | "model": "tts-1", 47 | "voice": "alloy", 48 | "input": random_text, 49 | "speed": 1.0, 50 | "response_format": "mp3" 51 | } 52 | ) 53 | audio_data = resp.content 54 | assert mp3_checker(audio_data) 55 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 56 | 57 | 58 | @pytest.mark.asyncio 59 | async def test_openai_speech_wav_mp3(random_text, mp3_checker, audio_transcriber): 60 | resp = httpx.post( 61 | "http://127.0.0.1:8000/openai/audio/speech", 62 | json={ 63 | "model": "tts-1", 64 | "voice": "alloy", 65 | "input": random_text, 66 | "speed": 1.0, 67 | "response_format": "wav" # <- wav 68 | }, 69 | params={ 70 | "x_audio_format": "mp3" # <- mp3 71 | } 72 | ) 73 | audio_data = resp.content 74 | assert mp3_checker(audio_data) 75 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 76 | 77 | 78 | @pytest.mark.asyncio 79 | async def test_openai_speech_mp3_wav(random_text, wave_checker, audio_transcriber): 80 | resp = httpx.post( 81 | "http://127.0.0.1:8000/openai/audio/speech", 82 | json={ 83 | "model": "tts-1", 84 | "voice": "alloy", 85 | "input": random_text, 86 | "speed": 1.0, 87 | "response_format": "mp3" # <- mp3 88 | }, 89 | params={ 90 | "x_audio_format": 
"wav" # <- wav 91 | } 92 | ) 93 | audio_data = resp.content 94 | assert wave_checker(audio_data) 95 | assert "音声合成" in audio_transcriber(audio_data, "wav") 96 | 97 | 98 | @pytest.mark.asyncio 99 | async def test_openai_speech_x_wav(random_text, wave_checker, audio_transcriber): 100 | resp = httpx.post( 101 | "http://127.0.0.1:8000/openai/audio/speech", 102 | json={ 103 | "model": "tts-1", 104 | "voice": "alloy", 105 | "input": random_text, 106 | "speed": 1.0, 107 | }, 108 | params={ 109 | "x_audio_format": "wav" # <- wav 110 | } 111 | ) 112 | audio_data = resp.content 113 | assert wave_checker(audio_data) 114 | assert "音声合成" in audio_transcriber(audio_data, "wav") 115 | 116 | 117 | @pytest.mark.asyncio 118 | async def test_openai_speech_x_mp3(random_text, mp3_checker, audio_transcriber): 119 | resp = httpx.post( 120 | "http://127.0.0.1:8000/openai/audio/speech", 121 | json={ 122 | "model": "tts-1", 123 | "voice": "alloy", 124 | "input": random_text, 125 | "speed": 1.0, 126 | }, 127 | params={ 128 | "x_audio_format": "mp3" # <- mp3 129 | } 130 | ) 131 | audio_data = resp.content 132 | assert mp3_checker(audio_data) 133 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 134 | 135 | 136 | @pytest.mark.asyncio 137 | async def test_openai_speech_unified(random_text, wave_checker, audio_transcriber): 138 | req = { 139 | "text": random_text, 140 | "speaker": SPEAKER, 141 | "service_name": "openai" 142 | } 143 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req) 144 | audio_data = resp.content 145 | assert wave_checker(audio_data) 146 | assert "音声合成" in audio_transcriber(audio_data, "wav") 147 | 148 | 149 | @pytest.mark.asyncio 150 | async def test_openai_speech_unified_wav(random_text, wave_checker, audio_transcriber): 151 | req = { 152 | "text": random_text, 153 | "speaker": SPEAKER, 154 | "service_name": "openai" 155 | } 156 | query_params = { 157 | "x_audio_format": "wav" 158 | } 159 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 160 | audio_data = resp.content 161 | assert wave_checker(audio_data) 162 | assert "音声合成" in audio_transcriber(audio_data, "wav") 163 | 164 | 165 | @pytest.mark.asyncio 166 | async def test_openai_speech_unified_mp3(random_text, mp3_checker, audio_transcriber): 167 | req = { 168 | "text": random_text, 169 | "speaker": SPEAKER, 170 | "service_name": "openai" 171 | } 172 | query_params = { 173 | "x_audio_format": "mp3" 174 | } 175 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 176 | audio_data = resp.content 177 | assert mp3_checker(audio_data) 178 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 179 | -------------------------------------------------------------------------------- /tests/gateway/test_azure_openai_speech.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import httpx 4 | 5 | SPEAKER = "alloy" 6 | 7 | 8 | @pytest.mark.asyncio 9 | async def test_openai_speech(random_text, mp3_checker, audio_transcriber): 10 | resp = httpx.post( 11 | "http://127.0.0.1:8000/azure_openai/audio/speech", 12 | json={ 13 | "model": "gpt-4o-mini-tts", 14 | "voice": "alloy", 15 | "input": random_text, 16 | "speed": 1.0, 17 | } 18 | ) 19 | audio_data = resp.content 20 | assert mp3_checker(audio_data) 21 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 22 | 23 | 24 | @pytest.mark.asyncio 25 | async def test_openai_speech_wav(random_text, wave_checker, audio_transcriber): 26 | resp = httpx.post( 27 | 
"http://127.0.0.1:8000/azure_openai/audio/speech", 28 | json={ 29 | "model": "gpt-4o-mini-tts", 30 | "voice": "alloy", 31 | "input": random_text, 32 | "speed": 1.0, 33 | "response_format": "wav" 34 | } 35 | ) 36 | audio_data = resp.content 37 | assert wave_checker(audio_data) 38 | assert "音声合成" in audio_transcriber(audio_data, "wav") 39 | 40 | 41 | @pytest.mark.asyncio 42 | async def test_openai_speech_mp3(random_text, mp3_checker, audio_transcriber): 43 | resp = httpx.post( 44 | "http://127.0.0.1:8000/azure_openai/audio/speech", 45 | json={ 46 | "model": "gpt-4o-mini-tts", 47 | "voice": "alloy", 48 | "input": random_text, 49 | "speed": 1.0, 50 | "response_format": "mp3" 51 | } 52 | ) 53 | audio_data = resp.content 54 | assert mp3_checker(audio_data) 55 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 56 | 57 | 58 | @pytest.mark.asyncio 59 | async def test_openai_speech_wav_mp3(random_text, mp3_checker, audio_transcriber): 60 | resp = httpx.post( 61 | "http://127.0.0.1:8000/azure_openai/audio/speech", 62 | json={ 63 | "model": "gpt-4o-mini-tts", 64 | "voice": "alloy", 65 | "input": random_text, 66 | "speed": 1.0, 67 | "response_format": "wav" # <- wav 68 | }, 69 | params={ 70 | "x_audio_format": "mp3" # <- mp3 71 | } 72 | ) 73 | audio_data = resp.content 74 | assert mp3_checker(audio_data) 75 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 76 | 77 | 78 | @pytest.mark.asyncio 79 | async def test_openai_speech_mp3_wav(random_text, wave_checker, audio_transcriber): 80 | resp = httpx.post( 81 | "http://127.0.0.1:8000/azure_openai/audio/speech", 82 | json={ 83 | "model": "gpt-4o-mini-tts", 84 | "voice": "alloy", 85 | "input": random_text, 86 | "speed": 1.0, 87 | "response_format": "mp3" # <- mp3 88 | }, 89 | params={ 90 | "x_audio_format": "wav" # <- wav 91 | } 92 | ) 93 | audio_data = resp.content 94 | assert wave_checker(audio_data) 95 | assert "音声合成" in audio_transcriber(audio_data, "wav") 96 | 97 | 98 | @pytest.mark.asyncio 99 | async def test_openai_speech_x_wav(random_text, wave_checker, audio_transcriber): 100 | resp = httpx.post( 101 | "http://127.0.0.1:8000/azure_openai/audio/speech", 102 | json={ 103 | "model": "gpt-4o-mini-tts", 104 | "voice": "alloy", 105 | "input": random_text, 106 | "speed": 1.0, 107 | }, 108 | params={ 109 | "x_audio_format": "wav" # <- wav 110 | } 111 | ) 112 | audio_data = resp.content 113 | assert wave_checker(audio_data) 114 | assert "音声合成" in audio_transcriber(audio_data, "wav") 115 | 116 | 117 | @pytest.mark.asyncio 118 | async def test_openai_speech_x_mp3(random_text, mp3_checker, audio_transcriber): 119 | resp = httpx.post( 120 | "http://127.0.0.1:8000/azure_openai/audio/speech", 121 | json={ 122 | "model": "gpt-4o-mini-tts", 123 | "voice": "alloy", 124 | "input": random_text, 125 | "speed": 1.0, 126 | }, 127 | params={ 128 | "x_audio_format": "mp3" # <- mp3 129 | } 130 | ) 131 | audio_data = resp.content 132 | assert mp3_checker(audio_data) 133 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 134 | 135 | 136 | @pytest.mark.asyncio 137 | async def test_openai_speech_unified(random_text, wave_checker, audio_transcriber): 138 | req = { 139 | "text": random_text, 140 | "speaker": SPEAKER, 141 | "service_name": "azure_openai" 142 | } 143 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req) 144 | audio_data = resp.content 145 | assert wave_checker(audio_data) 146 | assert "音声合成" in audio_transcriber(audio_data, "wav") 147 | 148 | 149 | @pytest.mark.asyncio 150 | async def test_openai_speech_unified_wav(random_text, wave_checker, 
audio_transcriber): 151 | req = { 152 | "text": random_text, 153 | "speaker": SPEAKER, 154 | "service_name": "azure_openai" 155 | } 156 | query_params = { 157 | "x_audio_format": "wav" 158 | } 159 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 160 | audio_data = resp.content 161 | assert wave_checker(audio_data) 162 | assert "音声合成" in audio_transcriber(audio_data, "wav") 163 | 164 | 165 | @pytest.mark.asyncio 166 | async def test_openai_speech_unified_mp3(random_text, mp3_checker, audio_transcriber): 167 | req = { 168 | "text": random_text, 169 | "speaker": SPEAKER, 170 | "service_name": "azure_openai" 171 | } 172 | query_params = { 173 | "x_audio_format": "mp3" 174 | } 175 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 176 | audio_data = resp.content 177 | assert mp3_checker(audio_data) 178 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 179 | -------------------------------------------------------------------------------- /tests/source/test_openai_speech_source.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from speech_gateway.source.openai_speech import OpenAIStreamSource 4 | 5 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 6 | AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY") 7 | AZURE_OPENAI_BASE_URL = os.getenv("AZURE_OPENAI_BASE_URL") 8 | 9 | @pytest.fixture 10 | def source(): 11 | # Create an instance of OpenAIStreamSource 12 | return OpenAIStreamSource(api_key=OPENAI_API_KEY) 13 | 14 | @pytest.mark.asyncio 15 | async def test_get_cache_key(source): 16 | # Test get_cache_key method 17 | request_json = { 18 | "model": "tts-1", 19 | "voice": "alloy", 20 | "input": "こんにちは。これはテストです。", 21 | "speed": 1.0, 22 | "response_format": "wav" 23 | } 24 | cache_key = source.get_cache_key("mp3", request_json) 25 | assert cache_key.endswith(".mp3") 26 | 27 | cache_key = source.get_cache_key("wav", request_json) 28 | assert cache_key.endswith(".wav") 29 | 30 | @pytest.mark.asyncio 31 | async def test_parse_text(source): 32 | # Test parse_text method 33 | request_json = { 34 | "model": "tts-1", 35 | "voice": "alloy", 36 | "input": "こんにちは。これはテストです。", 37 | "speed": 1.0, 38 | "response_format": "wav" 39 | } 40 | text = source.parse_text(request_json) 41 | assert text == "こんにちは。これはテストです。" 42 | 43 | @pytest.mark.asyncio 44 | async def test_make_stream_request(source): 45 | # Test make_stream_request method 46 | request_json = { 47 | "model": "tts-1", 48 | "voice": "alloy", 49 | "input": "こんにちは。これはテストです。", 50 | "speed": 1.0, 51 | "response_format": "wav" 52 | } 53 | request = source.make_stream_request(request_json) 54 | assert request["method"] == "POST" 55 | assert request["url"] == "https://api.openai.com/v1/audio/speech" 56 | assert request["json"] == request_json 57 | 58 | @pytest.mark.asyncio 59 | async def test_fetch_stream_raw(source): 60 | # Test fetch_stream_raw with a real request (ensure server is running locally) 61 | request_json = { 62 | "model": "tts-1", 63 | "voice": "alloy", 64 | "input": "こんにちは。これはテストです。", 65 | "speed": 1.0, 66 | "response_format": "wav" 67 | } 68 | http_request = source.make_stream_request(request_json) 69 | 70 | try: 71 | async for chunk in source.fetch_stream_raw(http_request): 72 | assert isinstance(chunk, bytes) 73 | except Exception as e: 74 | pytest.fail(f"fetch_stream_raw failed: {e}") 75 | 76 | @pytest.mark.asyncio 77 | async def test_fetch_stream(source): 78 | # Test fetch_stream method with conversion and
caching 79 | request_json = { 80 | "model": "tts-1", 81 | "voice": "alloy", 82 | "input": "こんにちは。これはテストです。", 83 | "speed": 1.0, 84 | "response_format": "wav" 85 | } 86 | 87 | audio_format = "wav" 88 | 89 | try: 90 | async for chunk in await source.fetch_stream(audio_format, request_json=request_json): 91 | assert isinstance(chunk, bytes) 92 | except Exception as e: 93 | pytest.fail(f"fetch_stream failed: {e}") 94 | 95 | @pytest.mark.asyncio 96 | async def test_fetch_stream_raw(source): 97 | # Test fetch_stream_raw with a real request (ensure server is running locally) 98 | request_json = { 99 | "model": "tts-1", 100 | "voice": "alloy", 101 | "input": "こんにちは。これはテストです。", 102 | "speed": 1.0, 103 | "response_format": "wav" 104 | } 105 | http_request = source.make_stream_request(request_json) 106 | 107 | try: 108 | async for chunk in source.fetch_stream_raw(http_request): 109 | assert isinstance(chunk, bytes) 110 | except Exception as e: 111 | pytest.fail(f"fetch_stream_raw failed: {e}") 112 | 113 | @pytest.mark.asyncio 114 | async def test_fetch_stream(source): 115 | # Test fetch_stream method with conversion and caching 116 | request_json = { 117 | "model": "tts-1", 118 | "voice": "alloy", 119 | "input": "こんにちは。これはテストです。", 120 | "speed": 1.0, 121 | "response_format": "wav" 122 | } 123 | 124 | audio_format = "wav" 125 | 126 | try: 127 | async for chunk in await source.fetch_stream(audio_format, request_json=request_json): 128 | assert isinstance(chunk, bytes) 129 | except Exception as e: 130 | pytest.fail(f"fetch_stream failed: {e}") 131 | 132 | 133 | @pytest.mark.asyncio 134 | async def test_fetch_stream_raw_azure(source): 135 | # Use Azure OpenAI API 136 | source.api_key = AZURE_OPENAI_API_KEY 137 | source.base_url = AZURE_OPENAI_BASE_URL 138 | 139 | # Test fetch_stream_raw with a real request (ensure server is running locally) 140 | request_json = { 141 | "model": "gpt-4o-mini-tts", 142 | "voice": "alloy", 143 | "input": "こんにちは。これはテストです。", 144 | "speed": 1.0, 145 | "response_format": "wav" 146 | } 147 | http_request = source.make_stream_request(request_json) 148 | 149 | try: 150 | async for chunk in source.fetch_stream_raw(http_request): 151 | assert isinstance(chunk, bytes) 152 | except Exception as e: 153 | pytest.fail(f"fetch_stream_raw_azure failed: {e}") 154 | 155 | @pytest.mark.asyncio 156 | async def test_fetch_stream_azure(source): 157 | # Use Azure OpenAI API 158 | source.api_key = AZURE_OPENAI_API_KEY 159 | source.base_url = AZURE_OPENAI_BASE_URL 160 | 161 | # Test fetch_stream method with conversion and caching 162 | request_json = { 163 | "model": "gpt-4o-mini-tts", 164 | "voice": "alloy", 165 | "input": "こんにちは。これはテストです。", 166 | "speed": 1.0, 167 | "response_format": "wav" 168 | } 169 | 170 | audio_format = "wav" 171 | 172 | try: 173 | async for chunk in await source.fetch_stream(audio_format, request_json=request_json): 174 | assert isinstance(chunk, bytes) 175 | except Exception as e: 176 | pytest.fail(f"fetch_stream_azure failed: {e}") 177 | -------------------------------------------------------------------------------- /tests/gateway/test_nijivoice.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import httpx 4 | 5 | VOICE_ACTOR_ID = "dba2fa0e-f750-43ad-b9f6-d5aeaea7dc16" 6 | 7 | 8 | @pytest.mark.asyncio 9 | async def test_nijivoice(random_text, mp3_checker, audio_transcriber): 10 | resp_json = httpx.post( 11 | 
f"http://127.0.0.1:8000/nijivoice/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice", 12 | json={ 13 | "script": random_text, 14 | "speed": "1.0" 15 | } 16 | ).json() 17 | 18 | resp = httpx.get(resp_json["generatedVoice"]["audioFileUrl"]) 19 | audio_data = resp.content 20 | assert mp3_checker(audio_data) 21 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 22 | 23 | 24 | @pytest.mark.asyncio 25 | async def test_nijivoice_wav(random_text, wave_checker, audio_transcriber): 26 | resp_json = httpx.post( 27 | f"http://127.0.0.1:8000/nijivoice/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice", 28 | json={ 29 | "script": random_text, 30 | "speed": "1.0", 31 | "format": "wav" 32 | } 33 | ).json() 34 | 35 | resp = httpx.get(resp_json["generatedVoice"]["audioFileUrl"]) 36 | audio_data = resp.content 37 | assert wave_checker(audio_data) 38 | assert "音声合成" in audio_transcriber(audio_data, "wav") 39 | 40 | 41 | @pytest.mark.asyncio 42 | async def test_nijivoice_mp3(random_text, mp3_checker, audio_transcriber): 43 | resp_json = httpx.post( 44 | f"http://127.0.0.1:8000/nijivoice/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice", 45 | json={ 46 | "script": random_text, 47 | "speed": "1.0", 48 | "format": "mp3" 49 | } 50 | ).json() 51 | 52 | resp = httpx.get(resp_json["generatedVoice"]["audioFileUrl"]) 53 | audio_data = resp.content 54 | assert mp3_checker(audio_data) 55 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 56 | 57 | 58 | @pytest.mark.asyncio 59 | async def test_nijivoice_wav_mp3(random_text, mp3_checker, audio_transcriber): 60 | resp_json = httpx.post( 61 | f"http://127.0.0.1:8000/nijivoice/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice", 62 | json={ 63 | "script": random_text, 64 | "speed": "1.0", 65 | "format": "wav" # <- wav 66 | }, 67 | params={ 68 | "x_audio_format": "mp3" # <- mp3 69 | } 70 | ).json() 71 | 72 | resp = httpx.get(resp_json["generatedVoice"]["audioFileUrl"]) 73 | audio_data = resp.content 74 | assert mp3_checker(audio_data) 75 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 76 | 77 | 78 | @pytest.mark.asyncio 79 | async def test_nijivoice_mp3_wav(random_text, wave_checker, audio_transcriber): 80 | resp_json = httpx.post( 81 | f"http://127.0.0.1:8000/nijivoice/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice", 82 | json={ 83 | "script": random_text, 84 | "speed": "1.0", 85 | "format": "mp3" # <- mp3 86 | }, 87 | params = { 88 | "x_audio_format": "wav" # <- wav 89 | } 90 | ).json() 91 | 92 | resp = httpx.get(resp_json["generatedVoice"]["audioFileUrl"]) 93 | audio_data = resp.content 94 | assert wave_checker(audio_data) 95 | assert "音声合成" in audio_transcriber(audio_data, "wav") 96 | 97 | 98 | @pytest.mark.asyncio 99 | async def test_nijivoice_x_wav(random_text, wave_checker, audio_transcriber): 100 | resp_json = httpx.post( 101 | f"http://127.0.0.1:8000/nijivoice/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice", 102 | json={ 103 | "script": random_text, 104 | "speed": "1.0" 105 | }, 106 | params = { 107 | "x_audio_format": "wav" # <- wav 108 | } 109 | ).json() 110 | 111 | resp = httpx.get(resp_json["generatedVoice"]["audioFileUrl"]) 112 | audio_data = resp.content 113 | assert wave_checker(audio_data) 114 | assert "音声合成" in audio_transcriber(audio_data, "wav") 115 | 116 | 117 | @pytest.mark.asyncio 118 | async def test_nijivoice_x_mp3(random_text, mp3_checker, audio_transcriber): 119 | resp_json = httpx.post( 120 | 
f"http://127.0.0.1:8000/nijivoice/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice", 121 | json={ 122 | "script": random_text, 123 | "speed": "1.0" 124 | }, 125 | params = { 126 | "x_audio_format": "mp3" # <- mp3 127 | } 128 | ).json() 129 | 130 | resp = httpx.get(resp_json["generatedVoice"]["audioFileUrl"]) 131 | audio_data = resp.content 132 | assert mp3_checker(audio_data) 133 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 134 | 135 | 136 | @pytest.mark.asyncio 137 | async def test_nijivoice_unified(random_text, wave_checker, audio_transcriber): 138 | req = { 139 | "text": random_text, 140 | "speaker": VOICE_ACTOR_ID, 141 | "service_name": "nijivoice" 142 | } 143 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req) 144 | audio_data = resp.content 145 | assert wave_checker(audio_data) 146 | assert "音声合成" in audio_transcriber(audio_data, "wav") 147 | 148 | 149 | @pytest.mark.asyncio 150 | async def test_nijivoice_unified_wav(random_text, wave_checker, audio_transcriber): 151 | req = { 152 | "text": random_text, 153 | "speaker": VOICE_ACTOR_ID, 154 | "service_name": "nijivoice" 155 | } 156 | query_params = { 157 | "x_audio_format": "wav" 158 | } 159 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 160 | audio_data = resp.content 161 | assert wave_checker(audio_data) 162 | assert "音声合成" in audio_transcriber(audio_data, "wav") 163 | 164 | 165 | @pytest.mark.asyncio 166 | async def test_nijivoice_unified_mp3(random_text, mp3_checker, audio_transcriber): 167 | req = { 168 | "text": random_text, 169 | "speaker": VOICE_ACTOR_ID, 170 | "service_name": "nijivoice" 171 | } 172 | query_params = { 173 | "x_audio_format": "mp3" 174 | } 175 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 176 | audio_data = resp.content 177 | assert mp3_checker(audio_data) 178 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 179 | -------------------------------------------------------------------------------- /tests/gateway/test_nijivoice_encoded.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import pytest 3 | import httpx 4 | 5 | VOICE_ACTOR_ID = "dba2fa0e-f750-43ad-b9f6-d5aeaea7dc16" 6 | 7 | 8 | @pytest.mark.asyncio 9 | async def test_nijivoice(random_text, mp3_checker, audio_transcriber): 10 | resp_json = httpx.post( 11 | f"http://127.0.0.1:8000/nijivoice_encoded/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-encoded-voice", 12 | json={ 13 | "script": random_text, 14 | "speed": "1.0" 15 | } 16 | ).json() 17 | base64_audio = resp_json["generatedVoice"]["base64Audio"] 18 | audio_data = base64.b64decode(base64_audio) 19 | 20 | assert mp3_checker(audio_data) 21 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 22 | 23 | 24 | @pytest.mark.asyncio 25 | async def test_nijivoice_wav(random_text, wave_checker, audio_transcriber): 26 | resp_json = httpx.post( 27 | f"http://127.0.0.1:8000/nijivoice_encoded/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-encoded-voice", 28 | json={ 29 | "script": random_text, 30 | "speed": "1.0", 31 | "format": "wav" 32 | } 33 | ).json() 34 | base64_audio = resp_json["generatedVoice"]["base64Audio"] 35 | audio_data = base64.b64decode(base64_audio) 36 | 37 | assert wave_checker(audio_data) 38 | assert "音声合成" in audio_transcriber(audio_data, "wav") 39 | 40 | 41 | @pytest.mark.asyncio 42 | async def test_nijivoice_mp3(random_text, mp3_checker, audio_transcriber): 43 | resp_json = httpx.post( 44 | 
f"http://127.0.0.1:8000/nijivoice_encoded/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-encoded-voice", 45 | json={ 46 | "script": random_text, 47 | "speed": "1.0", 48 | "format": "mp3" 49 | } 50 | ).json() 51 | base64_audio = resp_json["generatedVoice"]["base64Audio"] 52 | audio_data = base64.b64decode(base64_audio) 53 | 54 | assert mp3_checker(audio_data) 55 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 56 | 57 | 58 | @pytest.mark.asyncio 59 | async def test_nijivoice_wav_mp3(random_text, mp3_checker, audio_transcriber): 60 | resp_json = httpx.post( 61 | f"http://127.0.0.1:8000/nijivoice_encoded/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-encoded-voice", 62 | json={ 63 | "script": random_text, 64 | "speed": "1.0", 65 | "format": "wav" # <- wav 66 | }, 67 | params={ 68 | "x_audio_format": "mp3" # <- mp3 69 | } 70 | ).json() 71 | base64_audio = resp_json["generatedVoice"]["base64Audio"] 72 | audio_data = base64.b64decode(base64_audio) 73 | 74 | assert mp3_checker(audio_data) 75 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 76 | 77 | 78 | @pytest.mark.asyncio 79 | async def test_nijivoice_mp3_wav(random_text, wave_checker, audio_transcriber): 80 | resp_json = httpx.post( 81 | f"http://127.0.0.1:8000/nijivoice_encoded/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-encoded-voice", 82 | json={ 83 | "script": random_text, 84 | "speed": "1.0", 85 | "format": "mp3" # <- mp3 86 | }, 87 | params = { 88 | "x_audio_format": "wav" # <- wav 89 | } 90 | ).json() 91 | base64_audio = resp_json["generatedVoice"]["base64Audio"] 92 | audio_data = base64.b64decode(base64_audio) 93 | 94 | assert wave_checker(audio_data) 95 | assert "音声合成" in audio_transcriber(audio_data, "wav") 96 | 97 | 98 | @pytest.mark.asyncio 99 | async def test_nijivoice_x_wav(random_text, wave_checker, audio_transcriber): 100 | resp_json = httpx.post( 101 | f"http://127.0.0.1:8000/nijivoice_encoded/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-encoded-voice", 102 | json={ 103 | "script": random_text, 104 | "speed": "1.0" 105 | }, 106 | params = { 107 | "x_audio_format": "wav" # <- wav 108 | } 109 | ).json() 110 | base64_audio = resp_json["generatedVoice"]["base64Audio"] 111 | audio_data = base64.b64decode(base64_audio) 112 | 113 | assert wave_checker(audio_data) 114 | assert "音声合成" in audio_transcriber(audio_data, "wav") 115 | 116 | 117 | @pytest.mark.asyncio 118 | async def test_nijivoice_x_mp3(random_text, mp3_checker, audio_transcriber): 119 | resp_json = httpx.post( 120 | f"http://127.0.0.1:8000/nijivoice_encoded/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-encoded-voice", 121 | json={ 122 | "script": random_text, 123 | "speed": "1.0" 124 | }, 125 | params = { 126 | "x_audio_format": "mp3" # <- mp3 127 | } 128 | ).json() 129 | base64_audio = resp_json["generatedVoice"]["base64Audio"] 130 | audio_data = base64.b64decode(base64_audio) 131 | 132 | assert mp3_checker(audio_data) 133 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 134 | 135 | 136 | @pytest.mark.asyncio 137 | async def test_nijivoice_unified(random_text, wave_checker, audio_transcriber): 138 | req = { 139 | "text": random_text, 140 | "speaker": VOICE_ACTOR_ID, 141 | "service_name": "nijivoice" 142 | } 143 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req) 144 | audio_data = resp.content 145 | assert wave_checker(audio_data) 146 | assert "音声合成" in audio_transcriber(audio_data, "wav") 147 | 148 | 149 | @pytest.mark.asyncio 150 | async def test_nijivoice_unified_wav(random_text, wave_checker, 
audio_transcriber): 151 | req = { 152 | "text": random_text, 153 | "speaker": VOICE_ACTOR_ID, 154 | "service_name": "nijivoice" 155 | } 156 | query_params = { 157 | "x_audio_format": "wav" 158 | } 159 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 160 | audio_data = resp.content 161 | assert wave_checker(audio_data) 162 | assert "音声合成" in audio_transcriber(audio_data, "wav") 163 | 164 | 165 | @pytest.mark.asyncio 166 | async def test_nijivoice_unified_mp3(random_text, mp3_checker, audio_transcriber): 167 | req = { 168 | "text": random_text, 169 | "speaker": VOICE_ACTOR_ID, 170 | "service_name": "nijivoice" 171 | } 172 | query_params = { 173 | "x_audio_format": "mp3" 174 | } 175 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 176 | audio_data = resp.content 177 | assert mp3_checker(audio_data) 178 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 179 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SpeechGateway 2 | 3 | A reverse proxy server that enhances speech synthesis with essential, extensible features. 🦉💬 4 | 5 | 6 | ## 💎 Features 7 | 8 | - 🥰 **Supports Popular Speech Services**: Works seamlessly with AivisSpeech, VOICEVOX, Style-Bert-VITS2, NijiVoice, OpenAI, and Azure — and lets you integrate additional services to suit your needs. 9 | - 🗂️ **Caching**: Boost response speed and save API calls with built-in audio caching. 10 | - 🔄 **Format Conversion**: Effortlessly convert WAV to MP3 for bandwidth-friendly responses. 11 | - 📊 **Performance Metrics**: Track synthesis time and cache hits for in-depth insights. 12 | - ⚡️ **Low Latency**: Streamlined pipeline for minimal delay, delivering fast results! 13 | - 🌟 **Unified Interface**: Use various text-to-speech services through a unified interface — now with multi-language support! 🌏 14 | 15 | 16 | ## 🎁 Installation 17 | 18 | ```sh 19 | pip install speech-gateway 20 | ``` 21 | 22 | To use MP3 format conversion, you also need to install ffmpeg on your computer.
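MP3 conversion relies on the ffmpeg binary, so it should be discoverable on your `PATH`. A quick way to check is the standard version command (a generic ffmpeg invocation, nothing specific to this project):

```sh
# Prints the version banner if ffmpeg is installed and on your PATH
ffmpeg -version
```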
23 | 24 | 25 | ## 🚀 Start server 26 | 27 | Create a script like the following example: 28 | 29 | ```python 30 | from contextlib import asynccontextmanager 31 | from fastapi import FastAPI 32 | from speech_gateway.gateway.voicevox import VoicevoxGateway 33 | from speech_gateway.gateway.sbv2 import StyleBertVits2Gateway 34 | from speech_gateway.gateway.nijivoice import NijiVoiceGateway 35 | 36 | # Create gateways 37 | voicevox_gateway = VoicevoxGateway(base_url="http://127.0.0.1:10101", debug=True) 38 | sbv2_gateway = StyleBertVits2Gateway(base_url="http://127.0.0.1:5000", debug=True) 39 | nijivoice_gateway = NijiVoiceGateway(api_key=NIJIVOICE_API_KEY, prefix="/nijivoice", debug=True) 40 | 41 | # Shut down gateways on app shutdown 42 | @asynccontextmanager 43 | async def lifespan(app: FastAPI): 44 |     yield 45 |     await voicevox_gateway.shutdown() 46 |     await sbv2_gateway.shutdown() 47 |     await nijivoice_gateway.shutdown() 48 | 49 | # Create app with the lifespan handler 50 | app = FastAPI(lifespan=lifespan) 51 | 52 | # Add gateways to app 53 | app.include_router(voicevox_gateway.get_router(), prefix="/aivisspeech") 54 | app.include_router(sbv2_gateway.get_router(), prefix="/sbv2") 55 | app.include_router(nijivoice_gateway.get_router(), prefix="/nijivoice") 56 | ``` 57 | 58 | Then, run it with uvicorn: 59 | 60 | ``` 61 | uvicorn run:app --port 8000 62 | ``` 63 | 64 | In this example, you can access AivisSpeech at http://127.0.0.1:8000/aivisspeech, Style-Bert-VITS2 at http://127.0.0.1:8000/sbv2, and NijiVoice at http://127.0.0.1:8000/nijivoice. 65 | 66 | **NOTE**: If you want to perform MP3 conversion, make sure to include `x_audio_format=mp3` as a query parameter in your request. 67 | 68 | 69 | ## 🌟 Unified Interface 70 | 71 | You can use various text-to-speech services through a unified interface specification. 72 | Below is an example of providing a unified interface for AivisSpeech, Style-Bert-VITS2, and NijiVoice. 73 | 74 | ```python 75 | from speech_gateway.gateway.unified import UnifiedGateway 76 | 77 | # Create UnifiedGateway and add gateways with their service names 78 | unified_gateway = UnifiedGateway(debug=True) 79 | unified_gateway.add_gateway("aivisspeech", aivisspeech_gateway, True)  # Set as default gateway 80 | unified_gateway.add_gateway("sbv2", sbv2_gateway) 81 | unified_gateway.add_gateway("nijivoice", nijivoice_gateway) 82 | 83 | # Add unified interface router 84 | app.include_router(unified_gateway.get_router()) 85 | ``` 86 | 87 | ### Parameters 88 | 89 | POST a JSON object with the following fields: 90 | 91 | | Parameter | Type | Required | Description | 92 | |---------------|--------|----------|---------------------------------------------------------------------------------------------| 93 | | `text` | string | Required | The text to be synthesized into speech. | 94 | | `speaker` | string | Optional | The unique identifier for the voice in each speech service.<br/>For Style-Bert-VITS2, specify as `{model_id}-{speaker_id}`.<br/>If omitted, the default speaker of the speech service will be used. | 95 | | `style` | string | Optional | A predefined set of voice styles that includes `neutral`, `joy`, `angry`, `sorrow`, `fun`, and `surprised`. | 96 | | `speed` | float | Optional | The speed of synthesized speech, where 1.0 is normal speed. The acceptable range depends on each speech service. | 97 | | `service_name` | string | Optional | The name of the service as specified in `add_gateway`.<br/>If omitted, the default gateway will be used. | 98 | | `language` | string | Optional | The language code in ISO 639-1 format combined with a country code (e.g., `en-US`). The corresponding text-to-speech service will be used.<br/>If omitted, the default gateway will be used. | 99 | 100 | ### Client code 101 | 102 | You can access the services in a unified manner, as shown in the client code below: 103 | 104 | ```python 105 | import httpx 106 | 107 | req = {"text": "こんにちは。これはデフォルトサービスだよ。", "speaker": "888753761"} 108 | # req = {"text": "こんにちは。これはAivisSpeechだよ。", "speaker": "888753761", "service_name": "aivisspeech"} 109 | # req = {"text": "こんにちは。これはStyle-Bert-VITS2だよ。", "speaker": "0-0", "service_name": "sbv2"} 110 | # req = {"text": "こんにちは。これはにじボイスだよ。", "speaker": "a192db5f-bd8b-4fc7-bc08-af5ca5957c12", "service_name": "nijivoice"} 111 | 112 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req, timeout=60) 113 | 114 | with open("tts.wav", "wb") as f: 115 |     f.write(resp.content) 116 | ``` 117 | 118 | **NOTE**: Due to the unified specification, it is not possible to use features specific to each text-to-speech service (e.g., intonation adjustment or pitch variation control). If you need high-quality speech synthesis utilizing such features, please use the individual service interfaces. 119 | 120 | 121 | ### Applying Style 122 | 123 | Define styles on the server side. 124 | 125 | ```python 126 | aivisspeech_gateway = VoicevoxGateway(base_url="http://127.0.0.1:10101", debug=True) 127 | # Define speakers for each style 128 | aivisspeech_gateway.style_mapper["888753761"] = { 129 |     "joy": "888753764", 130 |     "angry": "888753765", 131 |     "sorrow": "888753765", 132 |     "fun": "888753762", 133 |     "surprised": "888753762" 134 | } 135 | 136 | sbv2_gateway = StyleBertVits2Gateway(base_url="http://127.0.0.1:5000", debug=True) 137 | # Define style names for each style 138 | sbv2_gateway.style_mapper["0-0"] = { 139 |     "joy": "上機嫌", 140 |     "angry": "怒り・悲しみ", 141 |     "sorrow": "怒り・悲しみ", 142 |     "fun": "テンション高め", 143 |     "surprised": "テンション高め" 144 | } 145 | ``` 146 | 147 | Call with a style from the client. 148 | 149 | ```python 150 | req = {"service_name": "aivisspeech", "text": "こんにちは。これはデフォルトサービスだよ。", "speaker": "888753761", "style": "angry"} 151 | # req = {"service_name": "sbv2", "text": "こんにちは。これはStyle-Bert-VITS2だよ。", "speaker": "0-0", "style": "angry"} 152 | 153 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req, timeout=60) 154 | 155 | with open("tts.wav", "wb") as f: 156 |     f.write(resp.content) 157 | ``` 158 | 159 | 160 | ### Multi-language Support 161 | 162 | You can configure the system to use the appropriate speech service based on the language, without explicitly specifying the service name. 163 | By passing `languages` to `add_gateway`, you can register a speech service that corresponds to the `language` specified in the request. Additionally, by registering a `default_speaker`, you can eliminate the need to specify a `speaker` in each request. 164 | 165 | ```python 166 | # Gateway for default language (ja-JP) - Voice: 888753761 167 | unified_gateway.add_gateway("aivisspeech", aivisspeech_gateway, default_speaker="888753761", default=True) 168 | 169 | # Gateway for en-US and zh-CN - Voice: Alloy 170 | unified_gateway.add_gateway("openai", openai_gateway, languages=["en-US", "zh-CN"], default_speaker="alloy") 171 | ``` 172 | 173 | Here is an example of client code to call this API. Switching the `language` enables easy support for multiple languages.
174 | 175 | ```python 176 | import httpx 177 | 178 | # Simply set the text and language - easily switch between multiple languages 179 | req = {"text": "こんにちは。これはデフォルトサービスだよ。"} 180 | # req = {"text": "Hello. This is the speech service for English.", "language": "en-US"} 181 | # req = {"text": "你好,这是英语的语音服务。", "language": "zh-CN"} 182 | 183 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req, timeout=60) 184 | 185 | with open("tts.wav", "wb") as f: 186 | f.write(resp.content) 187 | ``` 188 | 189 | 190 | ## 🛠️ Customization 191 | 192 | You can add new speech synthesis services to relay. 193 | Additionally, you can extend the cache store, audio format converter, and performance recorder. For example, the default cache store uses the file system, but you can replace it with a cloud storage service or another alternative. 194 | 195 | We’ll provide documentation for these customizations as the need arises, so if you have specific requests, please open an issue! 🙏 196 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------