├── speech_gateway ├── __init__.py ├── converter │ ├── __init__.py │ ├── pcm.py │ ├── mp3.py │ ├── wave.py │ └── mulaw.py ├── source │ ├── sbv2.py │ ├── voicevox.py │ ├── nijivoice_encoded.py │ ├── azure.py │ ├── openai_speech.py │ ├── nijivoice.py │ └── __init__.py ├── cache │ ├── __init__.py │ └── file.py ├── performance_recorder │ ├── __init__.py │ ├── sqlite.py │ └── postgres.py └── gateway │ ├── unified.py │ ├── voicevox.py │ ├── openai_speech.py │ ├── sbv2.py │ ├── azure.py │ ├── nijivoice_encoded.py │ ├── nijivoice.py │ └── __init__.py ├── requirements.txt ├── .gitattributes ├── tests ├── data │ └── test.wav ├── converter │ ├── test_mp3.py │ └── test_wave.py ├── conftest.py ├── source │ ├── test_sbv2_source.py │ ├── test_azure_source.py │ ├── test_nijivoice_encoded_source.py │ ├── test_nijivoice_source.py │ ├── test_voicevox_source.py │ └── test_openai_speech_source.py ├── gateway │ ├── test_sbv2.py │ ├── test_voicevox.py │ ├── test_unified.py │ ├── test_azure.py │ ├── test_openai_speech.py │ ├── test_azure_openai_speech.py │ ├── test_nijivoice.py │ └── test_nijivoice_encoded.py ├── cache │ └── test_file.py └── performance_recorder │ ├── test_sqlite.py │ └── test_postgres.py ├── docker ├── requirements.txt ├── pgadmin-servers.json ├── init-db.sh ├── README.md ├── setup-volumes.sh ├── Dockerfile.app ├── .env.sample ├── docker-compose.yaml └── run.py ├── setup.py ├── run.py ├── .gitignore ├── README.md └── LICENSE /speech_gateway/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiofiles==24.1.0 2 | fastapi==0.115.6 3 | httpx==0.28.1 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /tests/data/test.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uezo/speech-gateway/HEAD/tests/data/test.wav -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | psycopg2-binary==2.9.9 2 | python-dotenv==1.0.0 3 | git+https://github.com/uezo/speech-gateway 4 | -------------------------------------------------------------------------------- /docker/pgadmin-servers.json: -------------------------------------------------------------------------------- 1 | { 2 | "Servers": { 3 | "1": { 4 | "Name": "speech-gateway", 5 | "Group": "Servers", 6 | "Host": "spgw-db", 7 | "Port": 5432, 8 | "MaintenanceDB": "postgres", 9 | "Username": "postgres", 10 | "SSLMode": "prefer" 11 | } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /speech_gateway/converter/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import AsyncIterator 3 | 4 | 5 | class FormatConverter(ABC): 6 | @abstractmethod 7 | async def convert(self, input_stream: AsyncIterator[bytes]) -> AsyncIterator[bytes]: 8 | pass 9 | 10 | 11 | class FormatConverterError(Exception): 12 | def 
__init__(self, message: str): 13 | super().__init__(message) 14 | 15 | 16 | from .mp3 import MP3Converter 17 | -------------------------------------------------------------------------------- /docker/init-db.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" <<-EOSQL 6 | CREATE DATABASE $SPGW_DB_NAME; 7 | CREATE USER "$SPGW_DB_USER" WITH PASSWORD '$SPGW_DB_PASSWORD'; 8 | GRANT ALL PRIVILEGES ON DATABASE $SPGW_DB_NAME TO "$SPGW_DB_USER"; 9 | EOSQL 10 | 11 | psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$SPGW_DB_NAME" <<-EOSQL 12 | GRANT ALL ON SCHEMA public TO "$SPGW_DB_USER"; 13 | ALTER SCHEMA public OWNER TO "$SPGW_DB_USER"; 14 | EOSQL 15 | -------------------------------------------------------------------------------- /speech_gateway/source/sbv2.py: -------------------------------------------------------------------------------- 1 | from . import StreamSource 2 | 3 | 4 | class StyleBertVits2StreamSource(StreamSource): 5 | def get_cache_key(self, audio_format: str, query_params: dict, **kwargs) -> str: 6 | return f"{hash(str(query_params))}.{audio_format or 'wav'}" 7 | 8 | def parse_text(self, query_params: dict, **kwargs) -> str: 9 | return query_params.get("text") 10 | 11 | def make_stream_request(self, query_params: dict, **kwargs): 12 | return { 13 | "method": "GET", 14 | "url": self.base_url + "/voice", 15 | "params": query_params, 16 | } 17 | -------------------------------------------------------------------------------- /speech_gateway/cache/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import AsyncIterator 3 | 4 | 5 | class CacheStorage(ABC): 6 | @abstractmethod 7 | async def has_cache(self, cache_key: str) -> bool: 8 | pass 9 | 10 | @abstractmethod 11 | async def fetch_cache_stream(self, cache_key: str) -> AsyncIterator[bytes]: 12 | pass 13 | 14 | @abstractmethod 15 | async def write_cache(self, input_stream: AsyncIterator[bytes], cache_key: str) -> AsyncIterator[bytes]: 16 | pass 17 | 18 | 19 | class CacheStorageError(Exception): 20 | def __init__(self, message: str): 21 | super().__init__(message) 22 | 23 | 24 | from .file import FileCacheStorage 25 | -------------------------------------------------------------------------------- /speech_gateway/converter/pcm.py: -------------------------------------------------------------------------------- 1 | import io 2 | import wave 3 | import soundfile as sf 4 | import numpy as np 5 | 6 | 7 | def convert_float32bit_to_int16bit(input_data: bytes) -> bytes: 8 | data, samplerate = sf.read(io.BytesIO(input_data)) 9 | pcm16_data = (data * 32767).astype(np.int16) 10 | channels = pcm16_data.shape[1] if pcm16_data.ndim > 1 else 1 11 | 12 | wav_bytes_io = io.BytesIO() 13 | with wave.open(wav_bytes_io, "wb") as wav_file: 14 | wav_file.setnchannels(channels) 15 | wav_file.setsampwidth(2) 16 | wav_file.setframerate(samplerate) 17 | wav_file.writeframes(pcm16_data.tobytes()) 18 | 19 | wav_bytes = wav_bytes_io.getvalue() 20 | return wav_bytes 21 | --------------------------------------------------------------------------------
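The PCM helper above is a standalone function rather than a `FormatConverter` subclass. A usage sketch, assuming a float32 WAV payload (the file names here are hypothetical):

```python
# Sketch: normalize a float32 WAV (as some engines emit) to 16-bit PCM WAV.
# "float32.wav" / "pcm16.wav" are hypothetical file names.
from speech_gateway.converter.pcm import convert_float32bit_to_int16bit

with open("float32.wav", "rb") as f:
    float32_wav = f.read()

pcm16_wav = convert_float32bit_to_int16bit(float32_wav)

with open("pcm16.wav", "wb") as f:
    f.write(pcm16_wav)
```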
/docker/README.md: -------------------------------------------------------------------------------- 1 | # Speech Gateway Docker Setup 2 | 3 | ## Quick Start 4 | 5 | ### 1. Setup Environment 6 | ```bash 7 | cp .env.sample .env 8 | # Edit .env and set your API keys 9 | ``` 10 | 11 | ### 2. Create Volume Directories 12 | ```bash 13 | ./setup-volumes.sh 14 | ``` 15 | 16 | ### 3. Start Services 17 | ```bash 18 | docker compose up -d 19 | ``` 20 | 21 | ## Access 22 | 23 | - Application: http://localhost:18000 24 | - PgAdmin: http://localhost:18001 25 | 26 | ## Configuration 27 | 28 | Edit `.env` file to: 29 | - Set API keys (AZURE_API_KEY, OPENAI_API_KEY, etc.) 30 | - Enable/disable services (AZURE_ENABLED=true/false) 31 | - Change ports if needed 32 | 33 | ## Stop Services 34 | 35 | ```bash 36 | docker compose down 37 | ``` 38 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="speech_gateway", 5 | version="0.1.7", 6 | url="https://github.com/uezo/speech-gateway", 7 | author="uezo", 8 | author_email="uezo@uezo.net", 9 | maintainer="uezo", 10 | maintainer_email="uezo@uezo.net", 11 | description="A reverse proxy server that enhances speech synthesis with essential, extensible features. 🦉💬", 12 | long_description=open("README.md").read(), 13 | long_description_content_type="text/markdown", 14 | packages=find_packages(exclude=["tests*"]), 15 | install_requires=["aiofiles==24.1.0", "fastapi==0.115.6", "httpx==0.28.1", "uvicorn==0.34.0"], 16 | license="Apache v2", 17 | classifiers=[ 18 | "Programming Language :: Python :: 3" 19 | ] 20 | ) 21 | -------------------------------------------------------------------------------- /speech_gateway/performance_recorder/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import dataclass 3 | 4 | 5 | class PerformanceRecorder(ABC): 6 | @abstractmethod 7 | def record( 8 | self, 9 | *, 10 | process_id: str, 11 | source: str = None, 12 | text: str = None, 13 | audio_format: str = None, 14 | cached: int = 0, 15 | elapsed: float = None, 16 | ): 17 | pass 18 | 19 | @abstractmethod 20 | def close(self): 21 | pass 22 | 23 | 24 | @dataclass 25 | class PerformanceRecord: 26 | process_id: str 27 | source: str = None 28 | text: str = None 29 | audio_format: str = None 30 | cached: int = 0 31 | elapsed: float = None 32 | 33 | 34 | from .sqlite import SQLitePerformanceRecorder 35 | --------------------------------------------------------------------------------
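For reference, implementing the interface above only requires `record()` and `close()`. A minimal sketch of a hypothetical recorder that logs instead of persisting (the shipped implementations are the SQLite and Postgres recorders):

```python
# Hypothetical LoggingPerformanceRecorder: a minimal PerformanceRecorder
# that writes records to a logger instead of a database.
import logging
from speech_gateway.performance_recorder import PerformanceRecorder

class LoggingPerformanceRecorder(PerformanceRecorder):
    def __init__(self):
        self.logger = logging.getLogger("speech_gateway.performance")

    def record(self, *, process_id: str, source: str = None, text: str = None,
               audio_format: str = None, cached: int = 0, elapsed: float = None):
        # Emit the record instead of inserting it into a table
        self.logger.info(
            "process_id=%s source=%s format=%s cached=%s elapsed=%s",
            process_id, source, audio_format, cached, elapsed,
        )

    def close(self):
        pass
```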
/docker/setup-volumes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Setup script for Docker volumes 4 | # This script creates necessary directories for Docker named volumes 5 | 6 | set -e 7 | 8 | # Load environment variables 9 | if [ -f .env ]; then 10 | export $(grep -v '^#' .env | xargs) 11 | fi 12 | 13 | # Default to ./data if DATA_PATH is not set 14 | DATA_PATH=${DATA_PATH:-./data} 15 | 16 | echo "Setting up volumes at: $DATA_PATH" 17 | 18 | # Create directories if they don't exist 19 | mkdir -p "$DATA_PATH/postgres" 20 | mkdir -p "$DATA_PATH/pgadmin" 21 | mkdir -p "$DATA_PATH/cache" 22 | 23 | # Set appropriate permissions 24 | # PostgreSQL needs UID 999 (in most PostgreSQL Docker images) 25 | # PgAdmin needs UID 5050 26 | if [ "$(uname)" = "Linux" ]; then 27 | sudo chown -R 999:999 "$DATA_PATH/postgres" 2>/dev/null || true 28 | sudo chown -R 5050:5050 "$DATA_PATH/pgadmin" 2>/dev/null || true 29 | fi 30 | 31 | echo "Volume directories created successfully:" 32 | echo " - $DATA_PATH/postgres" 33 | echo " - $DATA_PATH/pgadmin" 34 | echo " - $DATA_PATH/cache" 35 | echo "" 36 | echo "You can now run: docker compose up -d" 37 | -------------------------------------------------------------------------------- /docker/Dockerfile.app: -------------------------------------------------------------------------------- 1 | # Multi-stage build for optimized image 2 | FROM python:3.11-slim AS builder 3 | 4 | # Install build dependencies (git needed for GitHub installation) 5 | RUN apt-get update && apt-get install -y \ 6 | gcc \ 7 | git \ 8 | && rm -rf /var/lib/apt/lists/* 9 | 10 | # Create virtual environment 11 | RUN python -m venv /opt/venv 12 | ENV PATH="/opt/venv/bin:$PATH" 13 | 14 | # Copy requirements and install dependencies 15 | COPY requirements.txt /tmp/ 16 | RUN pip install --upgrade pip && \ 17 | pip install --no-cache-dir -r /tmp/requirements.txt 18 | 19 | # Runtime stage 20 | FROM python:3.11-slim 21 | 22 | # Install runtime dependencies 23 | RUN apt-get update && apt-get install -y \ 24 | curl \ 25 | && rm -rf /var/lib/apt/lists/* \ 26 | && useradd -m -u 1000 app 27 | 28 | # Copy virtual environment from builder 29 | COPY --from=builder /opt/venv /opt/venv 30 | ENV PATH="/opt/venv/bin:$PATH" 31 | 32 | WORKDIR /app 33 | 34 | # Copy application 35 | COPY --chown=app:app run.py /app/ 36 | 37 | # Switch to non-root user 38 | USER app 39 | 40 | EXPOSE 8000 41 | 42 | CMD ["uvicorn", "run:app", "--host", "0.0.0.0", "--port", "8000"] 43 | -------------------------------------------------------------------------------- /speech_gateway/source/voicevox.py: -------------------------------------------------------------------------------- 1 | import urllib.parse 2 | import httpx 3 | from . 
import StreamSource, StreamSourceError 4 | 5 | 6 | class VoicevoxStreamSource(StreamSource): 7 | def get_cache_key(self, audio_format: str, speaker: str, audio_query: dict, **kwargs) -> str: 8 | return f"{speaker}_{hash(str(audio_query))}.{audio_format or 'wav'}" 9 | 10 | def parse_text(self, audio_query: dict, **kwargs) -> str: 11 | return audio_query.get("kana") 12 | 13 | def make_stream_request(self, speaker: str, audio_query: dict, **kwargs): 14 | return { 15 | "method": "POST", 16 | "url": self.base_url + "/synthesis", 17 | "params": {"speaker": speaker}, 18 | "json": audio_query 19 | } 20 | 21 | async def get_audio_query(self, speaker: str, text: str, **kwargs): 22 | try: 23 | url = f"{self.base_url}/audio_query" 24 | 25 | response = await self.http_client.post(url, params={"speaker": speaker, "text": text}) 26 | response.raise_for_status() 27 | 28 | return response.json() 29 | 30 | except httpx.RequestError as ex: 31 | raise StreamSourceError(f"HTTP request failed: {ex}") from ex 32 | -------------------------------------------------------------------------------- /docker/.env.sample: -------------------------------------------------------------------------------- 1 | COMPOSE_PROJECT_NAME=speech-gateway 2 | 3 | # Database settings 4 | POSTGRES_USER=postgres 5 | POSTGRES_PASSWORD=postgres 6 | 7 | SPGW_DB_NAME=speech_gateway 8 | SPGW_DB_USER=spgw-runtime 9 | SPGW_DB_PASSWORD=spgw-runtime-password 10 | 11 | # Port settings 12 | PORT_SPGW=18000 13 | PORT_DB=5432 14 | PORT_PGADMIN=18001 15 | 16 | # PgAdmin settings 17 | PGADMIN_USER=pgadmin@example.com 18 | PGADMIN_PASSWORD=pgadmin-password 19 | 20 | 21 | # Application settings 22 | DEBUG=true 23 | 24 | # Service enable/disable flags 25 | AZURE_ENABLED=true 26 | OPENAI_ENABLED=true 27 | VOICEVOX_ENABLED=false 28 | SBV2_ENABLED=false 29 | NIJIVOICE_ENABLED=false 30 | 31 | # Azure TTS 32 | AZURE_API_KEY= 33 | AZURE_REGION= 34 | # AZURE_LANGUAGES=en-US,zh-CN,fr-FR 35 | 36 | # OpenAI TTS 37 | OPENAI_API_KEY= 38 | # OPENAI_LANGUAGES=ja-JP,es-ES 39 | 40 | # VOICEVOX 41 | VOICEVOX_URL=http://voicevox-host:50021 42 | # VOICEVOX_LANGUAGES=ja-JP 43 | 44 | # Style-Bert-VITS2 45 | SBV2_URL=http://sbv2-host:5000 46 | # SBV2_LANGUAGES=ja-JP 47 | 48 | # NIJIVOICE 49 | NIJIVOICE_API_KEY= 50 | # NIJIVOICE_LANGUAGES=ja-JP 51 | 52 | # Data storage path (for external disk mounting) 53 | # Examples: 54 | # DATA_PATH=./data # Local directory (default) 55 | # DATA_PATH=/mnt/external-disk/spgw # External disk 56 | DATA_PATH=./data 57 | -------------------------------------------------------------------------------- /tests/converter/test_mp3.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from typing import AsyncIterator 4 | from speech_gateway.converter import MP3Converter, FormatConverterError 5 | 6 | @pytest.fixture 7 | def mp3_converter(): 8 | # Create an instance of MP3Converter for testing 9 | return MP3Converter() 10 | 11 | @pytest.mark.asyncio 12 | async def test_mp3_conversion(mp3_converter): 13 | # Test the convert method using a real WAV file 14 | input_file = "tests/data/test.wav" 15 | 16 | async def input_stream() -> AsyncIterator[bytes]: 17 | with open(input_file, "rb") as f: 18 | while chunk := f.read(1024): 19 | yield chunk 20 | 21 | output = b"" 22 | try: 23 | async for chunk in mp3_converter.convert(input_stream()): 24 | output += chunk 25 | except FormatConverterError as e: 26 | pytest.fail(f"MP3 conversion failed with error: {e}") 27 | 28 | # Assert that the output is 
not empty (indicating conversion occurred) 29 | assert output != b"" 30 | 31 | @pytest.mark.asyncio 32 | async def test_mp3_conversion_error_handling(mp3_converter): 33 | # Test error handling in the convert method with invalid input 34 | 35 | async def input_stream() -> AsyncIterator[bytes]: 36 | yield b"Invalid input data" 37 | 38 | with pytest.raises(FormatConverterError): 39 | async for _ in mp3_converter.convert(input_stream()): 40 | pass 41 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | from contextlib import asynccontextmanager 2 | import logging 3 | from fastapi import FastAPI 4 | from speech_gateway.gateway.voicevox import VoicevoxGateway 5 | from speech_gateway.gateway.nijivoice import NijiVoiceGateway 6 | from speech_gateway.gateway.sbv2 import StyleBertVits2Gateway 7 | from speech_gateway.gateway.unified import UnifiedGateway 8 | 9 | # Configure root logger 10 | logger = logging.getLogger("speech_gateway") 11 | logger.setLevel(logging.INFO) 12 | log_format = logging.Formatter("[%(levelname)s] %(asctime)s : %(message)s") 13 | streamHandler = logging.StreamHandler() 14 | streamHandler.setFormatter(log_format) 15 | logger.addHandler(streamHandler) 16 | 17 | NIJIVOICE_API_KEY = "YOUR_API_KEY" 18 | 19 | # Create gateways 20 | aivisspeech_gateway = VoicevoxGateway(base_url="http://127.0.0.1:10101", debug=True) 21 | sbv2_gateway = StyleBertVits2Gateway(base_url="http://127.0.0.1:5000", debug=True) 22 | nijivoice_gateway = NijiVoiceGateway(api_key=NIJIVOICE_API_KEY, prefix="/nijivoice", debug=True) 23 | 24 | # Unified gateway ("default" is keyword-only in add_gateway) 25 | unified_gateway = UnifiedGateway(debug=True) 26 | unified_gateway.add_gateway("aivisspeech", aivisspeech_gateway, default=True) 27 | unified_gateway.add_gateway("sbv2", sbv2_gateway) 28 | unified_gateway.add_gateway("nijivoice", nijivoice_gateway) 29 | 30 | # Shut down gateway resources when the app stops 31 | @asynccontextmanager 32 | async def lifespan(app: FastAPI): 33 | yield 34 | await aivisspeech_gateway.shutdown() 35 | await sbv2_gateway.shutdown() 36 | await nijivoice_gateway.shutdown() 37 | 38 | # Create app with the lifespan handler registered 39 | app = FastAPI(lifespan=lifespan) 40 | 41 | # Add gateways to app 42 | app.include_router(aivisspeech_gateway.get_router(), prefix="/aivisspeech") 43 | app.include_router(sbv2_gateway.get_router(), prefix="/sbv2") 44 | app.include_router(nijivoice_gateway.get_router(), prefix="/nijivoice") 45 | app.include_router(unified_gateway.get_router()) 46 | --------------------------------------------------------------------------------
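A quick smoke test for the app above once it is running under `uvicorn run:app` (sketch; `"1"` is a placeholder speaker ID — use one that your default engine actually provides):

```python
# Call the unified /tts endpoint wired up in run.py (sketch).
import httpx

resp = httpx.post(
    "http://127.0.0.1:8000/tts",
    json={"text": "こんにちは。これはテストです。", "speaker": "1"},
)
resp.raise_for_status()

# x_audio_format defaults to wav, so the response body is WAV audio
with open("out.wav", "wb") as f:
    f.write(resp.content)
```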
/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import pytest 4 | import httpx 5 | 6 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 7 | 8 | 9 | def make_random_text(): 10 | random_key = "{:,}".format(random.randint(100000, 999999)) 11 | return f"これは音声合成のテストです。ランダムキーは、{random_key}です。" 12 | 13 | 14 | def is_wave(data: bytes) -> bool: 15 | if len(data) < 12: 16 | return False 17 | return data[:4] == b"RIFF" and data[8:12] == b"WAVE" 18 | 19 | 20 | def is_mp3(data: bytes) -> bool: 21 | if data[:3] == b"ID3": 22 | id3_size = 10 23 | if len(data) >= 10: 24 | tag_size = ( 25 | (data[6] << 21) 26 | | (data[7] << 14) 27 | | (data[8] << 7) 28 | | data[9] 29 | ) 30 | id3_size += tag_size 31 | data = data[id3_size:] 32 | 33 | if len(data) < 2: 34 | return False 35 | return data[:2] in [b"\xFF\xFB", b"\xFF\xF3", b"\xFF\xF2"] 36 | 37 | 38 | def transcribe(data: bytes, audio_format: str) -> str: 39 | headers = {"Authorization": f"Bearer {OPENAI_API_KEY}"} 40 | form_data = {"model": "whisper-1"} 41 | files = {"file": (f"voice.{audio_format}", data, f"audio/{audio_format}")} 42 | resp = httpx.post( 43 | "https://api.openai.com/v1/audio/transcriptions", 44 | headers=headers, 45 | data=form_data, 46 | files=files 47 | ) 48 | return resp.json().get("text") 49 | 50 | 51 | @pytest.fixture 52 | def random_text(): 53 | # Reuse the module-level helper instead of duplicating it 54 | return make_random_text() 55 | 56 | @pytest.fixture 57 | def wave_checker(): 58 | return is_wave 59 | 60 | @pytest.fixture 61 | def mp3_checker(): 62 | return is_mp3 63 | 64 | @pytest.fixture 65 | def audio_transcriber(): 66 | return transcribe 67 | -------------------------------------------------------------------------------- /speech_gateway/source/nijivoice_encoded.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from . import StreamSource 3 | from ..cache import CacheStorage 4 | from ..cache.file import FileCacheStorage 5 | from ..converter import FormatConverter 6 | from ..performance_recorder import PerformanceRecorder 7 | 8 | 9 | class NijiVoiceEncodedStreamSource(StreamSource): 10 | def __init__(self, 11 | *, 12 | api_key: str = None, 13 | base_url: str = "https://api.nijivoice.com", 14 | cache_storage: CacheStorage = None, 15 | format_converters: Dict[str, FormatConverter] = None, 16 | max_connections: int = 100, 17 | max_keepalive_connections: int = 20, 18 | timeout: float = 10.0, 19 | performance_recorder: PerformanceRecorder = None, 20 | debug: bool = False 21 | ): 22 | super().__init__( 23 | base_url=base_url, 24 | cache_storage=cache_storage or FileCacheStorage(cache_dir="nijivoice_encoded_cache"), 25 | format_converters=format_converters, 26 | max_connections=max_connections, 27 | max_keepalive_connections=max_keepalive_connections, 28 | timeout=timeout, 29 | performance_recorder=performance_recorder, 30 | debug=debug 31 | ) 32 | self.base_url = base_url 33 | self.api_key = api_key 34 | 35 | def get_cache_key(self, audio_format: str, voice_actor_id: str, request_json: dict, **kwargs) -> str: 36 | if not audio_format: 37 | audio_format = request_json.get("format", "mp3") 38 | return f"{voice_actor_id}_{hash(str(request_json))}.{audio_format}.json" 39 | 40 | def parse_text(self, request_json: dict, **kwargs) -> str: 41 | return request_json.get("script") 42 | 43 | def make_stream_request(self, voice_actor_id: str, request_json: dict, **kwargs): 44 | return { 45 | "method": "POST", 46 | "url": self.base_url + f"/api/platform/v1/voice-actors/{voice_actor_id}/generate-encoded-voice", 47 | "headers": {"x-api-key": self.api_key}, 48 | "json": request_json 49 | } 50 | -------------------------------------------------------------------------------- /speech_gateway/converter/mp3.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import AsyncIterator 3 | from . 
import FormatConverter, FormatConverterError 4 | 5 | 6 | class MP3Converter(FormatConverter): 7 | def __init__(self, ffmpeg_path: str = "ffmpeg", bitrate: str = "64k", output_chunksize: int = 1024): 8 | self.ffmpeg_path = ffmpeg_path 9 | self.bitrate = bitrate 10 | self.output_chunksize = output_chunksize 11 | 12 | async def convert(self, input_stream: AsyncIterator[bytes]) -> AsyncIterator[bytes]: 13 | try: 14 | ffmpeg_proc = await asyncio.create_subprocess_exec( 15 | self.ffmpeg_path, 16 | "-y", 17 | "-i", "-", # Read from stdin 18 | "-f", "mp3", 19 | "-b:a", self.bitrate, 20 | "-", # Write to stdout 21 | stdin=asyncio.subprocess.PIPE, 22 | stdout=asyncio.subprocess.PIPE, 23 | stderr=asyncio.subprocess.PIPE 24 | ) 25 | 26 | async def feed_ffmpeg_stdin(): 27 | try: 28 | async for chunk in input_stream: 29 | ffmpeg_proc.stdin.write(chunk) 30 | await ffmpeg_proc.stdin.drain() 31 | ffmpeg_proc.stdin.close() 32 | 33 | except Exception as ex: 34 | ffmpeg_proc.stdin.close() 35 | raise FormatConverterError(f"Error feeding data to ffmpeg: {str(ex)}") 36 | 37 | asyncio.create_task(feed_ffmpeg_stdin()) 38 | 39 | while True: 40 | chunk = await ffmpeg_proc.stdout.read(self.output_chunksize) 41 | if not chunk: 42 | break 43 | yield chunk 44 | 45 | await ffmpeg_proc.wait() 46 | 47 | if ffmpeg_proc.returncode != 0: 48 | stderr = await ffmpeg_proc.stderr.read() 49 | raise FormatConverterError(f"FFmpeg conversion error: {stderr.decode('utf-8')}") 50 | 51 | except Exception as ex: 52 | raise FormatConverterError(f"Error during MP3 conversion: {str(ex)}") 53 | -------------------------------------------------------------------------------- /speech_gateway/source/azure.py: -------------------------------------------------------------------------------- 1 | from . import StreamSource 2 | from typing import Dict 3 | from ..cache import CacheStorage 4 | from ..cache.file import FileCacheStorage 5 | from ..converter import FormatConverter 6 | from ..performance_recorder import PerformanceRecorder 7 | 8 | 9 | class AzureStreamSource(StreamSource): 10 | def __init__(self, 11 | *, 12 | api_key: str = None, 13 | region: str = None, 14 | base_url: str = "https://{region}.tts.speech.microsoft.com/cognitiveservices/v1", 15 | cache_storage: CacheStorage = None, 16 | format_converters: Dict[str, FormatConverter] = None, 17 | max_connections: int = 100, 18 | max_keepalive_connections: int = 20, 19 | timeout: float = 10.0, 20 | performance_recorder: PerformanceRecorder = None, 21 | debug: bool = False 22 | ): 23 | super().__init__( 24 | base_url=base_url, 25 | cache_storage=cache_storage or FileCacheStorage(cache_dir="azure_cache"), 26 | format_converters=format_converters, 27 | max_connections=max_connections, 28 | max_keepalive_connections=max_keepalive_connections, 29 | timeout=timeout, 30 | performance_recorder=performance_recorder, 31 | debug=debug 32 | ) 33 | self.api_key = api_key 34 | self.region = region 35 | 36 | def get_cache_key(self, audio_format: str, encoded_ssml: bytes, **kwargs) -> str: 37 | return f"{hash(encoded_ssml)}.{audio_format or 'wav'}" 38 | 39 | def parse_text(self, encoded_ssml: bytes, **kwargs) -> str: 40 | return encoded_ssml.decode("utf-8") 41 | 42 | def make_stream_request(self, encoded_ssml: bytes, azure_audio_format: str, **kwargs): 43 | return { 44 | "method": "POST", 45 | "url": self.base_url.format(region=self.region), 46 | "headers": { 47 | "X-Microsoft-OutputFormat": azure_audio_format, 48 | "Content-Type": "application/ssml+xml", 49 | "Ocp-Apim-Subscription-Key": self.api_key 50 | }, 
51 | "data": encoded_ssml 52 | } 53 | -------------------------------------------------------------------------------- /speech_gateway/source/openai_speech.py: -------------------------------------------------------------------------------- 1 | from . import StreamSource 2 | from typing import Dict 3 | from ..cache import CacheStorage 4 | from ..cache.file import FileCacheStorage 5 | from ..converter import FormatConverter 6 | from ..performance_recorder import PerformanceRecorder 7 | 8 | 9 | class OpenAIStreamSource(StreamSource): 10 | def __init__(self, 11 | *, 12 | api_key: str = None, 13 | base_url: str = "https://api.openai.com/v1", 14 | cache_storage: CacheStorage = None, 15 | format_converters: Dict[str, FormatConverter] = None, 16 | max_connections: int = 100, 17 | max_keepalive_connections: int = 20, 18 | timeout: float = 10.0, 19 | performance_recorder: PerformanceRecorder = None, 20 | debug: bool = False 21 | ): 22 | super().__init__( 23 | base_url=base_url, 24 | cache_storage=cache_storage or FileCacheStorage(cache_dir="openai_cache"), 25 | format_converters=format_converters, 26 | max_connections=max_connections, 27 | max_keepalive_connections=max_keepalive_connections, 28 | timeout=timeout, 29 | performance_recorder=performance_recorder, 30 | debug=debug 31 | ) 32 | self.base_url = base_url 33 | self.api_key = api_key 34 | 35 | def get_cache_key(self, audio_format: str, request_json: dict, **kwargs) -> str: 36 | if not audio_format: 37 | audio_format = request_json.get("response_format", "mp3") 38 | return f"{hash(str(request_json))}.{audio_format}" 39 | 40 | def parse_text(self, request_json: dict, **kwargs) -> str: 41 | return request_json.get("input") 42 | 43 | def make_stream_request(self, request_json: dict, **kwargs): 44 | if "azure" in self.base_url: 45 | url = self.base_url 46 | headers = {"api-key": self.api_key} 47 | else: 48 | url = f"{self.base_url}/audio/speech" 49 | headers = {"Authorization": f"Bearer {self.api_key}"} 50 | 51 | return { 52 | "method": "POST", 53 | "url": url, 54 | "headers": headers, 55 | "json": request_json 56 | } 57 | -------------------------------------------------------------------------------- /tests/source/test_sbv2_source.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from speech_gateway.source.sbv2 import StyleBertVits2StreamSource 4 | 5 | SBV2_URL = os.getenv("SBV2_URL") 6 | 7 | 8 | @pytest.fixture 9 | def source(): 10 | # Create an instance of StyleBertVits2StreamSource 11 | return StyleBertVits2StreamSource(base_url=SBV2_URL) 12 | 13 | @pytest.mark.asyncio 14 | async def test_get_cache_key(source): 15 | # Test get_cache_key method 16 | query_params = {"text": "こんにちは。これはテストです。", "voice": "test"} 17 | cache_key = source.get_cache_key("mp3", query_params) 18 | assert cache_key.endswith(".mp3") 19 | 20 | cache_key = source.get_cache_key("wav", query_params) 21 | assert cache_key.endswith(".wav") 22 | 23 | @pytest.mark.asyncio 24 | async def test_parse_text(source): 25 | # Test parse_text method 26 | query_params = {"text": "こんにちは。これはテストです。", "voice": "test"} 27 | text = source.parse_text(query_params) 28 | assert text == "こんにちは。これはテストです。" 29 | 30 | @pytest.mark.asyncio 31 | async def test_make_stream_request(source): 32 | # Test make_stream_request method 33 | query_params = {"text": "こんにちは。これはテストです。", "voice": "test"} 34 | request = source.make_stream_request(query_params) 35 | assert request["method"] == "GET" 36 | assert request["url"] == f"{SBV2_URL}/voice" 
37 | assert request["params"] == query_params 38 | 39 | @pytest.mark.asyncio 40 | async def test_fetch_stream_raw(source): 41 | # Test fetch_stream_raw with a real request (ensure server is running locally) 42 | query_params = {"text": "こんにちは。これはテストです。", "voice": "test"} 43 | http_request = source.make_stream_request(query_params) 44 | 45 | try: 46 | # Replace this part with a live test against the actual service 47 | async for chunk in source.fetch_stream_raw(http_request): 48 | assert isinstance(chunk, bytes) 49 | except Exception as e: 50 | pytest.fail(f"fetch_stream_raw failed: {e}") 51 | 52 | @pytest.mark.asyncio 53 | async def test_fetch_stream(source): 54 | # Test fetch_stream method with conversion and caching 55 | query_params = {"text": "こんにちは。", "voice": "test"} 56 | audio_format = "mp3" 57 | 58 | try: 59 | async for chunk in await source.fetch_stream(audio_format, query_params=query_params): 60 | assert isinstance(chunk, bytes) 61 | except Exception as e: 62 | pytest.fail(f"fetch_stream failed: {e}") 63 | -------------------------------------------------------------------------------- /docker/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | app: 3 | container_name: spgw-app 4 | build: 5 | context: . 6 | dockerfile: Dockerfile.app 7 | env_file: 8 | - .env 9 | environment: 10 | - DATABASE_URL=postgresql://${SPGW_DB_USER}:${SPGW_DB_PASSWORD}@db:5432/${SPGW_DB_NAME} 11 | ports: 12 | - "${PORT_SPGW}:8000" 13 | volumes: 14 | - spgw-app-cache:/app/cache 15 | depends_on: 16 | db: 17 | condition: service_healthy 18 | healthcheck: 19 | test: ["CMD", "curl", "-f", "http://localhost:8000/docs"] 20 | interval: 30s 21 | timeout: 10s 22 | retries: 3 23 | start_period: 40s 24 | restart: unless-stopped 25 | 26 | db: 27 | container_name: spgw-db 28 | image: postgres:16 29 | environment: 30 | - POSTGRES_USER=${POSTGRES_USER} 31 | - POSTGRES_PASSWORD=${POSTGRES_PASSWORD} 32 | - SPGW_DB_NAME=${SPGW_DB_NAME} 33 | - SPGW_DB_USER=${SPGW_DB_USER} 34 | - SPGW_DB_PASSWORD=${SPGW_DB_PASSWORD} 35 | ports: 36 | - "${PORT_DB}:5432" 37 | volumes: 38 | - spgw-postgres-data:/var/lib/postgresql/data 39 | - ./init-db.sh:/docker-entrypoint-initdb.d/init-db.sh:ro 40 | healthcheck: 41 | test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${SPGW_DB_NAME}"] 42 | interval: 10s 43 | timeout: 5s 44 | retries: 5 45 | start_period: 30s 46 | restart: unless-stopped 47 | 48 | pgadmin4: 49 | container_name: spgw-pgadmin4 50 | image: dpage/pgadmin4:8.14 51 | environment: 52 | PGADMIN_DEFAULT_EMAIL: ${PGADMIN_USER} 53 | PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_PASSWORD} 54 | PGADMIN_CONFIG_SERVER_MODE: "True" 55 | ports: 56 | - "${PORT_PGADMIN}:80" 57 | volumes: 58 | - spgw-pgadmin-data:/var/lib/pgadmin 59 | - ./pgadmin-servers.json:/pgadmin4/servers.json 60 | depends_on: 61 | db: 62 | condition: service_healthy 63 | restart: unless-stopped 64 | 65 | volumes: 66 | spgw-postgres-data: 67 | driver: local 68 | driver_opts: 69 | type: none 70 | o: bind 71 | device: ${DATA_PATH:-./data}/postgres 72 | spgw-pgadmin-data: 73 | driver: local 74 | driver_opts: 75 | type: none 76 | o: bind 77 | device: ${DATA_PATH:-./data}/pgadmin 78 | spgw-app-cache: 79 | driver: local 80 | driver_opts: 81 | type: none 82 | o: bind 83 | device: ${DATA_PATH:-./data}/cache 84 | -------------------------------------------------------------------------------- /speech_gateway/gateway/unified.py: -------------------------------------------------------------------------------- 1 | from typing 
import Dict, List 2 | from fastapi import HTTPException 3 | from fastapi import Request, APIRouter 4 | from . import SpeechGateway, UnifiedTTSRequest 5 | 6 | 7 | class UnifiedGateway(SpeechGateway): 8 | def __init__(self, *, default_gateway: SpeechGateway = None, default_language: str = "ja-JP", debug = False): 9 | super().__init__(stream_source=None, debug=debug) 10 | self.service_map: Dict[str, SpeechGateway] = {} 11 | self.language_map: Dict[str, SpeechGateway] = {} 12 | self.default_speakers: Dict[SpeechGateway, str] = {} 13 | self.default_gateway: SpeechGateway = default_gateway 14 | self.default_language = default_language 15 | 16 | def add_gateway(self, service_name: str, gateway: SpeechGateway, *, languages: List[str] = None, default_speaker: str = None, default: bool = False): 17 | self.service_map[service_name] = gateway 18 | if languages: 19 | for lang in languages: 20 | self.language_map[lang] = gateway 21 | if default: 22 | self.default_gateway = gateway 23 | self.language_map[self.default_language] = gateway 24 | self.default_speakers[gateway] = default_speaker 25 | 26 | def get_gateway(self, tts_request: UnifiedTTSRequest): 27 | if tts_request.service_name: 28 | return self.service_map.get(tts_request.service_name) 29 | elif tts_request.language: 30 | return self.language_map.get(tts_request.language) 31 | elif self.default_gateway: 32 | return self.default_gateway 33 | return None 34 | 35 | def get_router(self) -> APIRouter: 36 | router = APIRouter() 37 | self.register_endpoint(router) 38 | return router 39 | 40 | def register_endpoint(self, router: APIRouter): 41 | @router.post("/tts") 42 | async def post_tts(request: Request, tts_request: UnifiedTTSRequest, x_audio_format: str = "wav"): 43 | gateway = self.get_gateway(tts_request) 44 | 45 | if not gateway: 46 | raise HTTPException(status_code=404, detail="No gateway found.") 47 | 48 | if not tts_request.speaker: 49 | tts_request.speaker = self.default_speakers.get(gateway) 50 | 51 | return await gateway.unified_tts_handler(request, tts_request, x_audio_format) 52 | 53 | async def shutdown(self): 54 | pass 55 | --------------------------------------------------------------------------------
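The `/tts` endpoint above resolves a gateway by `service_name` first, then by `language`, then falls back to the default. A sketch of both request shapes (speaker IDs are placeholders; the language route assumes a gateway registered with `languages=["en-US"]`):

```python
# Two routing modes of UnifiedGateway's /tts endpoint (sketch).
import httpx

# 1) Explicit service selection (same shape as tests/gateway/test_sbv2.py)
httpx.post(
    "http://127.0.0.1:8000/tts",
    json={"text": "こんにちは。", "speaker": "0-0", "service_name": "sbv2"},
)

# 2) Language-based routing, assuming e.g.
#    unified_gateway.add_gateway("openai", openai_gateway, languages=["en-US"])
httpx.post(
    "http://127.0.0.1:8000/tts",
    json={"text": "Hello there.", "language": "en-US"},
)
```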
/speech_gateway/converter/wave.py: -------------------------------------------------------------------------------- 1 | import audioop 2 | import io 3 | import wave 4 | from typing import AsyncIterator 5 | from . import FormatConverter, FormatConverterError 6 | 7 | 8 | class WaveConverter(FormatConverter): 9 | def __init__(self, output_sample_rate: int = 16000, output_sample_width: int = 2): 10 | self.output_sample_rate = output_sample_rate 11 | self.output_sample_width = output_sample_width 12 | 13 | def convert_wave_bytes(self, input_bytes, output_sample_rate, output_sample_width): 14 | input_io = io.BytesIO(input_bytes) 15 | with wave.open(input_io, 'rb') as wf: 16 | input_sample_rate = wf.getframerate() 17 | input_sample_width = wf.getsampwidth() 18 | channels = wf.getnchannels() 19 | frames = wf.readframes(wf.getnframes()) 20 | 21 | # Convert sample rate 22 | if input_sample_rate != output_sample_rate: 23 | frames, _ = audioop.ratecv(frames, input_sample_width, channels, input_sample_rate, output_sample_rate, None) 24 | 25 | # Convert sample width 26 | if input_sample_width != output_sample_width: 27 | # 16 -> 8 28 | if input_sample_width == 2 and output_sample_width == 1: 29 | frames = audioop.lin2lin(frames, 2, 1) 30 | frames = audioop.bias(frames, 1, 128) 31 | # 8 -> 16 32 | elif input_sample_width == 1 and output_sample_width == 2: 33 | frames = audioop.bias(frames, 1, -128) 34 | frames = audioop.lin2lin(frames, 1, 2) 35 | else: 36 | frames = audioop.lin2lin(frames, input_sample_width, output_sample_width) 37 | 38 | output_io = io.BytesIO() 39 | with wave.open(output_io, "wb") as wf_out: 40 | wf_out.setframerate(output_sample_rate) 41 | wf_out.setsampwidth(output_sample_width) 42 | wf_out.setnchannels(channels) 43 | wf_out.writeframes(frames) 44 | 45 | return output_io.getvalue() 46 | 47 | async def convert(self, input_stream: AsyncIterator[bytes]) -> AsyncIterator[bytes]: 48 | try: 49 | wav_data = b"" 50 | async for chunk in input_stream: 51 | wav_data += chunk 52 | 53 | yield self.convert_wave_bytes(wav_data, self.output_sample_rate, self.output_sample_width) 54 | 55 | except Exception as ex: 56 | raise FormatConverterError(f"Error during WAV conversion: {str(ex)}") 57 | --------------------------------------------------------------------------------
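A usage sketch for the converter above, downsampling the bundled test file to 8 kHz / 8-bit. (Note that the stdlib `audioop` module it relies on was removed in Python 3.13; the Docker image pins Python 3.11.)

```python
# Downsample tests/data/test.wav to 8 kHz / 8-bit (usage sketch).
import asyncio
from speech_gateway.converter.wave import WaveConverter

async def main():
    converter = WaveConverter(output_sample_rate=8000, output_sample_width=1)

    async def input_stream():
        with open("tests/data/test.wav", "rb") as f:
            yield f.read()

    async for converted in converter.convert(input_stream()):
        with open("test_8k.wav", "wb") as f:
            f.write(converted)

asyncio.run(main())
```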
/speech_gateway/cache/file.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import AsyncIterator 3 | import aiofiles 4 | from . import CacheStorage, CacheStorageError 5 | 6 | 7 | class FileCacheStorage(CacheStorage): 8 | def __init__(self, cache_dir: str = "voice_cache"): 9 | self.cache_dir = Path(cache_dir) 10 | if not self.cache_dir.exists(): 11 | self.cache_dir.mkdir(parents=True) 12 | 13 | async def has_cache(self, cache_key: str) -> bool: 14 | file_path = self.cache_dir / cache_key 15 | if not file_path.exists(): 16 | return False 17 | 18 | if file_path.stat().st_size == 0: 19 | await self.delete_cache(cache_key) 20 | return False 21 | 22 | return True 23 | 24 | async def fetch_cache_stream(self, cache_key: str) -> AsyncIterator[bytes]: 25 | try: 26 | file_path = self.cache_dir / cache_key 27 | async with aiofiles.open(file_path, mode="rb") as file: 28 | while chunk := await file.read(1024): 29 | yield chunk 30 | 31 | except Exception as ex: 32 | raise CacheStorageError(f"Error reading file {file_path}: {str(ex)}") 33 | 34 | async def write_cache(self, input_stream: AsyncIterator[bytes], cache_key: str) -> AsyncIterator[bytes]: 35 | file_path = self.cache_dir / cache_key 36 | try: 37 | async with aiofiles.open(file_path, "wb") as file: 38 | async for chunk in input_stream: 39 | await file.write(chunk) 40 | await file.flush() 41 | yield chunk 42 | 43 | except Exception as ex: 44 | # Clean up partial file if it was created 45 | if file_path.exists(): 46 | try: 47 | file_path.unlink() 48 | except Exception: 49 | pass 50 | raise CacheStorageError(f"Error during file save operation: {str(ex)}") 51 | 52 | async def delete_cache(self, cache_key: str) -> None: 53 | file_path = self.cache_dir / cache_key 54 | try: 55 | if file_path.exists(): 56 | file_path.unlink() 57 | 58 | except Exception as ex: 59 | raise CacheStorageError(f"Error deleting cache file {file_path}: {str(ex)}") 60 | 61 | async def clear_all_cache(self) -> None: 62 | try: 63 | for file_path in self.cache_dir.iterdir(): 64 | if file_path.is_file(): 65 | file_path.unlink() 66 | 67 | except Exception as ex: 68 | raise CacheStorageError(f"Error clearing cache directory {self.cache_dir}: {str(ex)}") 69 | --------------------------------------------------------------------------------
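How the storage above fits the streaming path (sketch): `write_cache` tees chunks to disk while yielding them onward, so a later request with the same key can be served from `fetch_cache_stream` without touching the TTS engine.

```python
# Write-through caching with FileCacheStorage (sketch; fake audio bytes).
import asyncio
from speech_gateway.cache.file import FileCacheStorage

async def main():
    storage = FileCacheStorage(cache_dir="voice_cache")

    async def synthesized_stream():
        yield b"fake audio chunk 1"
        yield b"fake audio chunk 2"

    # Chunks are persisted and passed through at the same time
    async for chunk in storage.write_cache(synthesized_stream(), "demo.wav"):
        pass

    if await storage.has_cache("demo.wav"):
        data = b""
        async for chunk in storage.fetch_cache_stream("demo.wav"):
            data += chunk
        print(len(data))

asyncio.run(main())
```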
request["headers"]["Ocp-Apim-Subscription-Key"] == source.api_key 36 | assert request["data"] == b"dummy" 37 | 38 | @pytest.mark.asyncio 39 | async def test_fetch_stream_raw(source): 40 | # Test fetch_stream_raw with a real request (ensure server is running locally) 41 | ssml_text = f"こんにちは。これは音声合成のテストです。" 42 | http_request = source.make_stream_request(ssml_text.encode("utf-8"), "riff-16khz-16bit-mono-pcm") 43 | 44 | try: 45 | # Replace this part with a live test against the actual service 46 | async for chunk in source.fetch_stream_raw(http_request): 47 | assert isinstance(chunk, bytes) 48 | except Exception as e: 49 | pytest.fail(f"fetch_stream_raw failed: {e}") 50 | 51 | @pytest.mark.asyncio 52 | async def test_fetch_stream(source): 53 | # Test fetch_stream method with conversion and caching 54 | ssml_text = f"こんにちは。これは音声合成のテストです。" 55 | audio_format = "mp3" 56 | 57 | try: 58 | async for chunk in await source.fetch_stream(audio_format, azure_audio_format="audio-16khz-32kbitrate-mono-mp3", encoded_ssml=ssml_text.encode("utf-8")): 59 | assert isinstance(chunk, bytes) 60 | except Exception as e: 61 | pytest.fail(f"fetch_stream failed: {e}") 62 | -------------------------------------------------------------------------------- /tests/source/test_nijivoice_encoded_source.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from speech_gateway.source.nijivoice_encoded import NijiVoiceEncodedStreamSource 4 | 5 | BASE_URL = "https://api.nijivoice.com" 6 | GATEWAY_BASE_URL = "http://127.0.0.1:8000/nijivoice" 7 | NIJIVOICE_API_KEY = os.getenv("NIJIVOICE_API_KEY") 8 | VOICE_ACTOR_ID = "a192db5f-bd8b-4fc7-bc08-af5ca5957c12" 9 | PAYLOAD = { 10 | "script": "こんにちは。これはテストです。", 11 | "speed": "1.0", 12 | "emotionalLevel": "0.1", 13 | "soundDuration": "0.1", 14 | "format": "mp3", 15 | } 16 | 17 | 18 | @pytest.fixture 19 | def source(): 20 | # Create an instance of NijiVoiceEncodedStreamSource 21 | return NijiVoiceEncodedStreamSource(base_url=BASE_URL, api_key=NIJIVOICE_API_KEY, debug=True) 22 | 23 | @pytest.mark.asyncio 24 | async def test_get_cache_key(source): 25 | # Test get_cache_key method 26 | cache_key = source.get_cache_key("mp3", VOICE_ACTOR_ID, PAYLOAD) 27 | assert cache_key.endswith(".mp3.json") 28 | assert VOICE_ACTOR_ID in cache_key 29 | 30 | cache_key = source.get_cache_key("wav", VOICE_ACTOR_ID, PAYLOAD) 31 | assert cache_key.endswith(".wav.json") 32 | assert VOICE_ACTOR_ID in cache_key 33 | 34 | @pytest.mark.asyncio 35 | async def test_parse_text(source): 36 | # Test parse_text method 37 | text = source.parse_text(request_json=PAYLOAD) 38 | assert text == PAYLOAD["script"] 39 | 40 | @pytest.mark.asyncio 41 | async def test_make_stream_request(source): 42 | # Test make_stream_request method 43 | request = source.make_stream_request(VOICE_ACTOR_ID, PAYLOAD) 44 | assert request["method"] == "POST" 45 | assert request["url"] == f"{BASE_URL}/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-encoded-voice" 46 | assert request["headers"]["x-api-key"] == NIJIVOICE_API_KEY 47 | assert request["json"] == PAYLOAD 48 | 49 | @pytest.mark.asyncio 50 | async def test_fetch_stream_raw(source): 51 | # Test fetch_stream_raw with a real request (ensure server is running locally) 52 | http_request = source.make_stream_request(VOICE_ACTOR_ID, PAYLOAD) 53 | 54 | try: 55 | # Replace this part with a live test against the actual service 56 | async for chunk in source.fetch_stream_raw(http_request): 57 | assert isinstance(chunk, bytes) 58 | 
except Exception as e: 59 | pytest.fail(f"fetch_stream_raw failed: {e}") 60 | 61 | @pytest.mark.asyncio 62 | async def test_fetch_stream(source): 63 | # Test fetch_stream method with conversion and caching 64 | try: 65 | async for chunk in await source.fetch_stream( 66 | audio_format="mp3", 67 | voice_actor_id=VOICE_ACTOR_ID, 68 | request_json=PAYLOAD, 69 | ): 70 | assert isinstance(chunk, bytes) 71 | except Exception as e: 72 | pytest.fail(f"fetch_stream failed: {e}") 73 | -------------------------------------------------------------------------------- /speech_gateway/gateway/voicevox.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from fastapi import APIRouter, Request 3 | from fastapi.responses import StreamingResponse 4 | from . import SpeechGateway, UnifiedTTSRequest 5 | from ..cache.file import FileCacheStorage 6 | from ..converter.mp3 import MP3Converter 7 | from ..performance_recorder import SQLitePerformanceRecorder 8 | from ..source.voicevox import VoicevoxStreamSource 9 | 10 | 11 | class VoicevoxGateway(SpeechGateway): 12 | def __init__(self, *, stream_source: VoicevoxStreamSource = None, base_url: str = None, cache_dir: str = None, style_mapper: Dict[str, Dict[str, str]] = None, debug = False): 13 | self.stream_source: VoicevoxStreamSource = None 14 | if stream_source: 15 | super().__init__(stream_source=stream_source, debug=debug) 16 | else: 17 | super().__init__( 18 | stream_source=VoicevoxStreamSource( 19 | base_url=base_url or "http://127.0.0.1:50021", 20 | cache_storage=FileCacheStorage(cache_dir=cache_dir or "voicevox_cache"), 21 | format_converters={"mp3": MP3Converter(bitrate="64k")}, 22 | performance_recorder=SQLitePerformanceRecorder(), 23 | debug=debug 24 | ), 25 | debug=debug 26 | ) 27 | self.style_mapper = style_mapper or {} 28 | 29 | def register_endpoint(self, router: APIRouter): 30 | @router.post("/synthesis") 31 | async def synthesis_handler(speaker: str, request: Request, x_audio_format: str = "wav"): 32 | audio_format = "mp3" if x_audio_format == "mp3" else "wav" 33 | stream_resp = await self.stream_source.fetch_stream( 34 | audio_format=audio_format, 35 | speaker=speaker, 36 | audio_query=await request.json(), 37 | ) 38 | return StreamingResponse(stream_resp, media_type=f"audio/{audio_format}") 39 | 40 | async def unified_tts_handler(self, request: Request, tts_request: UnifiedTTSRequest, x_audio_format: str = "wav"): 41 | speaker = tts_request.speaker 42 | 43 | # Apply style 44 | if tts_request.style is not None and (styles_for_speaker := self.style_mapper.get(tts_request.speaker)): 45 | for k, v in styles_for_speaker.items(): 46 | if k.lower() == tts_request.style.lower(): 47 | speaker = v 48 | break 49 | 50 | audio_query = await self.stream_source.get_audio_query(speaker, tts_request.text) 51 | 52 | if tts_request.speed: 53 | audio_query["speedScale"] = tts_request.speed 54 | 55 | stream_resp = await self.stream_source.fetch_stream( 56 | audio_format=x_audio_format, 57 | speaker=speaker, 58 | audio_query=audio_query, 59 | ) 60 | return StreamingResponse(stream_resp, media_type=f"audio/{x_audio_format}") 61 | -------------------------------------------------------------------------------- /tests/gateway/test_sbv2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import httpx 4 | 5 | 6 | @pytest.mark.asyncio 7 | async def test_sbv2(random_text, wave_checker, audio_transcriber): 8 | query_params = { 9 | "text": 
random_text, 10 | "model_id": "0", 11 | "speaker_id": "0" 12 | } 13 | resp = httpx.get("http://127.0.0.1:8000/sbv2/voice", params=query_params) 14 | audio_data = resp.content 15 | assert wave_checker(audio_data) 16 | assert "音声合成" in audio_transcriber(audio_data, "wav") 17 | 18 | 19 | @pytest.mark.asyncio 20 | async def test_sbv2_wav(random_text, wave_checker, audio_transcriber): 21 | query_params = { 22 | "text": random_text, 23 | "model_id": "0", 24 | "speaker_id": "0", 25 | "x_audio_format": "wav" 26 | } 27 | resp = httpx.get("http://127.0.0.1:8000/sbv2/voice", params=query_params) 28 | audio_data = resp.content 29 | assert wave_checker(audio_data) 30 | assert "音声合成" in audio_transcriber(audio_data, "wav") 31 | 32 | 33 | @pytest.mark.asyncio 34 | async def test_sbv2_mp3(random_text, mp3_checker, audio_transcriber): 35 | query_params = { 36 | "text": random_text, 37 | "model_id": "0", 38 | "speaker_id": "0", 39 | "x_audio_format": "mp3" 40 | } 41 | resp = httpx.get("http://127.0.0.1:8000/sbv2/voice", params=query_params) 42 | audio_data = resp.content 43 | assert mp3_checker(audio_data) 44 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 45 | 46 | 47 | @pytest.mark.asyncio 48 | async def test_sbv2_unified(random_text, wave_checker, audio_transcriber): 49 | req = { 50 | "text": random_text, 51 | "speaker": "0-0", 52 | "service_name": "sbv2" 53 | } 54 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req) 55 | audio_data = resp.content 56 | assert wave_checker(audio_data) 57 | assert "音声合成" in audio_transcriber(audio_data, "wav") 58 | 59 | 60 | @pytest.mark.asyncio 61 | async def test_sbv2_unified_wav(random_text, wave_checker, audio_transcriber): 62 | req = { 63 | "text": random_text, 64 | "speaker": "0-0", 65 | "service_name": "sbv2" 66 | } 67 | query_params = { 68 | "x_audio_format": "wav" 69 | } 70 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 71 | audio_data = resp.content 72 | assert wave_checker(audio_data) 73 | assert "音声合成" in audio_transcriber(audio_data, "wav") 74 | 75 | 76 | @pytest.mark.asyncio 77 | async def test_sbv2_unified_mp3(random_text, mp3_checker, audio_transcriber): 78 | req = { 79 | "text": random_text, 80 | "speaker": "0-0", 81 | "service_name": "sbv2" 82 | } 83 | query_params = { 84 | "x_audio_format": "mp3" 85 | } 86 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 87 | audio_data = resp.content 88 | assert mp3_checker(audio_data) 89 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 90 | -------------------------------------------------------------------------------- /tests/cache/test_file.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from speech_gateway.cache import FileCacheStorage 3 | 4 | 5 | @pytest.fixture 6 | def temp_cache_dir(tmp_path): 7 | # Create a temporary cache directory for testing 8 | cache_dir = tmp_path / "test_cache" 9 | cache_dir.mkdir() 10 | return cache_dir 11 | 12 | 13 | @pytest.fixture 14 | def file_cache_storage(temp_cache_dir): 15 | # Create a FileCacheStorage instance using the temporary directory 16 | return FileCacheStorage(cache_dir=str(temp_cache_dir)) 17 | 18 | 19 | @pytest.mark.asyncio 20 | async def test_has_cache(file_cache_storage, temp_cache_dir): 21 | # Test has_cache method 22 | cache_key = "test_file" 23 | file_path = temp_cache_dir / cache_key 24 | 25 | # Case 1: File does not exist 26 | assert not await file_cache_storage.has_cache(cache_key) 27 | 28 | # Case 2: File exists and has 
content 29 | file_path.write_text("test content") 30 | assert await file_cache_storage.has_cache(cache_key) 31 | 32 | # Case 3: File exists but is empty 33 | file_path.write_text("") 34 | assert not await file_cache_storage.has_cache(cache_key) 35 | assert not file_path.exists() # Should be deleted 36 | 37 | 38 | @pytest.mark.asyncio 39 | async def test_fetch_cache_stream(file_cache_storage, temp_cache_dir): 40 | # Test fetch_cache_stream method 41 | cache_key = "test_file" 42 | file_path = temp_cache_dir / cache_key 43 | content = b"This is test content." 44 | file_path.write_bytes(content) 45 | 46 | result = b"" 47 | async for chunk in file_cache_storage.fetch_cache_stream(cache_key): 48 | result += chunk 49 | 50 | assert result == content 51 | 52 | 53 | @pytest.mark.asyncio 54 | async def test_write_cache(file_cache_storage, temp_cache_dir): 55 | # Test write_cache method 56 | cache_key = "test_file" 57 | file_path = temp_cache_dir / cache_key 58 | 59 | async def input_stream(): 60 | yield b"Part 1 " 61 | yield b"Part 2" 62 | 63 | result = b"" 64 | async for chunk in file_cache_storage.write_cache(input_stream(), cache_key): 65 | result += chunk 66 | 67 | assert file_path.exists() 68 | assert file_path.read_bytes() == b"Part 1 Part 2" 69 | assert result == b"Part 1 Part 2" 70 | 71 | 72 | @pytest.mark.asyncio 73 | async def test_delete_cache(file_cache_storage, temp_cache_dir): 74 | # Test delete_cache method 75 | cache_key = "test_file" 76 | file_path = temp_cache_dir / cache_key 77 | file_path.write_text("test content") 78 | 79 | await file_cache_storage.delete_cache(cache_key) 80 | assert not file_path.exists() 81 | 82 | 83 | @pytest.mark.asyncio 84 | async def test_clear_all_cache(file_cache_storage, temp_cache_dir): 85 | # Test clear_all_cache method 86 | (temp_cache_dir / "file1").write_text("content1") 87 | (temp_cache_dir / "file2").write_text("content2") 88 | 89 | await file_cache_storage.clear_all_cache() 90 | 91 | assert len(list(temp_cache_dir.iterdir())) == 0 92 | -------------------------------------------------------------------------------- /tests/converter/test_wave.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import wave 4 | import io 5 | from typing import AsyncIterator 6 | from speech_gateway.converter.wave import WaveConverter, FormatConverterError 7 | 8 | 9 | @pytest.fixture 10 | def wave_converter(): 11 | return WaveConverter() 12 | 13 | 14 | @pytest.fixture 15 | def wave_converter_custom(): 16 | return WaveConverter(output_sample_rate=8000, output_sample_width=1) 17 | 18 | 19 | @pytest.mark.asyncio 20 | async def test_wave_conversion(wave_converter): 21 | input_file = "tests/data/test.wav" 22 | 23 | async def input_stream() -> AsyncIterator[bytes]: 24 | with open(input_file, "rb") as f: 25 | while chunk := f.read(1024): 26 | yield chunk 27 | 28 | output = b"" 29 | try: 30 | async for chunk in wave_converter.convert(input_stream()): 31 | output += chunk 32 | except FormatConverterError as e: 33 | pytest.fail(f"Wave conversion failed with error: {e}") 34 | 35 | assert output != b"" 36 | 37 | with wave.open(io.BytesIO(output), 'rb') as wf: 38 | assert wf.getframerate() == 16000 39 | assert wf.getsampwidth() == 2 40 | 41 | 42 | @pytest.mark.asyncio 43 | async def test_wave_conversion_custom_params(wave_converter_custom): 44 | input_file = "tests/data/test.wav" 45 | 46 | async def input_stream() -> AsyncIterator[bytes]: 47 | with open(input_file, "rb") as f: 48 | while chunk := f.read(1024): 49 
| yield chunk 50 | 51 | output = b"" 52 | try: 53 | async for chunk in wave_converter_custom.convert(input_stream()): 54 | output += chunk 55 | except FormatConverterError as e: 56 | pytest.fail(f"Wave conversion failed with error: {e}") 57 | 58 | assert output != b"" 59 | 60 | with wave.open(io.BytesIO(output), 'rb') as wf: 61 | assert wf.getframerate() == 8000 62 | assert wf.getsampwidth() == 1 63 | 64 | 65 | @pytest.mark.asyncio 66 | async def test_wave_conversion_error_handling(wave_converter): 67 | async def input_stream() -> AsyncIterator[bytes]: 68 | yield b"Invalid wave data" 69 | 70 | with pytest.raises(FormatConverterError) as exc_info: 71 | async for _ in wave_converter.convert(input_stream()): 72 | pass 73 | 74 | assert "Error during WAV conversion" in str(exc_info.value) 75 | 76 | 77 | @pytest.mark.asyncio 78 | async def test_convert_wave_bytes(): 79 | converter = WaveConverter(output_sample_rate=8000, output_sample_width=1) 80 | 81 | input_io = io.BytesIO() 82 | with wave.open(input_io, 'wb') as wf: 83 | wf.setframerate(16000) 84 | wf.setsampwidth(2) 85 | wf.setnchannels(1) 86 | wf.writeframes(b'\x00\x00' * 1000) 87 | 88 | input_bytes = input_io.getvalue() 89 | output_bytes = converter.convert_wave_bytes(input_bytes, 8000, 1) 90 | 91 | assert output_bytes != b"" 92 | 93 | with wave.open(io.BytesIO(output_bytes), 'rb') as wf: 94 | assert wf.getframerate() == 8000 95 | assert wf.getsampwidth() == 1 96 | assert wf.getnchannels() == 1 97 | -------------------------------------------------------------------------------- /speech_gateway/performance_recorder/sqlite.py: -------------------------------------------------------------------------------- 1 | from dataclasses import fields 2 | from datetime import datetime, timezone 3 | import queue 4 | import sqlite3 5 | import threading 6 | from . 
import PerformanceRecorder, PerformanceRecord 7 | 8 | 9 | class SQLitePerformanceRecorder(PerformanceRecorder): 10 | def __init__(self, db_path="performance.db"): 11 | self.db_path = db_path 12 | self.record_queue = queue.Queue() 13 | self.stop_event = threading.Event() 14 | 15 | self.init_db() 16 | 17 | self.worker_thread = threading.Thread(target=self.start_worker, daemon=True) 18 | self.worker_thread.start() 19 | 20 | def init_db(self): 21 | conn = sqlite3.connect(self.db_path) 22 | try: 23 | with conn: 24 | conn.execute( 25 | """ 26 | CREATE TABLE IF NOT EXISTS performance_records ( 27 | id INTEGER PRIMARY KEY AUTOINCREMENT, 28 | process_id TEXT NOT NULL, 29 | created_at TEXT NOT NULL, 30 | source TEXT, 31 | text TEXT, 32 | audio_format TEXT, 33 | cached INTEGER, 34 | elapsed REAL 35 | ) 36 | """ 37 | ) 38 | finally: 39 | conn.close() 40 | 41 | def start_worker(self): 42 | conn = sqlite3.connect(self.db_path) 43 | try: 44 | while not self.stop_event.is_set() or not self.record_queue.empty(): 45 | try: 46 | record = self.record_queue.get(timeout=0.5) 47 | except queue.Empty: 48 | continue 49 | 50 | self.insert_record(conn, record) 51 | self.record_queue.task_done() 52 | finally: 53 | conn.close() 54 | 55 | def insert_record(self, conn: sqlite3.Connection, record: PerformanceRecord): 56 | columns = [field.name for field in fields(PerformanceRecord)] + ["created_at"] 57 | placeholders = ["?"] * len(columns) 58 | values = [getattr(record, field.name) for field in fields(PerformanceRecord)] + [datetime.now(timezone.utc).isoformat()] 59 | sql = f"INSERT INTO performance_records ({', '.join(columns)}) VALUES ({', '.join(placeholders)})" 60 | conn.execute(sql, values) 61 | conn.commit() 62 | 63 | def record( 64 | self, 65 | *, 66 | process_id: str, 67 | source: str = None, 68 | text: str = None, 69 | audio_format: str = None, 70 | cached: int = 0, 71 | elapsed: float = None, 72 | ): 73 | performance_record = PerformanceRecord( 74 | process_id=process_id, 75 | source=source, 76 | text=text, 77 | audio_format=audio_format, 78 | cached=cached, 79 | elapsed=elapsed 80 | ) 81 | 82 | self.record_queue.put(performance_record) 83 | 84 | def close(self): 85 | self.stop_event.set() 86 | self.record_queue.join() 87 | self.worker_thread.join() 88 | -------------------------------------------------------------------------------- /speech_gateway/gateway/openai_speech.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Request 2 | from fastapi.responses import StreamingResponse 3 | from . 
import SpeechGateway, UnifiedTTSRequest 4 | from ..cache.file import FileCacheStorage 5 | from ..performance_recorder import PerformanceRecorder, SQLitePerformanceRecorder 6 | from ..source.openai_speech import OpenAIStreamSource 7 | 8 | 9 | class OpenAIGateway(SpeechGateway): 10 | def __init__(self, *, stream_source: OpenAIStreamSource = None, api_key: str = None, model: str = "tts-1", speed: float = 1.0, instructions: str = None, base_url: str = None, cache_dir: str = None, performance_recorder: PerformanceRecorder = None, debug = False): 11 | self.stream_source: OpenAIStreamSource = None 12 | if stream_source: 13 | super().__init__(stream_source=stream_source, debug=debug) 14 | else: 15 | super().__init__( 16 | stream_source=OpenAIStreamSource( 17 | api_key=api_key, 18 | base_url=base_url or "https://api.openai.com/v1", 19 | cache_storage=FileCacheStorage(cache_dir=cache_dir or "openai_cache"), 20 | format_converters={}, 21 | performance_recorder=performance_recorder or SQLitePerformanceRecorder(), 22 | debug=debug 23 | ), 24 | debug=debug 25 | ) 26 | self.model = model 27 | self.speed = speed 28 | self.instructions = instructions 29 | 30 | def register_endpoint(self, router: APIRouter): 31 | @router.post("/audio/speech") 32 | async def synthesis_handler(request: Request, x_audio_format: str = None): 33 | request_json = await request.json() 34 | 35 | if x_audio_format: 36 | if x_audio_format in ["mp3", "opus", "aac", "flac", "wav", "pcm"]: 37 | request_json["response_format"] = x_audio_format 38 | else: 39 | # Set wave to convert to other format later 40 | request_json["response_format"] = "wav" 41 | else: 42 | x_audio_format = request_json.get("response_format", "mp3") 43 | 44 | stream_resp = await self.stream_source.fetch_stream( 45 | request_json=request_json, 46 | audio_format=x_audio_format 47 | ) 48 | return StreamingResponse(stream_resp, media_type=f"audio/{x_audio_format}") 49 | 50 | async def unified_tts_handler(self, request: Request, tts_request: UnifiedTTSRequest, x_audio_format: str = "wav"): 51 | request_json = { 52 | "model": self.model, 53 | "voice": tts_request.speaker, 54 | "input": tts_request.text, 55 | "speed": tts_request.speed or self.speed, 56 | "instructions": self.instructions, 57 | "response_format": x_audio_format 58 | } 59 | 60 | stream_resp = await self.stream_source.fetch_stream( 61 | audio_format=x_audio_format, 62 | request_json=request_json, 63 | ) 64 | return StreamingResponse(stream_resp, media_type=f"audio/{x_audio_format}") 65 | -------------------------------------------------------------------------------- /speech_gateway/converter/mulaw.py: -------------------------------------------------------------------------------- 1 | import audioop 2 | import io 3 | import struct 4 | from typing import AsyncIterator 5 | import wave 6 | from . 
import FormatConverter, FormatConverterError 7 | 8 | 9 | class MuLawConverter(FormatConverter): 10 | def __init__(self, rate: int = 8000, include_header: bool = False, to_linear16: callable = None): 11 | self.rate = rate 12 | self.include_header = include_header 13 | self.to_linear16 = to_linear16 14 | 15 | def create_au_header(self, data_size: int, sample_rate: int, channels: int) -> bytes: 16 | magic_number = b".snd" # Magic number 17 | header_size = 28 # Data offset: 24-byte standard .au header plus the 4-byte reserved field below 18 | encoding = 1 # Mu-Law encoding 19 | reserved = 0 # Reserved field (counted in the data offset), must be 0 20 | 21 | # Create header 22 | header = struct.pack( 23 | ">4sIIIIII", # Big-endian: 4-char string, 6 unsigned integers 24 | magic_number, # Magic number 25 | header_size, # Data offset 26 | data_size, # Data size 27 | encoding, # Encoding format 28 | sample_rate, # Sample rate 29 | channels, # Number of channels 30 | reserved # Reserved field 31 | ) 32 | return header 33 | 34 | async def convert(self, input_stream: AsyncIterator[bytes]) -> AsyncIterator[bytes]: 35 | try: 36 | # Load whole wave data 37 | wav_data = b"" 38 | async for chunk in input_stream: 39 | wav_data += chunk 40 | 41 | if self.to_linear16: 42 | wav_data = self.to_linear16(wav_data) 43 | 44 | # Parse wave info 45 | with wave.open(io.BytesIO(wav_data), "rb") as wf: 46 | nchannels = wf.getnchannels() 47 | sampwidth = wf.getsampwidth() 48 | framerate = wf.getframerate() 49 | nframes = wf.getnframes() 50 | raw_frames = wf.readframes(nframes) 51 | 52 | # Convert channel 53 | if nchannels > 1: 54 | mono_frames = audioop.tomono(raw_frames, sampwidth, 0.5, 0.5) 55 | else: 56 | mono_frames = raw_frames 57 | 58 | # Convert sample rate 59 | if framerate != self.rate: 60 | converted_frames, _ = audioop.ratecv( 61 | mono_frames, 62 | sampwidth, 63 | 1, 64 | framerate, 65 | self.rate, 66 | None 67 | ) 68 | else: 69 | converted_frames = mono_frames 70 | 71 | # Convert format 72 | mulaw_data = audioop.lin2ulaw(converted_frames, sampwidth) 73 | 74 | if self.include_header: 75 | # Create .au header 76 | header = self.create_au_header(len(mulaw_data), self.rate, 1) 77 | mulaw_data = header + mulaw_data 78 | 79 | # Return whole data at once 80 | yield mulaw_data 81 | 82 | except Exception as ex: 83 | raise FormatConverterError(f"Error during Mu-Law conversion: {str(ex)}") from ex 84 | -------------------------------------------------------------------------------- /speech_gateway/gateway/sbv2.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from fastapi import APIRouter, Request 3 | from fastapi.responses import StreamingResponse 4 | from . 
import SpeechGateway, UnifiedTTSRequest 5 | from ..cache.file import FileCacheStorage 6 | from ..converter.mp3 import MP3Converter 7 | from ..performance_recorder import PerformanceRecorder, SQLitePerformanceRecorder 8 | from ..source.sbv2 import StyleBertVits2StreamSource 9 | 10 | 11 | class StyleBertVits2Gateway(SpeechGateway): 12 | def __init__(self, *, stream_source: StyleBertVits2StreamSource = None, base_url: str = None, cache_dir: str = None, performance_recorder: PerformanceRecorder = None, style_mapper: Dict[str, Dict[str, str]] = None, debug = False): 13 | self.stream_source: StyleBertVits2StreamSource = None 14 | if stream_source: 15 | super().__init__(stream_source=stream_source, debug=debug) 16 | else: 17 | super().__init__( 18 | stream_source=StyleBertVits2StreamSource( 19 | base_url=base_url or "http://127.0.0.1:5000", 20 | cache_storage=FileCacheStorage(cache_dir=cache_dir or "sbv2_cache"), 21 | format_converters={"mp3": MP3Converter(bitrate="64k")}, 22 | performance_recorder=performance_recorder or SQLitePerformanceRecorder(), 23 | debug=debug 24 | ), 25 | debug=debug 26 | ) 27 | self.style_mapper = style_mapper or {} 28 | 29 | def register_endpoint(self, router: APIRouter): 30 | @router.get("/voice") 31 | async def get_voice_handler(request: Request): 32 | query_params = dict(request.query_params) 33 | filtered_params = { 34 | k: v for k, v in query_params.items() if v is not None and k not in {"x_audio_format"} 35 | } 36 | audio_format = query_params.get("x_audio_format", "wav") 37 | 38 | stream_resp = await self.stream_source.fetch_stream( 39 | audio_format=audio_format, 40 | query_params=filtered_params, 41 | ) 42 | return StreamingResponse(stream_resp, media_type=f"audio/{audio_format}") 43 | 44 | async def unified_tts_handler(self, request: Request, tts_request: UnifiedTTSRequest, x_audio_format: str = "wav"): 45 | # Basic params 46 | model_id, speaker_id = tts_request.speaker.split("-") 47 | query_params = { 48 | "text": tts_request.text, 49 | "model_id": model_id, 50 | "speaker_id": speaker_id 51 | } 52 | 53 | if tts_request.speed: 54 | query_params["length"] = 1 / tts_request.speed 55 | 56 | # Apply style 57 | if tts_request.style is not None and (styles_for_speaker := self.style_mapper.get(tts_request.speaker)): 58 | for k, v in styles_for_speaker.items(): 59 | if k.lower() == tts_request.style.lower(): 60 | query_params["style"] = v 61 | break 62 | 63 | # Additional params 64 | for k, v in dict(request.query_params).items(): 65 | if v is not None and k not in {"x_audio_format"}: 66 | query_params[k] = v 67 | 68 | stream_resp = await self.stream_source.fetch_stream( 69 | audio_format=x_audio_format, 70 | query_params=query_params, 71 | ) 72 | 73 | return StreamingResponse(stream_resp, media_type=f"audio/{x_audio_format}") 74 | -------------------------------------------------------------------------------- /speech_gateway/gateway/azure.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Request 2 | from fastapi.responses import StreamingResponse 3 | from . 
import SpeechGateway, UnifiedTTSRequest 4 | from ..cache.file import FileCacheStorage 5 | from ..performance_recorder import PerformanceRecorder, SQLitePerformanceRecorder 6 | from ..source.azure import AzureStreamSource 7 | 8 | 9 | class AzureGateway(SpeechGateway): 10 | def __init__(self, *, stream_source: AzureStreamSource = None, api_key: str = None, region: str = None, base_url: str = None, language: str = "ja-JP", cache_dir: str = None, performance_recorder: PerformanceRecorder = None, debug = False): 11 | self.stream_source: AzureStreamSource = None 12 | if stream_source: 13 | super().__init__(stream_source=stream_source, debug=debug) 14 | else: 15 | super().__init__( 16 | stream_source=AzureStreamSource( 17 | api_key=api_key, 18 | region=region, 19 | base_url=base_url or "https://{region}.tts.speech.microsoft.com/cognitiveservices/v1", 20 | cache_storage=FileCacheStorage(cache_dir=cache_dir or "azure_cache"), 21 | format_converters={}, 22 | performance_recorder=performance_recorder or SQLitePerformanceRecorder(), 23 | debug=debug 24 | ), 25 | debug=debug 26 | ) 27 | self.default_language = language 28 | 29 | def register_endpoint(self, router: APIRouter): 30 | @router.post("/cognitiveservices/v1") 31 | async def synthesis_handler(request: Request, x_audio_format: str = None): 32 | if x_audio_format == "wav": 33 | azure_audio_format = "riff-16khz-16bit-mono-pcm" 34 | elif x_audio_format == "mp3": 35 | azure_audio_format = "audio-16khz-32kbitrate-mono-mp3" 36 | else: 37 | azure_audio_format = request.headers["X-Microsoft-OutputFormat"] 38 | if "pcm" in azure_audio_format: 39 | x_audio_format = "wav" 40 | else: 41 | x_audio_format = "mp3" 42 | 43 | stream_resp = await self.stream_source.fetch_stream( 44 | encoded_ssml=await request.body(), 45 | azure_audio_format=azure_audio_format, 46 | audio_format=x_audio_format 47 | ) 48 | return StreamingResponse(stream_resp, media_type=f"audio/{x_audio_format}") 49 | 50 | async def unified_tts_handler(self, request: Request, tts_request: UnifiedTTSRequest, x_audio_format: str = "wav"): 51 | if x_audio_format == "mp3": 52 | azure_audio_format = "audio-16khz-32kbitrate-mono-mp3" 53 | else: # Fall back to 16 kHz PCM wav for any other requested format 54 | azure_audio_format = "riff-16khz-16bit-mono-pcm" 55 | 56 | if tts_request.speed: 57 | speed_percentage = (tts_request.speed - 1.0) * 100 58 | else: 59 | speed_percentage = 0 60 | ssml_text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{self.default_language}'><voice name='{tts_request.speaker}'><prosody rate='{speed_percentage:+.0f}%'>{tts_request.text}</prosody></voice></speak>" 61 | 62 | stream_resp = await self.stream_source.fetch_stream( 63 | encoded_ssml=ssml_text.encode("utf-8"), 64 | azure_audio_format=azure_audio_format, 65 | audio_format=x_audio_format 66 | ) 67 | return StreamingResponse(stream_resp, media_type=f"audio/{x_audio_format}") 68 | -------------------------------------------------------------------------------- /speech_gateway/gateway/nijivoice_encoded.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | from typing import Dict 5 | from fastapi import APIRouter, Request 6 | from fastapi.responses import StreamingResponse, Response 7 | from . 
import SpeechGateway, UnifiedTTSRequest 8 | from ..cache.file import FileCacheStorage 9 | from ..performance_recorder import PerformanceRecorder, SQLitePerformanceRecorder 10 | from ..source.nijivoice_encoded import NijiVoiceEncodedStreamSource 11 | 12 | 13 | class NijiVoiceEncodedGateway(SpeechGateway): 14 | def __init__(self, *, stream_source: NijiVoiceEncodedStreamSource = None, api_key: str = None, speeds: Dict[str, float] = None, base_url: str = None, cache_dir: str = None, performance_recorder: PerformanceRecorder = None, debug = False): 15 | self.stream_source: NijiVoiceEncodedStreamSource = None 16 | if stream_source: 17 | super().__init__(stream_source=stream_source, debug=debug) 18 | else: 19 | super().__init__( 20 | stream_source=NijiVoiceEncodedStreamSource( 21 | api_key=api_key, 22 | base_url=base_url or "https://api.nijivoice.com", 23 | cache_storage=FileCacheStorage(cache_dir=cache_dir or "nijivoice_encoded_cache"), 24 | format_converters={}, 25 | performance_recorder=performance_recorder or SQLitePerformanceRecorder(), 26 | debug=debug 27 | ), 28 | debug=debug 29 | ) 30 | self.speeds = speeds or {} 31 | 32 | def register_endpoint(self, router: APIRouter): 33 | @router.post("/api/platform/v1/voice-actors/{voice_actor_id}/generate-encoded-voice") 34 | async def get_voice_handler(voice_actor_id: str, request: Request, x_audio_format: str = None): 35 | request_json = await request.json() 36 | 37 | if x_audio_format: 38 | if x_audio_format in ["mp3", "wav"]: 39 | request_json["format"] = x_audio_format 40 | else: 41 | # Set wave to convert to other format later 42 | request_json["format"] = "wav" 43 | else: 44 | x_audio_format = request_json.get("format", "mp3") 45 | 46 | stream_resp = await self.stream_source.fetch_stream( 47 | voice_actor_id=voice_actor_id, 48 | audio_format=x_audio_format, 49 | request_json=request_json, 50 | ) 51 | 52 | json_bytes = b"" 53 | async for chunk in stream_resp: 54 | json_bytes += chunk 55 | 56 | return Response(content=json_bytes, media_type="application/json") 57 | 58 | async def unified_tts_handler(self, request: Request, tts_request: UnifiedTTSRequest, x_audio_format: str = "wav"): 59 | request_json = { 60 | "script": tts_request.text, 61 | "speed": str(tts_request.speed) if tts_request.speed else str(self.speeds.get(tts_request.speaker, "1.0")), 62 | "format": x_audio_format if x_audio_format == "mp3" else "wav" 63 | } 64 | 65 | stream_resp = await self.stream_source.fetch_stream( 66 | voice_actor_id=tts_request.speaker, 67 | audio_format=x_audio_format, 68 | request_json=request_json, 69 | ) 70 | 71 | json_bytes = b"" 72 | async for chunk in stream_resp: 73 | json_bytes += chunk 74 | response_json = json.loads(json_bytes) 75 | base64_audio = response_json["generatedVoice"]["base64Audio"] 76 | audio_bytes = base64.b64decode(base64_audio) 77 | 78 | return StreamingResponse(io.BytesIO(audio_bytes), media_type=f"audio/{x_audio_format}") 79 | -------------------------------------------------------------------------------- /tests/gateway/test_voicevox.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import httpx 4 | 5 | SPEAKER = 46 6 | 7 | 8 | @pytest.mark.asyncio 9 | async def test_voicevox(random_text, wave_checker, audio_transcriber): 10 | audio_query = httpx.post( 11 | "http://127.0.0.1:8000/voicevox/audio_query", 12 | params={"speaker": SPEAKER, "text": random_text} 13 | ).json() 14 | 15 | query_params = { 16 | "speaker": SPEAKER 17 | } 18 | resp = httpx.post( 19 | 
"http://127.0.0.1:8000/voicevox/synthesis", 20 | params=query_params, 21 | json=audio_query 22 | ) 23 | audio_data = resp.content 24 | assert wave_checker(audio_data) 25 | assert "音声合成" in audio_transcriber(audio_data, "wav") 26 | 27 | 28 | @pytest.mark.asyncio 29 | async def test_voicevox_wav(random_text, wave_checker, audio_transcriber): 30 | audio_query = httpx.post( 31 | "http://127.0.0.1:8000/voicevox/audio_query", 32 | params={"speaker": SPEAKER, "text": random_text} 33 | ).json() 34 | 35 | query_params = { 36 | "speaker": SPEAKER, 37 | "x_audio_format": "wav" 38 | } 39 | resp = httpx.post( 40 | "http://127.0.0.1:8000/voicevox/synthesis", 41 | params=query_params, 42 | json=audio_query 43 | ) 44 | audio_data = resp.content 45 | assert wave_checker(audio_data) 46 | assert "音声合成" in audio_transcriber(audio_data, "wav") 47 | 48 | 49 | @pytest.mark.asyncio 50 | async def test_voicevox_mp3(random_text, mp3_checker, audio_transcriber): 51 | audio_query = httpx.post( 52 | "http://127.0.0.1:8000/voicevox/audio_query", 53 | params={"speaker": SPEAKER, "text": random_text} 54 | ).json() 55 | 56 | query_params = { 57 | "speaker": SPEAKER, 58 | "x_audio_format": "mp3" 59 | } 60 | resp = httpx.post( 61 | "http://127.0.0.1:8000/voicevox/synthesis", 62 | params=query_params, 63 | json=audio_query 64 | ) 65 | audio_data = resp.content 66 | assert mp3_checker(audio_data) 67 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 68 | 69 | 70 | @pytest.mark.asyncio 71 | async def test_voicevox_unified(random_text, wave_checker, audio_transcriber): 72 | req = { 73 | "text": random_text, 74 | "speaker": str(SPEAKER), 75 | "service_name": "voicevox" 76 | } 77 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req) 78 | audio_data = resp.content 79 | assert wave_checker(audio_data) 80 | assert "音声合成" in audio_transcriber(audio_data, "wav") 81 | 82 | 83 | @pytest.mark.asyncio 84 | async def test_voicevox_unified_wav(random_text, wave_checker, audio_transcriber): 85 | req = { 86 | "text": random_text, 87 | "speaker": str(SPEAKER), 88 | "service_name": "voicevox" 89 | } 90 | query_params = { 91 | "x_audio_format": "wav" 92 | } 93 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 94 | audio_data = resp.content 95 | assert wave_checker(audio_data) 96 | assert "音声合成" in audio_transcriber(audio_data, "wav") 97 | 98 | 99 | @pytest.mark.asyncio 100 | async def test_voicevox_unified_mp3(random_text, mp3_checker, audio_transcriber): 101 | req = { 102 | "text": random_text, 103 | "speaker": str(SPEAKER), 104 | "service_name": "voicevox" 105 | } 106 | query_params = { 107 | "x_audio_format": "mp3" 108 | } 109 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 110 | audio_data = resp.content 111 | assert mp3_checker(audio_data) 112 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 113 | -------------------------------------------------------------------------------- /tests/gateway/test_unified.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import httpx 4 | from speech_gateway.gateway.voicevox import VoicevoxGateway 5 | from speech_gateway.gateway.nijivoice import NijiVoiceGateway 6 | from speech_gateway.gateway.sbv2 import StyleBertVits2Gateway 7 | from speech_gateway.gateway.openai_speech import OpenAIGateway 8 | from speech_gateway.gateway.unified import UnifiedGateway 9 | from speech_gateway.gateway import UnifiedTTSRequest 10 | 11 | VOICEVOX_URL = os.getenv("VOICEVOX_URL") 12 
| SBV2_URL = os.getenv("SBV2_URL") 13 | NIJIVOICE_API_KEY = os.getenv("NIJIVOICE_API_KEY") 14 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 15 | 16 | 17 | @pytest.mark.asyncio 18 | async def test_unified_gateway_default(): 19 | # Create gateways 20 | voicevox_gateway = VoicevoxGateway(base_url=VOICEVOX_URL, debug=True) 21 | sbv2_gateway = StyleBertVits2Gateway(base_url=SBV2_URL, debug=True) 22 | nijivoice_gateway = NijiVoiceGateway(api_key=NIJIVOICE_API_KEY, prefix="/nijivoice", debug=True) 23 | openai_gateway = OpenAIGateway(api_key=OPENAI_API_KEY, debug=True) 24 | 25 | # Unified gateway 26 | unified_gateway = UnifiedGateway(debug=True) 27 | unified_gateway.add_gateway("voicevox", voicevox_gateway, default_speaker="46", default=True) 28 | unified_gateway.add_gateway("sbv2", sbv2_gateway) 29 | unified_gateway.add_gateway("nijivoice", nijivoice_gateway) 30 | unified_gateway.add_gateway("openai", openai_gateway, languages=["en-US", "zh-CN"], default_speaker="alloy") 31 | 32 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello")) == voicevox_gateway 33 | 34 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", service_name="voicevox")) == voicevox_gateway 35 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", service_name="sbv2")) == sbv2_gateway 36 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", service_name="nijivoice")) == nijivoice_gateway 37 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", service_name="openai")) == openai_gateway 38 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", service_name="dummy")) is None 39 | 40 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", language="ja-JP")) == voicevox_gateway 41 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", language="en-US")) == openai_gateway 42 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", language="zh-CN")) == openai_gateway 43 | 44 | assert unified_gateway.get_gateway(UnifiedTTSRequest(text="hello", service_name="sbv2", language="en-US")) == sbv2_gateway 45 | 46 | 47 | 48 | @pytest.mark.asyncio 49 | async def test_voicevox_unified(random_text, wave_checker, audio_transcriber): 50 | req = { 51 | "text": random_text 52 | } 53 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req) 54 | audio_data = resp.content 55 | assert wave_checker(audio_data) 56 | assert "音声合成" in audio_transcriber(audio_data, "wav") 57 | 58 | 59 | @pytest.mark.asyncio 60 | async def test_voicevox_unified_wav(random_text, wave_checker, audio_transcriber): 61 | req = { 62 | "text": random_text 63 | } 64 | query_params = { 65 | "x_audio_format": "wav" 66 | } 67 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 68 | audio_data = resp.content 69 | assert wave_checker(audio_data) 70 | assert "音声合成" in audio_transcriber(audio_data, "wav") 71 | 72 | 73 | @pytest.mark.asyncio 74 | async def test_voicevox_unified_mp3(random_text, mp3_checker, audio_transcriber): 75 | req = { 76 | "text": random_text 77 | } 78 | query_params = { 79 | "x_audio_format": "mp3" 80 | } 81 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 82 | audio_data = resp.content 83 | assert mp3_checker(audio_data) 84 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 85 | -------------------------------------------------------------------------------- /tests/gateway/test_azure.py: -------------------------------------------------------------------------------- 1 | import 
pytest 2 | import httpx 3 | 4 | 5 | @pytest.mark.asyncio 6 | async def test_azure(random_text, wave_checker, audio_transcriber): 7 | ssml_text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='ja-JP'><voice name='zh-CN-XiaoyuMultilingualNeural'>{random_text}</voice></speak>" 8 | resp = httpx.post( 9 | url="http://127.0.0.1:8000/azure/cognitiveservices/v1", 10 | headers={ 11 | "X-Microsoft-OutputFormat": "riff-16khz-16bit-mono-pcm", 12 | "Content-Type": "application/ssml+xml" 13 | }, 14 | data=ssml_text.encode("utf-8") 15 | ) 16 | audio_data = resp.content 17 | assert wave_checker(audio_data) 18 | assert "音声合成" in audio_transcriber(audio_data, "wav") 19 | 20 | 21 | @pytest.mark.asyncio 22 | async def test_azure_wav(random_text, wave_checker, audio_transcriber): 23 | ssml_text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='ja-JP'><voice name='zh-CN-XiaoyuMultilingualNeural'>{random_text}</voice></speak>" 24 | resp = httpx.post( 25 | url="http://127.0.0.1:8000/azure/cognitiveservices/v1", 26 | headers={ 27 | "X-Microsoft-OutputFormat": "audio-16khz-32kbitrate-mono-mp3", # <- set mp3 to header 28 | "Content-Type": "application/ssml+xml" 29 | }, 30 | params={"x_audio_format": "wav"}, # <- overwrite format to wav 31 | data=ssml_text.encode("utf-8") 32 | ) 33 | audio_data = resp.content 34 | assert wave_checker(audio_data) 35 | assert "音声合成" in audio_transcriber(audio_data, "wav") 36 | 37 | 38 | @pytest.mark.asyncio 39 | async def test_azure_mp3(random_text, mp3_checker, audio_transcriber): 40 | ssml_text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='ja-JP'><voice name='zh-CN-XiaoyuMultilingualNeural'>{random_text}</voice></speak>" 41 | resp = httpx.post( 42 | url="http://127.0.0.1:8000/azure/cognitiveservices/v1", 43 | headers={ 44 | "X-Microsoft-OutputFormat": "riff-16khz-16bit-mono-pcm", # <- set wav to header 45 | "Content-Type": "application/ssml+xml" 46 | }, 47 | params={"x_audio_format": "mp3"}, # <- overwrite format to mp3 48 | data=ssml_text.encode("utf-8") 49 | ) 50 | audio_data = resp.content 51 | assert mp3_checker(audio_data) 52 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 53 | 54 | 55 | @pytest.mark.asyncio 56 | async def test_azure_unified(random_text, wave_checker, audio_transcriber): 57 | req = { 58 | "text": random_text, 59 | "speaker": "zh-CN-XiaoyuMultilingualNeural", 60 | "service_name": "azure" 61 | } 62 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req) 63 | audio_data = resp.content 64 | assert wave_checker(audio_data) 65 | assert "音声合成" in audio_transcriber(audio_data, "wav") 66 | 67 | 68 | @pytest.mark.asyncio 69 | async def test_azure_unified_wav(random_text, wave_checker, audio_transcriber): 70 | req = { 71 | "text": random_text, 72 | "speaker": "zh-CN-XiaoyuMultilingualNeural", 73 | "service_name": "azure" 74 | } 75 | query_params = { 76 | "x_audio_format": "wav" 77 | } 78 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 79 | audio_data = resp.content 80 | assert wave_checker(audio_data) 81 | assert "音声合成" in audio_transcriber(audio_data, "wav") 82 | 83 | 84 | @pytest.mark.asyncio 85 | async def test_azure_unified_mp3(random_text, mp3_checker, audio_transcriber): 86 | req = { 87 | "text": random_text, 88 | "speaker": "zh-CN-XiaoyuMultilingualNeural", 89 | "service_name": "azure" 90 | } 91 | query_params = { 92 | "x_audio_format": "mp3" 93 | } 94 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 95 | audio_data = resp.content 96 | assert mp3_checker(audio_data) 97 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 98 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 
9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | voicevox_cache/ 163 | sbv2_cache/ 164 | nijivoice_cache/ 165 | openai_cache/ 166 | example.py 167 | testrun.py 168 | client.py 169 | pytest.ini 170 | performance.db 171 | -------------------------------------------------------------------------------- /tests/performance_recorder/test_sqlite.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import sqlite3 3 | import threading 4 | from time import sleep 5 | from speech_gateway.performance_recorder.sqlite import SQLitePerformanceRecorder 6 | 7 | 8 | @pytest.fixture 9 | def sqlite_recorder(tmp_path): 10 | """ 11 | Create a new database file in a temporary directory for each test. 12 | After the test finishes, call close() to release resources. 13 | """ 14 | db_path = tmp_path / "test_performance.db" 15 | recorder = SQLitePerformanceRecorder(str(db_path)) 16 | yield recorder 17 | # Ensure that we close the recorder after the test to release all resources 18 | recorder.close() 19 | 20 | 21 | def test_single_thread_record_and_close(sqlite_recorder): 22 | """ 23 | Verify that the record -> close flow finishes without deadlocks 24 | in a single-thread scenario, and confirm the correct number of rows is inserted. 25 | Also, check that the 'id' field is auto-incrementing correctly. 26 | """ 27 | # Insert 5 records 28 | for i in range(5): 29 | sqlite_recorder.record( 30 | process_id=f"process_{i}", 31 | source="test_source", 32 | text=f"test_text_{i}", 33 | audio_format="wav", 34 | cached=0, 35 | elapsed=0.01 * i, 36 | ) 37 | 38 | # Although close() will be called by the fixture teardown, 39 | # here we explicitly call it for clarity. 
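# (close() sets the stop event, waits for the queue to drain, and joins the worker thread, so every queued row is flushed before the assertions below read the database.)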
40 | sqlite_recorder.close() 41 | 42 | # Directly open the database to check how many records were inserted 43 | conn = sqlite3.connect(sqlite_recorder.db_path) 44 | try: 45 | cursor = conn.cursor() 46 | cursor.execute("SELECT COUNT(*) FROM performance_records;") 47 | count = cursor.fetchone()[0] 48 | assert count == 5, f"Expected 5 records, got {count}" 49 | 50 | # Retrieve all IDs in ascending order 51 | cursor.execute("SELECT id FROM performance_records ORDER BY id;") 52 | ids = [row[0] for row in cursor.fetchall()] 53 | 54 | # Confirm we have 5 IDs 55 | assert len(ids) == 5, f"Expected 5 IDs, got {len(ids)}" 56 | 57 | # Check they are strictly increasing by 1 58 | for i in range(1, len(ids)): 59 | assert ids[i] == ids[i - 1] + 1, "IDs are not incrementing as expected" 60 | finally: 61 | conn.close() 62 | 63 | 64 | def test_multi_thread_record_no_deadlock(sqlite_recorder): 65 | """ 66 | Verify that concurrent calls to record() do not cause deadlocks 67 | and that data is correctly committed to the database. 68 | """ 69 | NUM_THREADS = 5 70 | RECORDS_PER_THREAD = 100 71 | 72 | def worker(thread_id: int): 73 | for i in range(RECORDS_PER_THREAD): 74 | sqlite_recorder.record( 75 | process_id=f"thread_{thread_id}_process_{i}", 76 | source="test_source", 77 | text=f"test_text_{i}", 78 | audio_format="wav", 79 | cached=1, 80 | elapsed=0.1 * i, 81 | ) 82 | # Sleep a bit to make concurrency testing more likely to expose issues 83 | sleep(0.001) 84 | 85 | threads = [] 86 | for t_id in range(NUM_THREADS): 87 | t = threading.Thread(target=worker, args=(t_id,)) 88 | t.start() 89 | threads.append(t) 90 | 91 | # Wait for all threads to complete 92 | for t in threads: 93 | t.join() 94 | 95 | # Close the recorder to ensure the queue is fully processed 96 | sqlite_recorder.close() 97 | 98 | # Check that all records were indeed written to the database 99 | total_expected = NUM_THREADS * RECORDS_PER_THREAD 100 | conn = sqlite3.connect(sqlite_recorder.db_path) 101 | try: 102 | cursor = conn.cursor() 103 | cursor.execute("SELECT COUNT(*) FROM performance_records;") 104 | count = cursor.fetchone()[0] 105 | assert count == total_expected, f"Expected {total_expected} records, got {count}" 106 | finally: 107 | conn.close() 108 | -------------------------------------------------------------------------------- /tests/performance_recorder/test_postgres.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import threading 4 | from time import sleep 5 | from speech_gateway.performance_recorder.postgres import PostgreSQLPerformanceRecorder 6 | 7 | POSTGRES_USER = os.getenv("POSTGRES_USER") 8 | POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD") 9 | POSTGRES_DBNAME = os.getenv("POSTGRES_DBNAME") 10 | 11 | 12 | @pytest.fixture 13 | def postgres_recorder(tmp_path): 14 | recorder = PostgreSQLPerformanceRecorder( 15 | dbname=POSTGRES_DBNAME, 16 | user=POSTGRES_USER, 17 | password=POSTGRES_PASSWORD 18 | ) 19 | yield recorder 20 | conn = recorder.connect_db() 21 | cursor = conn.cursor() 22 | cursor.execute("TRUNCATE TABLE performance_records;") 23 | conn.commit() 24 | conn.close() 25 | recorder.close() 26 | 27 | def test_single_thread_record_and_close(postgres_recorder): 28 | """ 29 | Verify that the record -> close flow finishes without deadlocks 30 | in a single-thread scenario, and confirm the correct number of rows is inserted. 31 | Also, check that the 'id' field is auto-incrementing correctly. 
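Note that record() only enqueues the row; the background worker thread performs the actual INSERT, so close() must run before the table is inspected.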
32 | """ 33 | # Insert 5 records 34 | for i in range(5): 35 | postgres_recorder.record( 36 | process_id=f"process_{i}", 37 | source="test_source", 38 | text=f"test_text_{i}", 39 | audio_format="wav", 40 | cached=0, 41 | elapsed=0.01 * i, 42 | ) 43 | 44 | # Although close() will be called by the fixture teardown, 45 | # here we explicitly call it for clarity. 46 | postgres_recorder.close() 47 | 48 | # Directly open the database to check how many records were inserted 49 | conn = postgres_recorder.connect_db() 50 | try: 51 | cursor = conn.cursor() 52 | cursor.execute("SELECT COUNT(*) FROM performance_records;") 53 | count = cursor.fetchone()[0] 54 | assert count == 5, f"Expected 5 records, got {count}" 55 | 56 | # Retrieve all IDs in ascending order 57 | cursor.execute("SELECT id FROM performance_records ORDER BY id;") 58 | ids = [row[0] for row in cursor.fetchall()] 59 | 60 | # Confirm we have 5 IDs 61 | assert len(ids) == 5, f"Expected 5 IDs, got {len(ids)}" 62 | 63 | # Check they are strictly increasing by 1 64 | for i in range(1, len(ids)): 65 | assert ids[i] == ids[i - 1] + 1, "IDs are not incrementing as expected" 66 | finally: 67 | conn.close() 68 | 69 | 70 | def test_multi_thread_record_no_deadlock(postgres_recorder): 71 | """ 72 | Verify that concurrent calls to record() do not cause deadlocks 73 | and that data is correctly committed to the database. 74 | """ 75 | NUM_THREADS = 5 76 | RECORDS_PER_THREAD = 100 77 | 78 | def worker(thread_id: int): 79 | for i in range(RECORDS_PER_THREAD): 80 | postgres_recorder.record( 81 | process_id=f"thread_{thread_id}_process_{i}", 82 | source="test_source", 83 | text=f"test_text_{i}", 84 | audio_format="wav", 85 | cached=1, 86 | elapsed=0.1 * i, 87 | ) 88 | # Sleep a bit to make concurrency testing more likely to expose issues 89 | sleep(0.001) 90 | 91 | threads = [] 92 | for t_id in range(NUM_THREADS): 93 | t = threading.Thread(target=worker, args=(t_id,)) 94 | t.start() 95 | threads.append(t) 96 | 97 | # Wait for all threads to complete 98 | for t in threads: 99 | t.join() 100 | 101 | # Close the recorder to ensure the queue is fully processed 102 | postgres_recorder.close() 103 | 104 | # Check that all records were indeed written to the database 105 | total_expected = NUM_THREADS * RECORDS_PER_THREAD 106 | conn = postgres_recorder.connect_db() 107 | try: 108 | cursor = conn.cursor() 109 | cursor.execute("SELECT COUNT(*) FROM performance_records;") 110 | count = cursor.fetchone()[0] 111 | assert count == total_expected, f"Expected {total_expected} records, got {count}" 112 | finally: 113 | conn.close() 114 | -------------------------------------------------------------------------------- /speech_gateway/gateway/nijivoice.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from fastapi import APIRouter, Request 3 | from fastapi.responses import StreamingResponse, JSONResponse 4 | from . 
import SpeechGateway, UnifiedTTSRequest 5 | from ..cache.file import FileCacheStorage 6 | from ..performance_recorder import PerformanceRecorder, SQLitePerformanceRecorder 7 | from ..source.nijivoice import NijiVoiceStreamSource 8 | 9 | 10 | class NijiVoiceGateway(SpeechGateway): 11 | def __init__(self, *, stream_source: NijiVoiceStreamSource = None, api_key: str = None, speeds: Dict[str, float] = None, base_url: str = None, prefix: str = None, cache_dir: str = None, performance_recorder: PerformanceRecorder = None, debug = False): 12 | self.stream_source: NijiVoiceStreamSource = None 13 | if stream_source: 14 | super().__init__(stream_source=stream_source, debug=debug) 15 | else: 16 | super().__init__( 17 | stream_source=NijiVoiceStreamSource( 18 | api_key=api_key, 19 | base_url=base_url or "https://api.nijivoice.com", 20 | cache_storage=FileCacheStorage(cache_dir=cache_dir or "nijivoice_cache"), 21 | format_converters={}, 22 | performance_recorder=performance_recorder or SQLitePerformanceRecorder(), 23 | debug=debug 24 | ), 25 | debug=debug 26 | ) 27 | self.speeds = speeds or {} 28 | self.prefix = prefix 29 | 30 | def register_endpoint(self, router: APIRouter): 31 | @router.post("/api/platform/v1/voice-actors/{voice_actor_id}/generate-voice") 32 | async def generate_voice_handler(voice_actor_id: str, request: Request, x_audio_format: str = None): 33 | request_json = await request.json() 34 | 35 | if x_audio_format: 36 | if x_audio_format in ["mp3", "wav"]: 37 | request_json["format"] = x_audio_format 38 | else: 39 | # Set wave to convert to other format later 40 | request_json["format"] = "wav" 41 | else: 42 | x_audio_format = request_json.get("format", "mp3") 43 | 44 | gateway_base_url = f"{request.base_url.scheme}://{request.base_url.netloc}{self.prefix}" 45 | resp_json = await self.stream_source.generate_voice( 46 | voice_actor_id, 47 | request_json, 48 | gateway_base_url, 49 | x_audio_format 50 | ) 51 | 52 | return JSONResponse(resp_json) 53 | 54 | @router.get("/api/platform/v1/voice-actors/{voice_actor_id}/get-voice") 55 | async def get_voice_handler(voice_actor_id: str, x_audio_format: str, url: str = None, download: str = None, cache_key: str = None): 56 | nijivoice_resp = await self.stream_source.fetch_stream( 57 | voice_actor_id=voice_actor_id, 58 | url=url, 59 | download=download, 60 | cache_key=cache_key, 61 | audio_format=x_audio_format 62 | ) 63 | return StreamingResponse(nijivoice_resp, media_type=f"audio/{x_audio_format}") 64 | 65 | async def unified_tts_handler(self, request: Request, tts_request: UnifiedTTSRequest, x_audio_format: str = "wav"): 66 | gateway_base_url = f"{request.base_url.scheme}://{request.base_url.netloc}{self.prefix}" 67 | 68 | payload = { 69 | "script": tts_request.text, 70 | "speed": str(tts_request.speed) if tts_request.speed else str(self.speeds.get(tts_request.speaker, "1.0")), 71 | "format": x_audio_format if x_audio_format == "mp3" else "wav" 72 | } 73 | 74 | resp_json = await self.stream_source.generate_voice(tts_request.speaker, payload, gateway_base_url, x_audio_format, overwrite_download_urls=False) 75 | 76 | nijivoice_resp = await self.stream_source.fetch_stream( 77 | voice_actor_id=tts_request.speaker, 78 | url=resp_json["generatedVoice"]["audioFileUrl"], 79 | download=False, 80 | cache_key=self.stream_source.get_cache_key(x_audio_format, tts_request.speaker, payload), 81 | audio_format=x_audio_format 82 | ) 83 | 84 | return StreamingResponse(nijivoice_resp, media_type=f"audio/{x_audio_format}") 85 | 
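86 | # Usage sketch (illustrative; it mirrors tests/gateway/test_unified.py, and the app variable, API key placeholder, and mount prefix are assumptions, not part of this module): 87 | # from fastapi import FastAPI 88 | # app = FastAPI() 89 | # gateway = NijiVoiceGateway(api_key="YOUR_NIJIVOICE_API_KEY", prefix="/nijivoice") 90 | # app.include_router(gateway.get_router(), prefix="/nijivoice") 91 | # The prefix argument must match the mount prefix so that the audioFileUrl values rewritten by generate_voice point back to this gateway's get-voice endpoint.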
-------------------------------------------------------------------------------- /speech_gateway/performance_recorder/postgres.py: -------------------------------------------------------------------------------- 1 | from dataclasses import fields 2 | from datetime import datetime, timezone 3 | import logging 4 | import queue 5 | import threading 6 | import time 7 | import psycopg2 8 | from . import PerformanceRecorder, PerformanceRecord 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class PostgreSQLPerformanceRecorder(PerformanceRecorder): 14 | def __init__( 15 | self, 16 | *, 17 | host: str = "localhost", 18 | port: int = 5432, 19 | dbname: str = "speech_gateway", 20 | user: str = "postgres", 21 | password: str = None, 22 | ): 23 | self.connection_params = { 24 | "host": host, 25 | "port": port, 26 | "dbname": dbname, 27 | "user": user, 28 | "password": password, 29 | } 30 | self.record_queue = queue.Queue() 31 | self.stop_event = threading.Event() 32 | 33 | self.init_db() 34 | 35 | self.worker_thread = threading.Thread(target=self.start_worker, daemon=True) 36 | self.worker_thread.start() 37 | 38 | def connect_db(self): 39 | return psycopg2.connect(**self.connection_params) 40 | 41 | def init_db(self): 42 | conn = self.connect_db() 43 | try: 44 | with conn: 45 | with conn.cursor() as cur: 46 | cur.execute( 47 | """ 48 | CREATE TABLE IF NOT EXISTS performance_records ( 49 | id SERIAL PRIMARY KEY, 50 | process_id TEXT NOT NULL, 51 | created_at TIMESTAMPTZ NOT NULL, 52 | source TEXT, 53 | text TEXT, 54 | audio_format TEXT, 55 | cached INTEGER, 56 | elapsed REAL 57 | ) 58 | """ 59 | ) 60 | finally: 61 | conn.close() 62 | 63 | def start_worker(self): 64 | conn = self.connect_db() 65 | try: 66 | while not self.stop_event.is_set() or not self.record_queue.empty(): 67 | try: 68 | record = self.record_queue.get(timeout=0.5) 69 | except queue.Empty: 70 | continue 71 | 72 | try: 73 | self.insert_record(conn, record) 74 | except (psycopg2.InterfaceError, psycopg2.OperationalError): 75 | try: 76 | conn.close() 77 | except Exception: 78 | pass 79 | 80 | logger.warning("Connection is not available. 
Retrying insert_record with new connection...") 81 | time.sleep(0.5) 82 | conn = self.connect_db() 83 | self.insert_record(conn, record) 84 | 85 | self.record_queue.task_done() 86 | finally: 87 | try: 88 | conn.close() 89 | except Exception: 90 | pass 91 | 92 | 93 | def insert_record(self, conn, record: PerformanceRecord): 94 | columns = [field.name for field in fields(PerformanceRecord)] + ["created_at"] 95 | placeholders = ["%s"] * len(columns) 96 | values = [getattr(record, field.name) for field in fields(PerformanceRecord)] + [ 97 | datetime.now(timezone.utc) 98 | ] 99 | 100 | with conn.cursor() as cur: 101 | cur.execute( 102 | f"INSERT INTO performance_records ({', '.join(columns)}) VALUES ({', '.join(placeholders)})", 103 | values, 104 | ) 105 | conn.commit() 106 | 107 | def record( 108 | self, 109 | *, 110 | process_id: str, 111 | source: str = None, 112 | text: str = None, 113 | audio_format: str = None, 114 | cached: int = 0, 115 | elapsed: float = None, 116 | ): 117 | performance_record = PerformanceRecord( 118 | process_id=process_id, 119 | source=source, 120 | text=text, 121 | audio_format=audio_format, 122 | cached=cached, 123 | elapsed=elapsed, 124 | ) 125 | self.record_queue.put(performance_record) 126 | 127 | def close(self): 128 | self.stop_event.set() 129 | self.record_queue.join() 130 | self.worker_thread.join() 131 | -------------------------------------------------------------------------------- /speech_gateway/source/nijivoice.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | from typing import AsyncIterator, Dict 3 | import urllib.parse 4 | import httpx 5 | from . import StreamSource, StreamSourceError 6 | from ..cache import CacheStorage 7 | from ..cache.file import FileCacheStorage 8 | from ..converter import FormatConverter 9 | from ..performance_recorder import PerformanceRecorder 10 | 11 | 12 | class NijiVoiceStreamSource(StreamSource): 13 | def __init__(self, 14 | *, 15 | api_key: str = None, 16 | base_url: str = "https://api.nijivoice.com", 17 | cache_storage: CacheStorage = None, 18 | format_converters: Dict[str, FormatConverter] = None, 19 | max_connections: int = 100, 20 | max_keepalive_connections: int = 20, 21 | timeout: float = 10.0, 22 | performance_recorder: PerformanceRecorder = None, 23 | debug: bool = False 24 | ): 25 | super().__init__( 26 | base_url=base_url, 27 | cache_storage=cache_storage or FileCacheStorage(cache_dir="nijivoice_cache"), 28 | format_converters=format_converters, 29 | max_connections=max_connections, 30 | max_keepalive_connections=max_keepalive_connections, 31 | timeout=timeout, 32 | performance_recorder=performance_recorder, 33 | debug=debug 34 | ) 35 | self.base_url = base_url 36 | self.api_key = api_key 37 | 38 | def get_cache_key(self, audio_format: str, voice_actor_id: str = None, payload: dict = None, cache_key: str = None, **kwargs) -> str: 39 | if cache_key: 40 | return cache_key 41 | 42 | return f"{voice_actor_id}_{hash(str(payload))}.{audio_format or 'mp3'}" 43 | 44 | def parse_text(self, **kwargs) -> str: 45 | return None 46 | 47 | def make_stream_request(self, url: str, **kwargs): 48 | return { 49 | "method": "GET", 50 | "url": url, 51 | } 52 | 53 | async def generate_voice(self, voice_actor_id: str, payload: dict, gateway_base_url: str, x_audio_format: str = "mp3", overwrite_download_urls: bool = True): 54 | start_time = time() 55 | cache_key = self.get_cache_key(x_audio_format, voice_actor_id, payload) 56 | use_cache = self.cache_storage and await 
self.cache_storage.has_cache(cache_key) 57 | 58 | # Return cache info if cached 59 | if use_cache: 60 | gateway_voice_url = f"{gateway_base_url}/api/platform/v1/voice-actors/{voice_actor_id}/get-voice?cache_key={cache_key}&x_audio_format={x_audio_format}" 61 | data = {"generatedVoice": { 62 | "audioFileUrl": gateway_voice_url, 63 | "audioFileDownloadUrl": gateway_voice_url + "&download=true" 64 | }} 65 | 66 | else: 67 | try: 68 | # Generate voice 69 | url = f"{self.base_url}/api/platform/v1/voice-actors/{voice_actor_id}/generate-voice" 70 | headers = { 71 | "x-api-key": self.api_key, 72 | "content-type": "application/json" 73 | } 74 | url_resp = await self.http_client.post(url, headers=headers, json=payload) 75 | if url_resp.status_code != 200: 76 | raise StreamSourceError(f"NijiVoice generate voice failed: {url_resp.status_code}") 77 | 78 | # Get voice URL 79 | data = url_resp.json() 80 | audio_file_url = data.get("generatedVoice", {}).get("audioFileUrl") 81 | encoded_audio_file_url = urllib.parse.quote(audio_file_url, safe='') 82 | 83 | # Overwrite URLs 84 | if overwrite_download_urls: 85 | gateway_voice_url = ( 86 | f"{gateway_base_url}/api/platform/v1/voice-actors/{voice_actor_id}/get-voice" 87 | f"?url={encoded_audio_file_url}&cache_key={cache_key}&x_audio_format={x_audio_format}" 88 | ) 89 | data["generatedVoice"]["audioFileUrl"] = gateway_voice_url 90 | data["generatedVoice"]["audioFileDownloadUrl"] = gateway_voice_url + "&download=true" 91 | 92 | except httpx.RequestError as ex: 93 | raise StreamSourceError(f"HTTP request failed: {ex}") from ex 94 | 95 | # Performance record 96 | if self.performance_recorder: 97 | self.performance_recorder.record( 98 | process_id=cache_key, source=self.__class__.__name__, text=payload.get("script"), 99 | audio_format=x_audio_format, cached=use_cache, elapsed=time() - start_time 100 | ) 101 | 102 | return data 103 | -------------------------------------------------------------------------------- /tests/source/test_nijivoice_source.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import httpx 4 | from speech_gateway.source.nijivoice import NijiVoiceStreamSource 5 | from speech_gateway.source import StreamSourceError 6 | 7 | BASE_URL = "https://api.nijivoice.com" 8 | GATEWAY_BASE_URL = "http://127.0.0.1:8000/nijivoice" 9 | NIJIVOICE_API_KEY = os.getenv("NIJIVOICE_API_KEY") 10 | VOICE_ACTOR_ID = "a192db5f-bd8b-4fc7-bc08-af5ca5957c12" 11 | PAYLOAD = { 12 | "script": "こんにちは。これはテストです。", 13 | "speed": "1.0", 14 | "emotionalLevel": "0.1", 15 | "soundDuration": "0.1", 16 | "format": "mp3", 17 | } 18 | 19 | @pytest.fixture 20 | def source(): 21 | # Create an instance of NijiVoiceStreamSource 22 | return NijiVoiceStreamSource(base_url=BASE_URL, api_key=NIJIVOICE_API_KEY, debug=True) 23 | 24 | @pytest.mark.asyncio 25 | async def test_get_cache_key(source): 26 | # Test get_cache_key method 27 | cache_key = source.get_cache_key("mp3", VOICE_ACTOR_ID, PAYLOAD) 28 | assert cache_key.endswith(".mp3") 29 | assert VOICE_ACTOR_ID in cache_key 30 | 31 | cache_key = source.get_cache_key("wav", VOICE_ACTOR_ID, PAYLOAD) 32 | assert cache_key.endswith(".wav") 33 | assert VOICE_ACTOR_ID in cache_key 34 | 35 | @pytest.mark.asyncio 36 | async def test_parse_text(source): 37 | # Test parse_text method 38 | text = source.parse_text(payload=PAYLOAD) 39 | assert text is None # Since parse_text returns None in the current implementation 40 | 41 | @pytest.mark.asyncio 42 | async def 
test_make_stream_request(source): 43 | # Test make_stream_request method 44 | url = f"{BASE_URL}/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice" 45 | request = source.make_stream_request(url=url) 46 | assert request["method"] == "GET" 47 | assert request["url"] == url 48 | 49 | @pytest.mark.asyncio 50 | async def test_generate_voice_cached(source): 51 | # Test generate_voice method with cache 52 | cache_key = source.get_cache_key("mp3", VOICE_ACTOR_ID, PAYLOAD) 53 | 54 | # Create a dummy async generator for cached data 55 | async def dummy_cache_data(): 56 | yield b"cached data" 57 | 58 | # Write a dummy cache 59 | async for _ in source.cache_storage.write_cache(dummy_cache_data(), cache_key): 60 | pass # Consume the generator to simulate writing cache 61 | 62 | # Call generate_voice and verify it uses cache 63 | response = await source.generate_voice(VOICE_ACTOR_ID, PAYLOAD, GATEWAY_BASE_URL) 64 | assert "generatedVoice" in response 65 | assert response["generatedVoice"]["audioFileUrl"].startswith(GATEWAY_BASE_URL) 66 | 67 | @pytest.mark.asyncio 68 | async def test_generate_voice_fresh(source): 69 | # Test generate_voice method without cache (actual API call) 70 | try: 71 | response = await source.generate_voice(VOICE_ACTOR_ID, PAYLOAD, GATEWAY_BASE_URL) 72 | assert "generatedVoice" in response 73 | assert response["generatedVoice"]["audioFileUrl"].startswith(GATEWAY_BASE_URL) 74 | except Exception as e: 75 | pytest.fail(f"generate_voice failed: {e}") 76 | 77 | @pytest.mark.asyncio 78 | async def test_generate_voice_error(source): 79 | # Test generate_voice method with invalid payload 80 | invalid_payload = PAYLOAD.copy() 81 | invalid_payload["script"] = "" # Invalid script 82 | 83 | with pytest.raises(StreamSourceError): 84 | await source.generate_voice(VOICE_ACTOR_ID, invalid_payload, GATEWAY_BASE_URL) 85 | 86 | @pytest.mark.asyncio 87 | async def test_fetch_stream_raw(source): 88 | # Test fetch_stream_raw method (actual API call) 89 | url_resp = httpx.post( 90 | f"{GATEWAY_BASE_URL}/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice", 91 | json={"script": "こんにちは。これはテストです。", "speed": "1.0"} 92 | ) 93 | 94 | assert url_resp.status_code == 200 95 | url = url_resp.json()["generatedVoice"]["audioFileUrl"] 96 | assert GATEWAY_BASE_URL in url 97 | 98 | http_request = { 99 | "method": "GET", 100 | "url": url, 101 | } 102 | 103 | try: 104 | async for chunk in source.fetch_stream_raw(http_request): 105 | assert isinstance(chunk, bytes) 106 | except Exception as e: 107 | pytest.fail(f"fetch_stream_raw failed: {e}") 108 | 109 | @pytest.mark.asyncio 110 | async def test_fetch_stream(source): 111 | # Test fetch_stream method with a full pipeline 112 | try: 113 | async for chunk in await source.fetch_stream( 114 | audio_format="mp3", 115 | voice_actor_id=VOICE_ACTOR_ID, 116 | payload=PAYLOAD, 117 | gateway_base_url=GATEWAY_BASE_URL, 118 | ): 119 | assert isinstance(chunk, bytes) 120 | except Exception as e: 121 | pytest.fail(f"fetch_stream failed: {e}") 122 | -------------------------------------------------------------------------------- /speech_gateway/gateway/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import logging 3 | from fastapi import Request, APIRouter, HTTPException 4 | from fastapi.responses import Response 5 | from pydantic import BaseModel, Field 6 | import httpx 7 | from ..source import StreamSource 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class 
UnifiedTTSRequest(BaseModel): 13 | text: str = Field(..., description="The text to be synthesized into speech.", example="hello") 14 | speaker: str = Field( 15 | None, 16 | description="The unique identifier for the voice in each speech service. " 17 | "For Style-Bert-VITS2, specify as `{model_id}-{speaker_id}`. " 18 | "If omitted, the default speaker of the speech service will be used.", 19 | example="888753761" 20 | ) 21 | style: str = Field( 22 | None, 23 | description="A predefined set of voice styles that includes `neutral`, `joy`, `angry`, `sorrow`, `fun`, and `surprised`. " 24 | "These styles act as presets and must be mapped appropriately to the corresponding style identifiers in each speech service. " 25 | "If omitted, no style will be applied.", 26 | example="neutral" 27 | ) 28 | speed: float = Field( 29 | None, 30 | description="The speed of synthesized speech, where 1.0 is normal speed. " 31 | "Values greater than 1.0 increase the speed (e.g., 1.5 is 50% faster), " 32 | "and values less than 1.0 decrease the speed (e.g., 0.5 is 50% slower). " 33 | "The acceptable range depends on each speech service.", 34 | example=1.0 35 | ) 36 | service_name: str = Field( 37 | None, 38 | description="The name of the service as specified in `add_gateway`. " 39 | "If omitted, the default gateway will be used.", 40 | example="aivisspeech", 41 | ) 42 | language: str = Field( 43 | None, 44 | description="The language of the text. The corresponding text-to-speech service will be used. " 45 | "Specify the language code in ISO 639-1 format combined with the country code using a hyphen. " 46 | "If omitted, the default gateway will be used.", 47 | example="en-US", 48 | ) 49 | 50 | 51 | class SpeechGateway(ABC): 52 | HOP_BY_HOP_HEADERS = { 53 | "connection", 54 | "keep-alive", 55 | "proxy-authenticate", 56 | "proxy-authorization", 57 | "te", 58 | "trailers", 59 | "transfer-encoding", 60 | "upgrade", 61 | } 62 | 63 | def __init__( 64 | self, 65 | *, 66 | stream_source: StreamSource = None, 67 | debug: bool = False 68 | ): 69 | self.stream_source = stream_source 70 | self.debug = debug 71 | 72 | def filter_headers(self, headers: httpx.Headers) -> dict: 73 | filtered = {} 74 | for k, v in headers.items(): 75 | if k.lower() not in self.HOP_BY_HOP_HEADERS: 76 | filtered[k] = v 77 | return filtered 78 | 79 | @abstractmethod 80 | def register_endpoint(self, router: APIRouter): 81 | pass 82 | 83 | async def passthrough_handler(self, request: Request, path: str): 84 | url = f"{self.stream_source.base_url}/{path}" 85 | if request.query_params: 86 | url += f"?{request.query_params}" 87 | 88 | headers = dict(request.headers) 89 | headers.pop("host", None) 90 | body = await request.body() 91 | 92 | r = await self.stream_source.http_client.request( 93 | request.method, 94 | url, 95 | headers=headers, 96 | content=body 97 | ) 98 | 99 | resp_headers = self.filter_headers(r.headers) 100 | 101 | if self.debug: 102 | logger.info(f"Proxy: {request.method} /{path} -> {r.status_code}") 103 | 104 | return Response(content=r.content, status_code=r.status_code, headers=resp_headers) 105 | 106 | async def unified_tts_handler(self, request: Request, tts_request: UnifiedTTSRequest, x_audio_format: str = "wav"): 107 | raise HTTPException(status_code=400, detail=f"This speech service doesn't support the unified interface for now: {self.__class__.__name__}") 108 | 109 | def get_router(self) -> APIRouter: 110 | router = APIRouter() 111 | self.register_endpoint(router) 112 | router.add_api_route( 113 | "/{path:path}", 114 | self.passthrough_handler, 115 
| methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD"], 116 | include_in_schema=False 117 | ) 118 | 119 | return router 120 | 121 | async def shutdown(self): 122 | await self.stream_source.close() 123 | -------------------------------------------------------------------------------- /tests/source/test_voicevox_source.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from speech_gateway.source.voicevox import VoicevoxStreamSource 4 | 5 | VOICEVOX_URL = os.getenv("VOICEVOX_URL") 6 | SPEAKER = "2" 7 | 8 | @pytest.fixture 9 | def source(): 10 | # Create an instance of VoicevoxStreamSource 11 | return VoicevoxStreamSource(base_url=VOICEVOX_URL) 12 | 13 | @pytest.fixture 14 | def audio_query(): 15 | # Provide the audio_query data 16 | return { 17 | "accent_phrases": [ 18 | { 19 | "moras": [ 20 | {"text": "コ", "consonant": "k", "consonant_length": 0, "vowel": "o", "vowel_length": 0, "pitch": 0}, 21 | {"text": "ン", "consonant": None, "consonant_length": None, "vowel": "N", "vowel_length": 0, "pitch": 0}, 22 | {"text": "ニ", "consonant": "n", "consonant_length": 0, "vowel": "i", "vowel_length": 0, "pitch": 0}, 23 | {"text": "チ", "consonant": "ch", "consonant_length": 0, "vowel": "i", "vowel_length": 0, "pitch": 0}, 24 | {"text": "ワ", "consonant": "w", "consonant_length": 0, "vowel": "a", "vowel_length": 0, "pitch": 0}, 25 | {"text": ".", "consonant": None, "consonant_length": None, "vowel": "pau", "vowel_length": 0, "pitch": 0} 26 | ], 27 | "accent": 5, 28 | "pause_mora": None, 29 | "is_interrogative": False 30 | }, 31 | { 32 | "moras": [ 33 | {"text": "コ", "consonant": "k", "consonant_length": 0, "vowel": "o", "vowel_length": 0, "pitch": 0}, 34 | {"text": "レ", "consonant": "r", "consonant_length": 0, "vowel": "e", "vowel_length": 0, "pitch": 0}, 35 | {"text": "ワ", "consonant": "w", "consonant_length": 0, "vowel": "a", "vowel_length": 0, "pitch": 0}, 36 | {"text": "テ", "consonant": "t", "consonant_length": 0, "vowel": "e", "vowel_length": 0, "pitch": 0}, 37 | {"text": "ス", "consonant": "s", "consonant_length": 0, "vowel": "u", "vowel_length": 0, "pitch": 0}, 38 | {"text": "ト", "consonant": "t", "consonant_length": 0, "vowel": "o", "vowel_length": 0, "pitch": 0}, 39 | {"text": "デ", "consonant": "d", "consonant_length": 0, "vowel": "e", "vowel_length": 0, "pitch": 0}, 40 | {"text": "ス", "consonant": "s", "consonant_length": 0, "vowel": "u", "vowel_length": 0, "pitch": 0}, 41 | {"text": ".", "consonant": None, "consonant_length": None, "vowel": "pau", "vowel_length": 0, "pitch": 0} 42 | ], 43 | "accent": 4, 44 | "pause_mora": None, 45 | "is_interrogative": False 46 | } 47 | ], 48 | "speedScale": 1, 49 | "intonationScale": 1, 50 | "tempoDynamicsScale": 1, 51 | "pitchScale": 0, 52 | "volumeScale": 1, 53 | "prePhonemeLength": 0.1, 54 | "postPhonemeLength": 0.1, 55 | "pauseLength": None, 56 | "pauseLengthScale": 1, 57 | "outputSamplingRate": 44100, 58 | "outputStereo": False, 59 | "kana": "こんにちは。これはテストです。" 60 | } 61 | 62 | @pytest.mark.asyncio 63 | async def test_get_cache_key(source, audio_query): 64 | # Test get_cache_key method 65 | cache_key = source.get_cache_key("mp3", SPEAKER, audio_query) 66 | assert cache_key.endswith(".mp3") 67 | assert SPEAKER in cache_key 68 | 69 | cache_key = source.get_cache_key("wav", SPEAKER, audio_query) 70 | assert cache_key.endswith(".wav") 71 | assert SPEAKER in cache_key 72 | 73 | @pytest.mark.asyncio 74 | async def test_parse_text(source, audio_query): 75 | # Test parse_text method 
76 | text = source.parse_text(audio_query) 77 | assert text == "こんにちは。これはテストです。" 78 | 79 | @pytest.mark.asyncio 80 | async def test_make_stream_request(source, audio_query): 81 | # Test make_stream_request method 82 | request = source.make_stream_request(SPEAKER, audio_query) 83 | assert request["method"] == "POST" 84 | assert request["url"] == f"{VOICEVOX_URL}/synthesis" 85 | assert request["params"] == {"speaker": SPEAKER} 86 | assert request["json"] == audio_query 87 | 88 | @pytest.mark.asyncio 89 | async def test_fetch_stream_raw(source, audio_query): 90 | # Test fetch_stream_raw with a real request (ensure server is running locally) 91 | http_request = source.make_stream_request(SPEAKER, audio_query) 92 | 93 | try: 94 | async for chunk in source.fetch_stream_raw(http_request): 95 | assert isinstance(chunk, bytes) 96 | except Exception as e: 97 | pytest.fail(f"fetch_stream_raw failed: {e}") 98 | 99 | @pytest.mark.asyncio 100 | async def test_fetch_stream(source, audio_query): 101 | # Test fetch_stream method with conversion and caching 102 | audio_format = "mp3" 103 | 104 | try: 105 | async for chunk in await source.fetch_stream(audio_format, speaker=SPEAKER, audio_query=audio_query): 106 | assert isinstance(chunk, bytes) 107 | except Exception as e: 108 | pytest.fail(f"fetch_stream failed: {e}") 109 | -------------------------------------------------------------------------------- /speech_gateway/source/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import logging 3 | from time import time 4 | from typing import AsyncIterator, Any, Dict 5 | import httpx 6 | from ..cache import CacheStorage 7 | from ..converter import FormatConverter 8 | from ..performance_recorder import PerformanceRecorder 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class StreamSourceError(Exception): 14 | def __init__(self, message: str): 15 | super().__init__(message) 16 | 17 | 18 | class StreamSource(ABC): 19 | def __init__(self, 20 | *, 21 | base_url: str, 22 | cache_storage: CacheStorage = None, 23 | format_converters: Dict[str, FormatConverter] = None, 24 | max_connections: int = 100, 25 | max_keepalive_connections: int = 20, 26 | timeout: float = 10.0, 27 | performance_recorder: PerformanceRecorder = None, 28 | debug: bool = False 29 | ): 30 | self.base_url = base_url 31 | self.cache_storage = cache_storage 32 | self.format_converters = format_converters 33 | self.http_client = httpx.AsyncClient( 34 | follow_redirects=False, 35 | timeout=httpx.Timeout(timeout), 36 | limits=httpx.Limits( 37 | max_connections=max_connections, 38 | max_keepalive_connections=max_keepalive_connections 39 | ) 40 | ) 41 | self.performance_recorder = performance_recorder 42 | self.debug = debug 43 | 44 | @abstractmethod 45 | def get_cache_key(self, audio_format: str, **kwargs) -> str: 46 | pass 47 | 48 | @abstractmethod 49 | def parse_text(self, **kwargs) -> str: 50 | pass 51 | 52 | def get_converter(self, audio_format: str) -> FormatConverter: 53 | if self.format_converters: 54 | return self.format_converters.get(audio_format) 55 | 56 | @abstractmethod 57 | def make_stream_request(self, **kwargs) -> dict: 58 | pass 59 | 60 | async def fetch_stream_raw(self, http_request: Dict[str, Any]) -> AsyncIterator[bytes]: 61 | try: 62 | async with self.http_client.stream(**http_request) as audio_resp: 63 | if audio_resp.status_code != 200: 64 | resp_body = "" 65 | try: 66 | resp_body = await audio_resp.aread() 67 | except: 68 | pass 69 | 
raise StreamSourceError(f"Stream from voice service failed: {audio_resp.status_code}: {resp_body}") 70 | 71 | async for chunk in audio_resp.aiter_bytes(1024): 72 | yield chunk 73 | 74 | except httpx.RequestError as ex: 75 | raise StreamSourceError(f"HTTP request failed: {ex}") from ex 76 | 77 | async def fetch_stream(self, audio_format: str, **kwargs) -> AsyncIterator[bytes]: 78 | start_time = time() 79 | cache_key = self.get_cache_key(audio_format, **kwargs) 80 | use_cache = self.cache_storage and await self.cache_storage.has_cache(cache_key) 81 | 82 | if use_cache: 83 | if self.debug: 84 | logger.info(f"[cache]: {cache_key}") 85 | # Get cache stream 86 | stream = self.cache_storage.fetch_cache_stream(cache_key) 87 | 88 | else: 89 | # Get stream from TTS service 90 | if self.debug: 91 | logger.info(f"[generate]: {cache_key}") 92 | http_request = self.make_stream_request(**kwargs) 93 | 94 | if self.debug: 95 | logger.info(f"Request to speech service: {http_request}") 96 | 97 | stream = self.fetch_stream_raw(http_request) 98 | 99 | # Convert format 100 | converter = self.get_converter(audio_format) 101 | if converter: 102 | stream = converter.convert(stream) 103 | 104 | # Write cache 105 | if self.cache_storage: 106 | stream = self.cache_storage.write_cache(stream, cache_key) 107 | 108 | # Response time 109 | if self.performance_recorder: 110 | stream = self.record_time( 111 | stream, 112 | cache_key=cache_key, 113 | text=self.parse_text(**kwargs), 114 | audio_format=audio_format, 115 | cached=use_cache, 116 | start_time=start_time 117 | ) 118 | 119 | return stream 120 | 121 | async def record_time( 122 | self, 123 | input_stream: AsyncIterator[bytes], 124 | *, 125 | cache_key: str, 126 | text: str, 127 | audio_format: str, 128 | cached: bool, 129 | start_time: float 130 | ) -> AsyncIterator[bytes]: 131 | async for chunk in input_stream: 132 | yield chunk 133 | 134 | self.performance_recorder.record( 135 | process_id=cache_key, source=self.__class__.__name__, text=text, 136 | audio_format=audio_format, cached=1 if cached else 0, elapsed=time() - start_time 137 | ) 138 | 139 | async def close(self): 140 | await self.http_client.aclose() 141 | -------------------------------------------------------------------------------- /docker/run.py: -------------------------------------------------------------------------------- 1 | from contextlib import asynccontextmanager 2 | import logging 3 | import os 4 | from fastapi import FastAPI 5 | from dotenv import load_dotenv 6 | from speech_gateway.performance_recorder.postgres import PostgreSQLPerformanceRecorder 7 | from speech_gateway.gateway.azure import AzureGateway 8 | from speech_gateway.gateway.openai_speech import OpenAIGateway 9 | from speech_gateway.gateway.voicevox import VoicevoxGateway 10 | from speech_gateway.gateway.sbv2 import StyleBertVits2Gateway 11 | from speech_gateway.gateway.nijivoice_encoded import NijiVoiceEncodedGateway 12 | from speech_gateway.gateway.unified import UnifiedGateway 13 | 14 | # Configure root logger 15 | logger = logging.getLogger("speech_gateway") 16 | logger.setLevel(logging.INFO) 17 | log_format = logging.Formatter("[%(levelname)s] %(asctime)s : %(message)s") 18 | streamHandler = logging.StreamHandler() 19 | streamHandler.setFormatter(log_format) 20 | logger.addHandler(streamHandler) 21 | 22 | load_dotenv() 23 | DEBUG = os.getenv("DEBUG", "false").lower() in ("true", "1", "yes") 24 | 25 | # Azure 26 | AZURE_ENABLED = os.getenv("AZURE_ENABLED", "false").lower() in ("true", "1", "yes") 27 | AZURE_API_KEY = 
os.getenv("AZURE_API_KEY") 28 | AZURE_REGION = os.getenv("AZURE_REGION") 29 | AZURE_LANGUAGES = os.getenv("AZURE_LANGUAGES") 30 | # OpenAI 31 | OPENAI_ENABLED = os.getenv("OPENAI_ENABLED", "false").lower() in ("true", "1", "yes") 32 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 33 | OPENAI_LANGUAGES = os.getenv("OPENAI_LANGUAGES") 34 | # VOICEVOX 35 | VOICEVOX_ENABLED = os.getenv("VOICEVOX_ENABLED", "false").lower() in ("true", "1", "yes") 36 | VOICEVOX_URL = os.getenv("VOICEVOX_URL") 37 | VOICEVOX_LANGUAGES = os.getenv("VOICEVOX_LANGUAGES") 38 | # Style-Bert-VITS2 39 | SBV2_ENABLED = os.getenv("SBV2_ENABLED", "false").lower() in ("true", "1", "yes") 40 | SBV2_URL = os.getenv("SBV2_URL") 41 | SBV2_LANGUAGES = os.getenv("SBV2_LANGUAGES") 42 | # NIJIVOICE 43 | NIJIVOICE_ENABLED = os.getenv("NIJIVOICE_ENABLED", "false").lower() in ("true", "1", "yes") 44 | NIJIVOICE_API_KEY = os.getenv("NIJIVOICE_API_KEY") 45 | NIJIVOICE_LANGUAGES = os.getenv("NIJIVOICE_LANGUAGES") 46 | # Database 47 | DB_PORT = os.getenv("PORT_DB") 48 | DB_USER = os.getenv("SPGW_DB_USER") 49 | DB_PASSWORD = os.getenv("SPGW_DB_PASSWORD") 50 | 51 | # Performance recorder 52 | performance_recorder = PostgreSQLPerformanceRecorder(host="spgw-db", port=DB_PORT, user=DB_USER, password=DB_PASSWORD) 53 | 54 | # On app down 55 | @asynccontextmanager 56 | async def lifespan(app: FastAPI): 57 | yield 58 | # Shutdown enabled gateways 59 | if AZURE_ENABLED and 'azure_gateway' in globals(): 60 | await azure_gateway.shutdown() 61 | if OPENAI_ENABLED and 'openai_gateway' in globals(): 62 | await openai_gateway.shutdown() 63 | if VOICEVOX_ENABLED and 'voicevox_gateway' in globals(): 64 | await voicevox_gateway.shutdown() 65 | if SBV2_ENABLED and 'sbv2_gateway' in globals(): 66 | await sbv2_gateway.shutdown() 67 | if NIJIVOICE_ENABLED and 'nijivoice_gateway' in globals(): 68 | await nijivoice_gateway.shutdown() 69 | 70 | # Create API app 71 | app = FastAPI(lifespan=lifespan) 72 | 73 | # Unified gateway 74 | unified_gateway = UnifiedGateway(debug=True) 75 | app.include_router(unified_gateway.get_router()) 76 | 77 | # Create service gateways 78 | if AZURE_ENABLED: 79 | azure_gateway = AzureGateway(api_key=AZURE_API_KEY, cache_dir="cache/azure", performance_recorder=performance_recorder, region=AZURE_REGION, debug=DEBUG) 80 | unified_gateway.add_gateway( 81 | service_name="azure", 82 | gateway=azure_gateway, 83 | languages=AZURE_LANGUAGES.split(",") if AZURE_LANGUAGES else None, 84 | ) 85 | app.include_router(azure_gateway.get_router(), prefix="/azure") 86 | logger.info("[Gateway] Azure on /azure") 87 | 88 | if OPENAI_ENABLED: 89 | openai_gateway = OpenAIGateway(api_key=OPENAI_API_KEY, cache_dir="cache/openai", performance_recorder=performance_recorder, debug=DEBUG) 90 | unified_gateway.add_gateway( 91 | service_name="openai", 92 | gateway=openai_gateway, 93 | languages=OPENAI_LANGUAGES.split(",") if OPENAI_LANGUAGES else None, 94 | ) 95 | app.include_router(openai_gateway.get_router(), prefix="/openai") 96 | logger.info(f"[Gateway] OpenAI on /openai") 97 | 98 | if VOICEVOX_ENABLED: 99 | voicevox_gateway = VoicevoxGateway(base_url=VOICEVOX_URL, cache_dir="cache/voicevox", performance_recorder=performance_recorder, debug=DEBUG) 100 | unified_gateway.add_gateway( 101 | service_name="voicevox", 102 | gateway=voicevox_gateway, 103 | languages=VOICEVOX_LANGUAGES.split(",") if VOICEVOX_LANGUAGES else None, 104 | ) 105 | app.include_router(voicevox_gateway.get_router(), prefix="/voicevox") 106 | logger.info(f"[Gateway] VOICEVOX on /voicevox") 107 | 108 
| if SBV2_ENABLED: 109 | sbv2_gateway = StyleBertVits2Gateway(base_url=SBV2_URL, cache_dir="cache/sbv2", performance_recorder=performance_recorder, debug=DEBUG) 110 | unified_gateway.add_gateway( 111 | service_name="sbv2", 112 | gateway=sbv2_gateway, 113 | languages=SBV2_LANGUAGES.split(",") if SBV2_LANGUAGES else None, 114 | ) 115 | app.include_router(sbv2_gateway.get_router(), prefix="/sbv2") 116 | logger.info(f"[Gateway] Style-Bert-VITS2 on /sbv2") 117 | 118 | if NIJIVOICE_ENABLED: 119 | nijivoice_gateway = NijiVoiceEncodedGateway(api_key=NIJIVOICE_API_KEY, cache_dir="cache/nijivoice", performance_recorder=performance_recorder, debug=DEBUG) 120 | unified_gateway.add_gateway( 121 | service_name="nijivoice", 122 | gateway=nijivoice_gateway, 123 | languages=NIJIVOICE_LANGUAGES.split(",") if NIJIVOICE_LANGUAGES else None, 124 | ) 125 | app.include_router(nijivoice_gateway.get_router(), prefix="/nijivoice") 126 | logger.info(f"[Gateway] Nijivoice on /nijivoice") 127 | -------------------------------------------------------------------------------- /tests/gateway/test_openai_speech.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import httpx 4 | 5 | SPEAKER = "alloy" 6 | 7 | 8 | @pytest.mark.asyncio 9 | async def test_openai_speech(random_text, mp3_checker, audio_transcriber): 10 | resp = httpx.post( 11 | "http://127.0.0.1:8000/openai/audio/speech", 12 | json={ 13 | "model": "tts-1", 14 | "voice": "alloy", 15 | "input": random_text, 16 | "speed": 1.0, 17 | } 18 | ) 19 | audio_data = resp.content 20 | assert mp3_checker(audio_data) 21 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 22 | 23 | 24 | @pytest.mark.asyncio 25 | async def test_openai_speech_wav(random_text, wave_checker, audio_transcriber): 26 | resp = httpx.post( 27 | "http://127.0.0.1:8000/openai/audio/speech", 28 | json={ 29 | "model": "tts-1", 30 | "voice": "alloy", 31 | "input": random_text, 32 | "speed": 1.0, 33 | "response_format": "wav" 34 | } 35 | ) 36 | audio_data = resp.content 37 | assert wave_checker(audio_data) 38 | assert "音声合成" in audio_transcriber(audio_data, "wav") 39 | 40 | 41 | @pytest.mark.asyncio 42 | async def test_openai_speech_mp3(random_text, mp3_checker, audio_transcriber): 43 | resp = httpx.post( 44 | "http://127.0.0.1:8000/openai/audio/speech", 45 | json={ 46 | "model": "tts-1", 47 | "voice": "alloy", 48 | "input": random_text, 49 | "speed": 1.0, 50 | "response_format": "mp3" 51 | } 52 | ) 53 | audio_data = resp.content 54 | assert mp3_checker(audio_data) 55 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 56 | 57 | 58 | @pytest.mark.asyncio 59 | async def test_openai_speech_wav_mp3(random_text, mp3_checker, audio_transcriber): 60 | resp = httpx.post( 61 | "http://127.0.0.1:8000/openai/audio/speech", 62 | json={ 63 | "model": "tts-1", 64 | "voice": "alloy", 65 | "input": random_text, 66 | "speed": 1.0, 67 | "response_format": "wav" # <- wav 68 | }, 69 | params={ 70 | "x_audio_format": "mp3" # <- mp3 71 | } 72 | ) 73 | audio_data = resp.content 74 | assert mp3_checker(audio_data) 75 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 76 | 77 | 78 | @pytest.mark.asyncio 79 | async def test_openai_speech_mp3_wav(random_text, wave_checker, audio_transcriber): 80 | resp = httpx.post( 81 | "http://127.0.0.1:8000/openai/audio/speech", 82 | json={ 83 | "model": "tts-1", 84 | "voice": "alloy", 85 | "input": random_text, 86 | "speed": 1.0, 87 | "response_format": "mp3" # <- mp3 88 | }, 89 | params={ 90 | "x_audio_format": 
"wav" # <- wav 91 | } 92 | ) 93 | audio_data = resp.content 94 | assert wave_checker(audio_data) 95 | assert "音声合成" in audio_transcriber(audio_data, "wav") 96 | 97 | 98 | @pytest.mark.asyncio 99 | async def test_openai_speech_x_wav(random_text, wave_checker, audio_transcriber): 100 | resp = httpx.post( 101 | "http://127.0.0.1:8000/openai/audio/speech", 102 | json={ 103 | "model": "tts-1", 104 | "voice": "alloy", 105 | "input": random_text, 106 | "speed": 1.0, 107 | }, 108 | params={ 109 | "x_audio_format": "wav" # <- wav 110 | } 111 | ) 112 | audio_data = resp.content 113 | assert wave_checker(audio_data) 114 | assert "音声合成" in audio_transcriber(audio_data, "wav") 115 | 116 | 117 | @pytest.mark.asyncio 118 | async def test_openai_speech_x_mp3(random_text, mp3_checker, audio_transcriber): 119 | resp = httpx.post( 120 | "http://127.0.0.1:8000/openai/audio/speech", 121 | json={ 122 | "model": "tts-1", 123 | "voice": "alloy", 124 | "input": random_text, 125 | "speed": 1.0, 126 | }, 127 | params={ 128 | "x_audio_format": "mp3" # <- mp3 129 | } 130 | ) 131 | audio_data = resp.content 132 | assert mp3_checker(audio_data) 133 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 134 | 135 | 136 | @pytest.mark.asyncio 137 | async def test_openai_speech_unified(random_text, wave_checker, audio_transcriber): 138 | req = { 139 | "text": random_text, 140 | "speaker": SPEAKER, 141 | "service_name": "openai" 142 | } 143 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req) 144 | audio_data = resp.content 145 | assert wave_checker(audio_data) 146 | assert "音声合成" in audio_transcriber(audio_data, "wav") 147 | 148 | 149 | @pytest.mark.asyncio 150 | async def test_openai_speech_unified_wav(random_text, wave_checker, audio_transcriber): 151 | req = { 152 | "text": random_text, 153 | "speaker": SPEAKER, 154 | "service_name": "openai" 155 | } 156 | query_params = { 157 | "x_audio_format": "wav" 158 | } 159 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 160 | audio_data = resp.content 161 | assert wave_checker(audio_data) 162 | assert "音声合成" in audio_transcriber(audio_data, "wav") 163 | 164 | 165 | @pytest.mark.asyncio 166 | async def test_openai_speech_unified_mp3(random_text, mp3_checker, audio_transcriber): 167 | req = { 168 | "text": random_text, 169 | "speaker": SPEAKER, 170 | "service_name": "openai" 171 | } 172 | query_params = { 173 | "x_audio_format": "mp3" 174 | } 175 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 176 | audio_data = resp.content 177 | assert mp3_checker(audio_data) 178 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 179 | -------------------------------------------------------------------------------- /tests/gateway/test_azure_openai_speech.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import httpx 4 | 5 | SPEAKER = "alloy" 6 | 7 | 8 | @pytest.mark.asyncio 9 | async def test_openai_speech(random_text, mp3_checker, audio_transcriber): 10 | resp = httpx.post( 11 | "http://127.0.0.1:8000/azure_openai/audio/speech", 12 | json={ 13 | "model": "gpt-4o-mini-tts", 14 | "voice": "alloy", 15 | "input": random_text, 16 | "speed": 1.0, 17 | } 18 | ) 19 | audio_data = resp.content 20 | assert mp3_checker(audio_data) 21 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 22 | 23 | 24 | @pytest.mark.asyncio 25 | async def test_openai_speech_wav(random_text, wave_checker, audio_transcriber): 26 | resp = httpx.post( 27 | 
"http://127.0.0.1:8000/azure_openai/audio/speech", 28 | json={ 29 | "model": "gpt-4o-mini-tts", 30 | "voice": "alloy", 31 | "input": random_text, 32 | "speed": 1.0, 33 | "response_format": "wav" 34 | } 35 | ) 36 | audio_data = resp.content 37 | assert wave_checker(audio_data) 38 | assert "音声合成" in audio_transcriber(audio_data, "wav") 39 | 40 | 41 | @pytest.mark.asyncio 42 | async def test_openai_speech_mp3(random_text, mp3_checker, audio_transcriber): 43 | resp = httpx.post( 44 | "http://127.0.0.1:8000/azure_openai/audio/speech", 45 | json={ 46 | "model": "gpt-4o-mini-tts", 47 | "voice": "alloy", 48 | "input": random_text, 49 | "speed": 1.0, 50 | "response_format": "mp3" 51 | } 52 | ) 53 | audio_data = resp.content 54 | assert mp3_checker(audio_data) 55 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 56 | 57 | 58 | @pytest.mark.asyncio 59 | async def test_openai_speech_wav_mp3(random_text, mp3_checker, audio_transcriber): 60 | resp = httpx.post( 61 | "http://127.0.0.1:8000/azure_openai/audio/speech", 62 | json={ 63 | "model": "gpt-4o-mini-tts", 64 | "voice": "alloy", 65 | "input": random_text, 66 | "speed": 1.0, 67 | "response_format": "wav" # <- wav 68 | }, 69 | params={ 70 | "x_audio_format": "mp3" # <- mp3 71 | } 72 | ) 73 | audio_data = resp.content 74 | assert mp3_checker(audio_data) 75 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 76 | 77 | 78 | @pytest.mark.asyncio 79 | async def test_openai_speech_mp3_wav(random_text, wave_checker, audio_transcriber): 80 | resp = httpx.post( 81 | "http://127.0.0.1:8000/azure_openai/audio/speech", 82 | json={ 83 | "model": "gpt-4o-mini-tts", 84 | "voice": "alloy", 85 | "input": random_text, 86 | "speed": 1.0, 87 | "response_format": "mp3" # <- mp3 88 | }, 89 | params={ 90 | "x_audio_format": "wav" # <- wav 91 | } 92 | ) 93 | audio_data = resp.content 94 | assert wave_checker(audio_data) 95 | assert "音声合成" in audio_transcriber(audio_data, "wav") 96 | 97 | 98 | @pytest.mark.asyncio 99 | async def test_openai_speech_x_wav(random_text, wave_checker, audio_transcriber): 100 | resp = httpx.post( 101 | "http://127.0.0.1:8000/azure_openai/audio/speech", 102 | json={ 103 | "model": "gpt-4o-mini-tts", 104 | "voice": "alloy", 105 | "input": random_text, 106 | "speed": 1.0, 107 | }, 108 | params={ 109 | "x_audio_format": "wav" # <- wav 110 | } 111 | ) 112 | audio_data = resp.content 113 | assert wave_checker(audio_data) 114 | assert "音声合成" in audio_transcriber(audio_data, "wav") 115 | 116 | 117 | @pytest.mark.asyncio 118 | async def test_openai_speech_x_mp3(random_text, mp3_checker, audio_transcriber): 119 | resp = httpx.post( 120 | "http://127.0.0.1:8000/azure_openai/audio/speech", 121 | json={ 122 | "model": "gpt-4o-mini-tts", 123 | "voice": "alloy", 124 | "input": random_text, 125 | "speed": 1.0, 126 | }, 127 | params={ 128 | "x_audio_format": "mp3" # <- mp3 129 | } 130 | ) 131 | audio_data = resp.content 132 | assert mp3_checker(audio_data) 133 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 134 | 135 | 136 | @pytest.mark.asyncio 137 | async def test_openai_speech_unified(random_text, wave_checker, audio_transcriber): 138 | req = { 139 | "text": random_text, 140 | "speaker": SPEAKER, 141 | "service_name": "azure_openai" 142 | } 143 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req) 144 | audio_data = resp.content 145 | assert wave_checker(audio_data) 146 | assert "音声合成" in audio_transcriber(audio_data, "wav") 147 | 148 | 149 | @pytest.mark.asyncio 150 | async def test_openai_speech_unified_wav(random_text, wave_checker, 
audio_transcriber): 151 | req = { 152 | "text": random_text, 153 | "speaker": SPEAKER, 154 | "service_name": "azure_openai" 155 | } 156 | query_params = { 157 | "x_audio_format": "wav" 158 | } 159 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 160 | audio_data = resp.content 161 | assert wave_checker(audio_data) 162 | assert "音声合成" in audio_transcriber(audio_data, "wav") 163 | 164 | 165 | @pytest.mark.asyncio 166 | async def test_openai_speech_unified_mp3(random_text, mp3_checker, audio_transcriber): 167 | req = { 168 | "text": random_text, 169 | "speaker": SPEAKER, 170 | "service_name": "azure_openai" 171 | } 172 | query_params = { 173 | "x_audio_format": "mp3" 174 | } 175 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 176 | audio_data = resp.content 177 | assert mp3_checker(audio_data) 178 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 179 | -------------------------------------------------------------------------------- /tests/source/test_openai_speech_source.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from speech_gateway.source.openai_speech import OpenAIStreamSource 4 | 5 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 6 | AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY") 7 | AZURE_OPENAI_BASE_URL = os.getenv("AZURE_OPENAI_BASE_URL") 8 | 9 | @pytest.fixture 10 | def source(): 11 | # Create an instance of OpenAIStreamSource 12 | return OpenAIStreamSource(api_key=OPENAI_API_KEY) 13 | 14 | @pytest.mark.asyncio 15 | async def test_get_cache_key(source): 16 | # Test get_cache_key method 17 | request_json = { 18 | "model": "tts-1", 19 | "voice": "alloy", 20 | "input": "こんにちは。これはテストです。", 21 | "speed": 1.0, 22 | "response_format": "wav" 23 | } 24 | cache_key = source.get_cache_key("mp3", request_json) 25 | assert cache_key.endswith(".mp3") 26 | 27 | cache_key = source.get_cache_key("wav", request_json) 28 | assert cache_key.endswith(".wav") 29 | 30 | @pytest.mark.asyncio 31 | async def test_parse_text(source): 32 | # Test parse_text method 33 | request_json = { 34 | "model": "tts-1", 35 | "voice": "alloy", 36 | "input": "こんにちは。これはテストです。", 37 | "speed": 1.0, 38 | "response_format": "wav" 39 | } 40 | text = source.parse_text(request_json) 41 | assert text == "こんにちは。これはテストです。" 42 | 43 | @pytest.mark.asyncio 44 | async def test_make_stream_request(source): 45 | # Test make_stream_request method 46 | request_json = { 47 | "model": "tts-1", 48 | "voice": "alloy", 49 | "input": "こんにちは。これはテストです。", 50 | "speed": 1.0, 51 | "response_format": "wav" 52 | } 53 | request = source.make_stream_request(request_json) 54 | assert request["method"] == "POST" 55 | assert request["url"] == "https://api.openai.com/v1/audio/speech" 56 | assert request["json"] == request_json 57 | 58 | @pytest.mark.asyncio 59 | async def test_fetch_stream_raw(source): 60 | # Test fetch_stream_raw with a real request (ensure server is running locally) 61 | request_json = { 62 | "model": "tts-1", 63 | "voice": "alloy", 64 | "input": "こんにちは。これはテストです。", 65 | "speed": 1.0, 66 | "response_format": "wav" 67 | } 68 | http_request = source.make_stream_request(request_json) 69 | 70 | try: 71 | async for chunk in source.fetch_stream_raw(http_request): 72 | assert isinstance(chunk, bytes) 73 | except Exception as e: 74 | pytest.fail(f"fetch_stream_raw failed: {e}") 75 | 76 | @pytest.mark.asyncio 77 | async def test_fetch_stream(source): 78 | # Test fetch_stream method with conversion and
caching 79 | request_json = { 80 | "model": "tts-1", 81 | "voice": "alloy", 82 | "input": "こんにちは。これはテストです。", 83 | "speed": 1.0, 84 | "response_format": "wav" 85 | } 86 | 87 | audio_format = "wav" 88 | 89 | try: 90 | async for chunk in await source.fetch_stream(audio_format, request_json=request_json): 91 | assert isinstance(chunk, bytes) 92 | except Exception as e: 93 | pytest.fail(f"fetch_stream failed: {e}") 94 | 95 | @pytest.mark.asyncio 96 | async def test_fetch_stream_raw(source): 97 | # Test fetch_stream_raw with a real request (ensure server is running locally) 98 | request_json = { 99 | "model": "tts-1", 100 | "voice": "alloy", 101 | "input": "こんにちは。これはテストです。", 102 | "speed": 1.0, 103 | "response_format": "wav" 104 | } 105 | http_request = source.make_stream_request(request_json) 106 | 107 | try: 108 | async for chunk in source.fetch_stream_raw(http_request): 109 | assert isinstance(chunk, bytes) 110 | except Exception as e: 111 | pytest.fail(f"fetch_stream_raw failed: {e}") 112 | 113 | @pytest.mark.asyncio 114 | async def test_fetch_stream(source): 115 | # Test fetch_stream method with conversion and caching 116 | request_json = { 117 | "model": "tts-1", 118 | "voice": "alloy", 119 | "input": "こんにちは。これはテストです。", 120 | "speed": 1.0, 121 | "response_format": "wav" 122 | } 123 | 124 | audio_format = "wav" 125 | 126 | try: 127 | async for chunk in await source.fetch_stream(audio_format, request_json=request_json): 128 | assert isinstance(chunk, bytes) 129 | except Exception as e: 130 | pytest.fail(f"fetch_stream failed: {e}") 131 | 132 | 133 | @pytest.mark.asyncio 134 | async def test_fetch_stream_raw_azure(source): 135 | # Use Azure OpenAI API 136 | source.api_key = AZURE_OPENAI_API_KEY 137 | source.base_url = AZURE_OPENAI_BASE_URL 138 | 139 | # Test fetch_stream_raw with a real request (ensure server is running locally) 140 | request_json = { 141 | "model": "gpt-4o-mini-tts", 142 | "voice": "alloy", 143 | "input": "こんにちは。これはテストです。", 144 | "speed": 1.0, 145 | "response_format": "wav" 146 | } 147 | http_request = source.make_stream_request(request_json) 148 | 149 | try: 150 | async for chunk in source.fetch_stream_raw(http_request): 151 | assert isinstance(chunk, bytes) 152 | except Exception as e: 153 | pytest.fail(f"fetch_stream_raw_azure failed: {e}") 154 | 155 | @pytest.mark.asyncio 156 | async def test_fetch_stream_azure(source): 157 | # Use Azure OpenAI API 158 | source.api_key = AZURE_OPENAI_API_KEY 159 | source.base_url = AZURE_OPENAI_BASE_URL 160 | 161 | # Test fetch_stream method with conversion and caching 162 | request_json = { 163 | "model": "gpt-4o-mini-tts", 164 | "voice": "alloy", 165 | "input": "こんにちは。これはテストです。", 166 | "speed": 1.0, 167 | "response_format": "wav" 168 | } 169 | 170 | audio_format = "wav" 171 | 172 | try: 173 | async for chunk in await source.fetch_stream(audio_format, request_json=request_json): 174 | assert isinstance(chunk, bytes) 175 | except Exception as e: 176 | pytest.fail(f"fetch_stream_azure failed: {e}") 177 | -------------------------------------------------------------------------------- /tests/gateway/test_nijivoice.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import httpx 4 | 5 | VOICE_ACTOR_ID = "dba2fa0e-f750-43ad-b9f6-d5aeaea7dc16" 6 | 7 | 8 | @pytest.mark.asyncio 9 | async def test_nijivoice(random_text, mp3_checker, audio_transcriber): 10 | resp_json = httpx.post( 11 | 
f"http://127.0.0.1:8000/nijivoice/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice", 12 | json={ 13 | "script": random_text, 14 | "speed": "1.0" 15 | } 16 | ).json() 17 | 18 | resp = httpx.get(resp_json["generatedVoice"]["audioFileUrl"]) 19 | audio_data = resp.content 20 | assert mp3_checker(audio_data) 21 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 22 | 23 | 24 | @pytest.mark.asyncio 25 | async def test_nijivoice_wav(random_text, wave_checker, audio_transcriber): 26 | resp_json = httpx.post( 27 | f"http://127.0.0.1:8000/nijivoice/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice", 28 | json={ 29 | "script": random_text, 30 | "speed": "1.0", 31 | "format": "wav" 32 | } 33 | ).json() 34 | 35 | resp = httpx.get(resp_json["generatedVoice"]["audioFileUrl"]) 36 | audio_data = resp.content 37 | assert wave_checker(audio_data) 38 | assert "音声合成" in audio_transcriber(audio_data, "wav") 39 | 40 | 41 | @pytest.mark.asyncio 42 | async def test_nijivoice_mp3(random_text, mp3_checker, audio_transcriber): 43 | resp_json = httpx.post( 44 | f"http://127.0.0.1:8000/nijivoice/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice", 45 | json={ 46 | "script": random_text, 47 | "speed": "1.0", 48 | "format": "mp3" 49 | } 50 | ).json() 51 | 52 | resp = httpx.get(resp_json["generatedVoice"]["audioFileUrl"]) 53 | audio_data = resp.content 54 | assert mp3_checker(audio_data) 55 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 56 | 57 | 58 | @pytest.mark.asyncio 59 | async def test_nijivoice_wav_mp3(random_text, mp3_checker, audio_transcriber): 60 | resp_json = httpx.post( 61 | f"http://127.0.0.1:8000/nijivoice/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice", 62 | json={ 63 | "script": random_text, 64 | "speed": "1.0", 65 | "format": "wav" # <- wav 66 | }, 67 | params={ 68 | "x_audio_format": "mp3" # <- mp3 69 | } 70 | ).json() 71 | 72 | resp = httpx.get(resp_json["generatedVoice"]["audioFileUrl"]) 73 | audio_data = resp.content 74 | assert mp3_checker(audio_data) 75 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 76 | 77 | 78 | @pytest.mark.asyncio 79 | async def test_nijivoice_mp3_wav(random_text, wave_checker, audio_transcriber): 80 | resp_json = httpx.post( 81 | f"http://127.0.0.1:8000/nijivoice/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice", 82 | json={ 83 | "script": random_text, 84 | "speed": "1.0", 85 | "format": "mp3" # <- mp3 86 | }, 87 | params = { 88 | "x_audio_format": "wav" # <- wav 89 | } 90 | ).json() 91 | 92 | resp = httpx.get(resp_json["generatedVoice"]["audioFileUrl"]) 93 | audio_data = resp.content 94 | assert wave_checker(audio_data) 95 | assert "音声合成" in audio_transcriber(audio_data, "wav") 96 | 97 | 98 | @pytest.mark.asyncio 99 | async def test_nijivoice_x_wav(random_text, wave_checker, audio_transcriber): 100 | resp_json = httpx.post( 101 | f"http://127.0.0.1:8000/nijivoice/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice", 102 | json={ 103 | "script": random_text, 104 | "speed": "1.0" 105 | }, 106 | params = { 107 | "x_audio_format": "wav" # <- wav 108 | } 109 | ).json() 110 | 111 | resp = httpx.get(resp_json["generatedVoice"]["audioFileUrl"]) 112 | audio_data = resp.content 113 | assert wave_checker(audio_data) 114 | assert "音声合成" in audio_transcriber(audio_data, "wav") 115 | 116 | 117 | @pytest.mark.asyncio 118 | async def test_nijivoice_x_mp3(random_text, mp3_checker, audio_transcriber): 119 | resp_json = httpx.post( 120 | 
f"http://127.0.0.1:8000/nijivoice/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-voice", 121 | json={ 122 | "script": random_text, 123 | "speed": "1.0" 124 | }, 125 | params = { 126 | "x_audio_format": "mp3" # <- mp3 127 | } 128 | ).json() 129 | 130 | resp = httpx.get(resp_json["generatedVoice"]["audioFileUrl"]) 131 | audio_data = resp.content 132 | assert mp3_checker(audio_data) 133 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 134 | 135 | 136 | @pytest.mark.asyncio 137 | async def test_nijivoice_unified(random_text, wave_checker, audio_transcriber): 138 | req = { 139 | "text": random_text, 140 | "speaker": VOICE_ACTOR_ID, 141 | "service_name": "nijivoice" 142 | } 143 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req) 144 | audio_data = resp.content 145 | assert wave_checker(audio_data) 146 | assert "音声合成" in audio_transcriber(audio_data, "wav") 147 | 148 | 149 | @pytest.mark.asyncio 150 | async def test_nijivoice_unified_wav(random_text, wave_checker, audio_transcriber): 151 | req = { 152 | "text": random_text, 153 | "speaker": VOICE_ACTOR_ID, 154 | "service_name": "nijivoice" 155 | } 156 | query_params = { 157 | "x_audio_format": "wav" 158 | } 159 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 160 | audio_data = resp.content 161 | assert wave_checker(audio_data) 162 | assert "音声合成" in audio_transcriber(audio_data, "wav") 163 | 164 | 165 | @pytest.mark.asyncio 166 | async def test_nijivoice_unified_mp3(random_text, mp3_checker, audio_transcriber): 167 | req = { 168 | "text": random_text, 169 | "speaker": VOICE_ACTOR_ID, 170 | "service_name": "nijivoice" 171 | } 172 | query_params = { 173 | "x_audio_format": "mp3" 174 | } 175 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 176 | audio_data = resp.content 177 | assert mp3_checker(audio_data) 178 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 179 | -------------------------------------------------------------------------------- /tests/gateway/test_nijivoice_encoded.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import pytest 3 | import httpx 4 | 5 | VOICE_ACTOR_ID = "dba2fa0e-f750-43ad-b9f6-d5aeaea7dc16" 6 | 7 | 8 | @pytest.mark.asyncio 9 | async def test_nijivoice(random_text, mp3_checker, audio_transcriber): 10 | resp_json = httpx.post( 11 | f"http://127.0.0.1:8000/nijivoice_encoded/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-encoded-voice", 12 | json={ 13 | "script": random_text, 14 | "speed": "1.0" 15 | } 16 | ).json() 17 | base64_audio = resp_json["generatedVoice"]["base64Audio"] 18 | audio_data = base64.b64decode(base64_audio) 19 | 20 | assert mp3_checker(audio_data) 21 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 22 | 23 | 24 | @pytest.mark.asyncio 25 | async def test_nijivoice_wav(random_text, wave_checker, audio_transcriber): 26 | resp_json = httpx.post( 27 | f"http://127.0.0.1:8000/nijivoice_encoded/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-encoded-voice", 28 | json={ 29 | "script": random_text, 30 | "speed": "1.0", 31 | "format": "wav" 32 | } 33 | ).json() 34 | base64_audio = resp_json["generatedVoice"]["base64Audio"] 35 | audio_data = base64.b64decode(base64_audio) 36 | 37 | assert wave_checker(audio_data) 38 | assert "音声合成" in audio_transcriber(audio_data, "wav") 39 | 40 | 41 | @pytest.mark.asyncio 42 | async def test_nijivoice_mp3(random_text, mp3_checker, audio_transcriber): 43 | resp_json = httpx.post( 44 | 
f"http://127.0.0.1:8000/nijivoice_encoded/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-encoded-voice", 45 | json={ 46 | "script": random_text, 47 | "speed": "1.0", 48 | "format": "mp3" 49 | } 50 | ).json() 51 | base64_audio = resp_json["generatedVoice"]["base64Audio"] 52 | audio_data = base64.b64decode(base64_audio) 53 | 54 | assert mp3_checker(audio_data) 55 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 56 | 57 | 58 | @pytest.mark.asyncio 59 | async def test_nijivoice_wav_mp3(random_text, mp3_checker, audio_transcriber): 60 | resp_json = httpx.post( 61 | f"http://127.0.0.1:8000/nijivoice_encoded/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-encoded-voice", 62 | json={ 63 | "script": random_text, 64 | "speed": "1.0", 65 | "format": "wav" # <- wav 66 | }, 67 | params={ 68 | "x_audio_format": "mp3" # <- mp3 69 | } 70 | ).json() 71 | base64_audio = resp_json["generatedVoice"]["base64Audio"] 72 | audio_data = base64.b64decode(base64_audio) 73 | 74 | assert mp3_checker(audio_data) 75 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 76 | 77 | 78 | @pytest.mark.asyncio 79 | async def test_nijivoice_mp3_wav(random_text, wave_checker, audio_transcriber): 80 | resp_json = httpx.post( 81 | f"http://127.0.0.1:8000/nijivoice_encoded/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-encoded-voice", 82 | json={ 83 | "script": random_text, 84 | "speed": "1.0", 85 | "format": "mp3" # <- mp3 86 | }, 87 | params = { 88 | "x_audio_format": "wav" # <- wav 89 | } 90 | ).json() 91 | base64_audio = resp_json["generatedVoice"]["base64Audio"] 92 | audio_data = base64.b64decode(base64_audio) 93 | 94 | assert wave_checker(audio_data) 95 | assert "音声合成" in audio_transcriber(audio_data, "wav") 96 | 97 | 98 | @pytest.mark.asyncio 99 | async def test_nijivoice_x_wav(random_text, wave_checker, audio_transcriber): 100 | resp_json = httpx.post( 101 | f"http://127.0.0.1:8000/nijivoice_encoded/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-encoded-voice", 102 | json={ 103 | "script": random_text, 104 | "speed": "1.0" 105 | }, 106 | params = { 107 | "x_audio_format": "wav" # <- wav 108 | } 109 | ).json() 110 | base64_audio = resp_json["generatedVoice"]["base64Audio"] 111 | audio_data = base64.b64decode(base64_audio) 112 | 113 | assert wave_checker(audio_data) 114 | assert "音声合成" in audio_transcriber(audio_data, "wav") 115 | 116 | 117 | @pytest.mark.asyncio 118 | async def test_nijivoice_x_mp3(random_text, mp3_checker, audio_transcriber): 119 | resp_json = httpx.post( 120 | f"http://127.0.0.1:8000/nijivoice_encoded/api/platform/v1/voice-actors/{VOICE_ACTOR_ID}/generate-encoded-voice", 121 | json={ 122 | "script": random_text, 123 | "speed": "1.0" 124 | }, 125 | params = { 126 | "x_audio_format": "mp3" # <- mp3 127 | } 128 | ).json() 129 | base64_audio = resp_json["generatedVoice"]["base64Audio"] 130 | audio_data = base64.b64decode(base64_audio) 131 | 132 | assert mp3_checker(audio_data) 133 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 134 | 135 | 136 | @pytest.mark.asyncio 137 | async def test_nijivoice_unified(random_text, wave_checker, audio_transcriber): 138 | req = { 139 | "text": random_text, 140 | "speaker": VOICE_ACTOR_ID, 141 | "service_name": "nijivoice" 142 | } 143 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req) 144 | audio_data = resp.content 145 | assert wave_checker(audio_data) 146 | assert "音声合成" in audio_transcriber(audio_data, "wav") 147 | 148 | 149 | @pytest.mark.asyncio 150 | async def test_nijivoice_unified_wav(random_text, wave_checker, 
audio_transcriber): 151 | req = { 152 | "text": random_text, 153 | "speaker": VOICE_ACTOR_ID, 154 | "service_name": "nijivoice" 155 | } 156 | query_params = { 157 | "x_audio_format": "wav" 158 | } 159 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 160 | audio_data = resp.content 161 | assert wave_checker(audio_data) 162 | assert "音声合成" in audio_transcriber(audio_data, "wav") 163 | 164 | 165 | @pytest.mark.asyncio 166 | async def test_nijivoice_unified_mp3(random_text, mp3_checker, audio_transcriber): 167 | req = { 168 | "text": random_text, 169 | "speaker": VOICE_ACTOR_ID, 170 | "service_name": "nijivoice" 171 | } 172 | query_params = { 173 | "x_audio_format": "mp3" 174 | } 175 | resp = httpx.post("http://127.0.0.1:8000/tts", params=query_params, json=req) 176 | audio_data = resp.content 177 | assert mp3_checker(audio_data) 178 | assert "音声合成" in audio_transcriber(audio_data, "mp3") 179 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SpeechGateway 2 | 3 | A reverse proxy server that enhances speech synthesis with essential, extensible features. 🦉💬 4 | 5 | 6 | ## 💎 Features 7 | 8 | - 🥰 **Supports Popular Speech Services**: Works seamlessly with AivisSpeech, VOICEVOX, Style-Bert-VITS2, NijiVoice, OpenAI, and Azure — and lets you integrate additional services to suit your needs. 9 | - 🗂️ **Caching**: Boost response speed and save API calls with built-in audio caching. 10 | - 🔄 **Format Conversion**: Effortlessly convert WAV to MP3 for bandwidth-friendly responses. 11 | - 📊 **Performance Metrics**: Track synthesis time and cache hits for in-depth insights. 12 | - ⚡️ **Low Latency**: Streamlined pipeline for minimal delay, delivering fast results! 13 | - 🌟 **Unified Interface**: Use various text-to-speech services through a unified interface — now with multi-language support! 🌏 14 | 15 | 16 | ## 🎁 Installation 17 | 18 | ```sh 19 | pip install speech-gateway 20 | ``` 21 | 22 | To use MP3 format conversion, you also need to install ffmpeg on your computer.
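MP3 conversion relies on the ffmpeg binary, so it should be discoverable on your `PATH`. A quick way to check is the standard version command (a generic ffmpeg invocation, nothing specific to this project):

```sh
# Prints the version banner if ffmpeg is installed and on your PATH
ffmpeg -version
```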
23 | 24 | 25 | ## 🚀 Start server 26 | 27 | Create a script like the following example: 28 | 29 | ```python 30 | from contextlib import asynccontextmanager 31 | from fastapi import FastAPI 32 | from speech_gateway.gateway.voicevox import VoicevoxGateway 33 | from speech_gateway.gateway.sbv2 import StyleBertVits2Gateway 34 | from speech_gateway.gateway.nijivoice import NijiVoiceGateway 35 | 36 | # Create gateways 37 | voicevox_gateway = VoicevoxGateway(base_url="http://127.0.0.1:10101", debug=True) 38 | sbv2_gateway = StyleBertVits2Gateway(base_url="http://127.0.0.1:5000", debug=True) 39 | nijivoice_gateway = NijiVoiceGateway(api_key=NIJIVOICE_API_KEY, prefix="/nijivoice", debug=True) 40 | 41 | # Shut down gateways on app shutdown 42 | @asynccontextmanager 43 | async def lifespan(app: FastAPI): 44 |     yield 45 |     await voicevox_gateway.shutdown() 46 |     await sbv2_gateway.shutdown() 47 |     await nijivoice_gateway.shutdown() 48 | 49 | # Create app with the lifespan handler 50 | app = FastAPI(lifespan=lifespan) 51 | 52 | # Add gateways to app 53 | app.include_router(voicevox_gateway.get_router(), prefix="/aivisspeech") 54 | app.include_router(sbv2_gateway.get_router(), prefix="/sbv2") 55 | app.include_router(nijivoice_gateway.get_router(), prefix="/nijivoice") 56 | ``` 57 | 58 | Then, run it with uvicorn: 59 | 60 | ``` 61 | uvicorn run:app --port 8000 62 | ``` 63 | 64 | In this example, you can access AivisSpeech at http://127.0.0.1:8000/aivisspeech, Style-Bert-VITS2 at http://127.0.0.1:8000/sbv2, and NijiVoice at http://127.0.0.1:8000/nijivoice. 65 | 66 | **NOTE**: If you want to perform MP3 conversion, make sure to include `x_audio_format=mp3` as a query parameter in your request. 67 | 68 | 69 | ## 🌟 Unified Interface 70 | 71 | You can use various text-to-speech services through a unified interface specification. 72 | Below is an example of providing a unified interface for AivisSpeech, Style-Bert-VITS2, and NijiVoice. 73 | 74 | ```python 75 | from speech_gateway.gateway.unified import UnifiedGateway 76 | 77 | # Create UnifiedGateway and add gateways with their service names 78 | unified_gateway = UnifiedGateway(debug=True) 79 | unified_gateway.add_gateway("aivisspeech", aivisspeech_gateway, True)  # Set as default gateway 80 | unified_gateway.add_gateway("sbv2", sbv2_gateway) 81 | unified_gateway.add_gateway("nijivoice", nijivoice_gateway) 82 | 83 | # Add unified interface router 84 | app.include_router(unified_gateway.get_router()) 85 | ``` 86 | 87 | ### Parameters 88 | 89 | POST a JSON object with the following fields: 90 | 91 | | Parameter | Type | Required | Description | 92 | |---------------|--------|----------|---------------------------------------------------------------------------------------------| 93 | | `text` | string | Required | The text to be synthesized into speech. | 94 | | `speaker` | string | Optional | The unique identifier for the voice in each speech service.<br/>For Style-Bert-VITS2, specify as `{model_id}-{speaker_id}`.<br/>If omitted, the default speaker of the speech service will be used. | 95 | | `style` | string | Optional | A predefined set of voice styles that includes `neutral`, `joy`, `angry`, `sorrow`, `fun`, and `surprised`. | 96 | | `speed` | float | Optional | The speed of synthesized speech, where 1.0 is normal speed. The acceptable range depends on each speech service. | 97 | | `service_name` | string | Optional | The name of the service as specified in `add_gateway`.<br/>If omitted, the default gateway will be used. | 98 | | `language` | string | Optional | The language code in ISO 639-1 format combined with a country code (e.g., `en-US`). The corresponding text-to-speech service will be used.<br/>If omitted, the default gateway will be used. | 99 | 100 | ### Client code 101 | 102 | You can access the services in a unified manner, as shown in the client code below: 103 | 104 | ```python 105 | import httpx 106 | 107 | req = {"text": "こんにちは。これはデフォルトサービスだよ。", "speaker": "888753761"} 108 | # req = {"text": "こんにちは。これはAivisSpeechだよ。", "speaker": "888753761", "service_name": "aivisspeech"} 109 | # req = {"text": "こんにちは。これはStyle-Bert-VITS2だよ。", "speaker": "0-0", "service_name": "sbv2"} 110 | # req = {"text": "こんにちは。これはにじボイスだよ。", "speaker": "a192db5f-bd8b-4fc7-bc08-af5ca5957c12", "service_name": "nijivoice"} 111 | 112 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req, timeout=60) 113 | 114 | with open("tts.wav", "wb") as f: 115 |     f.write(resp.content) 116 | ``` 117 | 118 | **NOTE**: Due to the unified specification, it is not possible to use features specific to each text-to-speech service (e.g., intonation adjustment or pitch variation control). If you need high-quality speech synthesis utilizing such features, please use the individual service interfaces. 119 | 120 | 121 | ### Applying Style 122 | 123 | Define styles on the server side. 124 | 125 | ```python 126 | aivisspeech_gateway = VoicevoxGateway(base_url="http://127.0.0.1:10101", debug=True) 127 | # Define speakers for each style 128 | aivisspeech_gateway.style_mapper["888753761"] = { 129 |     "joy": "888753764", 130 |     "angry": "888753765", 131 |     "sorrow": "888753765", 132 |     "fun": "888753762", 133 |     "surprised": "888753762" 134 | } 135 | 136 | sbv2_gateway = StyleBertVits2Gateway(base_url="http://127.0.0.1:5000", debug=True) 137 | # Define style names for each style 138 | sbv2_gateway.style_mapper["0-0"] = { 139 |     "joy": "上機嫌", 140 |     "angry": "怒り・悲しみ", 141 |     "sorrow": "怒り・悲しみ", 142 |     "fun": "テンション高め", 143 |     "surprised": "テンション高め" 144 | } 145 | ``` 146 | 147 | Call with a style from the client. 148 | 149 | ```python 150 | req = {"service_name": "aivisspeech", "text": "こんにちは。これはデフォルトサービスだよ。", "speaker": "888753761", "style": "angry"} 151 | # req = {"service_name": "sbv2", "text": "こんにちは。これはStyle-Bert-VITS2だよ。", "speaker": "0-0", "style": "angry"} 152 | 153 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req, timeout=60) 154 | 155 | with open("tts.wav", "wb") as f: 156 |     f.write(resp.content) 157 | ``` 158 | 159 | 160 | ### Multi-language Support 161 | 162 | You can configure the system to use the appropriate speech service based on the language, without explicitly specifying the service name. 163 | By passing `languages` to `add_gateway`, you can register a speech service that corresponds to the `language` specified in the request. Additionally, by registering a `default_speaker`, you can eliminate the need to specify a `speaker` in each request. 164 | 165 | ```python 166 | # Gateway for default language (ja-JP) - Voice: 888753761 167 | unified_gateway.add_gateway("aivisspeech", aivisspeech_gateway, default_speaker="888753761", default=True) 168 | 169 | # Gateway for en-US and zh-CN - Voice: Alloy 170 | unified_gateway.add_gateway("openai", openai_gateway, languages=["en-US", "zh-CN"], default_speaker="alloy") 171 | ``` 172 | 173 | Here is an example of client code to call this API. Switching the `language` enables easy support for multiple languages.
174 | 175 | ```python 176 | import httpx 177 | 178 | # Simply set the text and language - easily switch between multiple languages 179 | req = {"text": "こんにちは。これはデフォルトサービスだよ。"} 180 | # req = {"text": "Hello. This is the speech service for English.", "language": "en-US"} 181 | # req = {"text": "你好,这是英语的语音服务。", "language": "zh-CN"} 182 | 183 | resp = httpx.post("http://127.0.0.1:8000/tts", json=req, timeout=60) 184 | 185 | with open("tts.wav", "wb") as f: 186 | f.write(resp.content) 187 | ``` 188 | 189 | 190 | ## 🛠️ Customization 191 | 192 | You can add new speech synthesis services to relay. 193 | Additionally, you can extend the cache store, audio format converter, and performance recorder. For example, the default cache store uses the file system, but you can replace it with a cloud storage service or another alternative. 194 | 195 | We’ll provide documentation for these customizations as the need arises, so if you have specific requests, please open an issue! 🙏 196 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------