├── .python-version ├── .gitignore ├── LICENSE ├── pyproject.toml ├── .dockerignore ├── README.md ├── Dockerfile ├── tts.py ├── main.py ├── app.py └── templates └── index.html /.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pth 2 | *.onnx 3 | __pycache__/ 4 | .venv/ 5 | .DS_Store 6 | sample_output.wav 7 | output_unvocalized.wav 8 | output_*.wav -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | License: 2 | This dataset is licensed under CC BY-NC 4.0, with an additional restriction: 3 | It is intended only for academic research and educational use. 4 | 5 | Commercial use and non-academic non-commercial use are not permitted. 6 | For any other use, please contact the dataset creators. 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "phonikud-styletts2-dockerized" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.11" 7 | dependencies = [ 8 | "flask>=3.0.0", 9 | "flask-restx>=1.3.0", 10 | "phonikud", 11 | "phonikud-onnx>=1.0.4", 12 | "soundfile>=0.13.1", 13 | "stts2-light", 14 | ] 15 | 16 | [tool.uv.sources] 17 | stts2-light = { path = "StyleTTS2-lite" } 18 | phonikud = { git = "https://github.com/thewh1teagle/phonikud" } 19 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Git and version control 2 | .git 3 | .gitignore 4 | .github 5 | 6 | # Python cache and virtual environments 7 | __pycache__ 8 | *.pyc 9 | *.pyo 10 | *.pyd 11 | .Python 12 | .venv 13 | venv/ 14 | env/ 15 | ENV/ 16 | 17 | # IDE and editor files 18 | .vscode/ 19 | .idea/ 20 | *.swp 21 | *.swo 22 | *~ 23 | 24 | # OS generated files 25 | .DS_Store 26 | .DS_Store? 27 | ._* 28 | .Spotlight-V100 29 | .Trashes 30 | ehthumbs.db 31 | Thumbs.db 32 | 33 | # Documentation and readme files 34 | README.md 35 | *.md 36 | docs/ 37 | 38 | # Build artifacts 39 | build/ 40 | dist/ 41 | *.egg-info/ 42 | 43 | # Development and testing 44 | .pytest_cache/ 45 | .coverage 46 | .tox/ 47 | .mypy_cache/ 48 | .ruff_cache/ 49 | 50 | # Docker files 51 | Dockerfile* 52 | docker-compose* 53 | .dockerignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # phonikud-StyleTTS2-dockerized 2 | 3 | 4 | https://github.com/user-attachments/assets/bd6aae78-feb5-4896-923a-4fe77e1b5f61 5 | 6 | 7 | 8 | 9 | ## Prepare models 10 | 11 | ```console 12 | wget https://huggingface.co/thewh1teagle/phonikud-onnx/resolve/main/phonikud-1.0.int8.onnx 13 | wget https://huggingface.co/thewh1teagle/phonikud-tts-checkpoints/resolve/main/saspeech_automatic_stts2-light_epoch_00010.pth 14 | ``` 15 | 16 | ## Setup without Docker 17 | 18 | 1. Install https://docs.astral.sh/uv/getting-started/installation 19 | 2. Run 20 | ```console 21 | uv sync 22 | uv run main.py 23 | ``` 24 | 25 | ## Setup with Docker 26 | 27 | ```console 28 | wget https://github.com/thewh1teagle/StyleTTS2-lite branch: hebrew2 29 | docker build --platform linux/amd64 -t phonikud-styletts2-app . 30 | docker run -p 7860:7860 phonikud-styletts2-app 31 | ``` 32 | 33 | ## License 34 | 35 | Non commercial. See [LICENSE](LICENSE) 36 | 37 | Trained on data from OpenSLR Dataset 134, released under CC BY-NC-SA 4.0 38 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use uv base image with Python 3.11 2 | FROM ghcr.io/astral-sh/uv:python3.11-bookworm-slim AS builder 3 | 4 | # Install git and build tools for dependencies that need compilation 5 | RUN apt-get update && apt-get install -y \ 6 | git \ 7 | build-essential \ 8 | gcc \ 9 | g++ \ 10 | && rm -rf /var/lib/apt/lists/* 11 | 12 | # Set working directory 13 | WORKDIR /app 14 | 15 | # Set environment variables for optimal uv performance 16 | ENV UV_COMPILE_BYTECODE=1 \ 17 | UV_LINK_MODE=copy 18 | 19 | # Install dependencies first (better caching) 20 | RUN --mount=type=cache,target=/root/.cache/uv \ 21 | --mount=type=bind,source=uv.lock,target=uv.lock \ 22 | --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ 23 | uv sync --frozen --no-install-project 24 | 25 | # Copy source code 26 | COPY . /app 27 | 28 | # Install the project 29 | RUN --mount=type=cache,target=/root/.cache/uv \ 30 | uv sync --frozen 31 | 32 | # Production stage 33 | FROM python:3.11-slim-bookworm 34 | 35 | # Copy the application and virtual environment 36 | COPY --from=builder /app /app 37 | 38 | # Set environment variables 39 | ENV PATH="/app/.venv/bin:$PATH" \ 40 | PYTHONPATH="/app" \ 41 | PYTHONDONTWRITEBYTECODE=1 \ 42 | PYTHONUNBUFFERED=1 43 | 44 | # Set working directory 45 | WORKDIR /app 46 | 47 | # Expose port for Gradio 48 | EXPOSE 7860 49 | 50 | # Run the application 51 | CMD ["python", "app.py"] 52 | -------------------------------------------------------------------------------- /tts.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import numpy as np 4 | from pathlib import Path 5 | from functools import lru_cache 6 | 7 | # Add parent directory to path to import StyleTTS2 8 | root_dir = Path(__file__).parent / 'StyleTTS2-lite' 9 | sys.path.append(str(root_dir)) 10 | from inference import StyleTTS2 11 | 12 | 13 | class TextToSpeech: 14 | def __init__(self, config_path, models_path): 15 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 16 | self.config_path = config_path 17 | self.models_path = models_path 18 | self.model = StyleTTS2(config_path, models_path).eval().to(self.device) 19 | 20 | @lru_cache(maxsize=128) 21 | def get_styles(self, speaker_path, speed, denoise, avg_style): 22 | """Get styles from speaker audio with LRU caching""" 23 | speaker = { 24 | "path": speaker_path, 25 | "speed": speed 26 | } 27 | with torch.no_grad(): 28 | return self.model.get_styles(speaker, denoise, avg_style) 29 | 30 | def _create(self, phonemes, styles, stabilize=True, alpha=18): 31 | """Generate audio from phonemes and styles""" 32 | with torch.no_grad(): 33 | audio = self.model.generate(phonemes, styles, stabilize, alpha) 34 | # Normalize audio 35 | audio = audio / np.max(np.abs(audio)) 36 | return audio 37 | 38 | def create(self, phonemes, speaker_path, speed=0.82, denoise=0.2, avg_style=True, stabilize=True, alpha=18): 39 | """Complete synthesis pipeline from phonemes to audio with cached styles""" 40 | # Use cached style extraction 41 | styles = self.get_styles(speaker_path, speed, denoise, avg_style) 42 | audio = self._create(phonemes, styles, stabilize, alpha) 43 | return audio -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | """ 2 | wget https://huggingface.co/thewh1teagle/phonikud-onnx/resolve/main/phonikud-1.0.int8.onnx 3 | wget https://huggingface.co/thewh1teagle/phonikud-tts-checkpoints/resolve/main/saspeech_automatic_stts2-light_epoch_00010.pth 4 | wget https://github.com/thewh1teagle/StyleTTS2-lite branch: hebrew2 5 | """ 6 | 7 | """ 8 | Script to create sample audio files for all WAV files in Demo/Audio directory 9 | """ 10 | import soundfile as sf 11 | from pathlib import Path 12 | from phonikud_onnx import Phonikud 13 | import phonikud 14 | from tts import TextToSpeech 15 | 16 | phonikud_model = Phonikud('phonikud-1.0.int8.onnx') 17 | 18 | default_text = """ 19 | ירושלים היא עיר עתיקה וחשובה במיוחד, שמכילה בתוכה שכבות רבות של היסטוריה, תרבות ורוחניות שנמשכות אלפי שנים, והיא מהווה מוקד מרכזי לשלושת הדתות הגדולות, יהדות, נצרות, ואסלאם. שמתחברות יחד במקום אחד ייחודי, מלא אנרגיה ומורכבות, שם אפשר למצוא אתרים קדושים, שכונות עתיקות ושווקים צבעוניים, וכל פינה מספרת סיפור של תקופות שונות, אנשים שונים ואירועים שהשפיעו על ההיסטוריה של העולם כולו, מה שהופך את ירושלים לא רק לעיר גאוגרפית, אלא גם למרכז של זהות, אמונה, וזיכרון קולקטיבי שממשיך לעורר השראה ולחבר בין אנשים מרקע שונה מכל קצוות תבל. 20 | """.strip() 21 | 22 | def phonemize(vocalized): 23 | phonemes = phonikud.phonemize(vocalized) 24 | return phonemes 25 | 26 | def main(): 27 | # Create samples directory 28 | samples_dir = Path("samples") 29 | samples_dir.mkdir(exist_ok=True) 30 | 31 | # Setup TTS model 32 | config_path = str(Path("StyleTTS2-lite") / "Configs" / "config.yaml") 33 | models_path = 'saspeech_automatic_stts2-light_epoch_00010.pth' 34 | tts = TextToSpeech(config_path, models_path) 35 | 36 | # Sample text to use for audio generation 37 | text = default_text 38 | vocalized = phonikud_model.add_diacritics(text) 39 | phonemes = phonemize(vocalized) 40 | 41 | # Parameters 42 | speed = 0.82 43 | denoise = 0.2 44 | avg_style = True 45 | stabilize = True 46 | 47 | # Use hardcoded reference audio file 48 | ref_audio_path = "StyleTTS2-lite/Demo/Audio/10_michael.wav" 49 | 50 | print(f"Processing reference audio: {ref_audio_path}") 51 | 52 | try: 53 | # Use the TTS synthesize method 54 | audio = tts.create( 55 | phonemes=phonemes, 56 | speaker_path=ref_audio_path, 57 | speed=speed, 58 | denoise=denoise, 59 | avg_style=avg_style, 60 | stabilize=stabilize, 61 | alpha=18 62 | ) 63 | 64 | # Create output filename 65 | output_name = "sample_output.wav" 66 | output_path = samples_dir / output_name 67 | 68 | # Save audio 69 | sr = 24000 70 | sf.write(str(output_path), audio, sr) 71 | print(f"Created {output_name}") 72 | 73 | except Exception as e: 74 | print(f"Error processing {ref_audio_path}: {e}") 75 | 76 | print(f"\nSample created in {samples_dir} directory") 77 | 78 | 79 | if __name__ == "__main__": 80 | main() -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request, jsonify, send_file 2 | from flask_restx import Api, Resource, fields 3 | from pathlib import Path 4 | import soundfile as sf 5 | from phonikud_onnx import Phonikud 6 | import phonikud 7 | from tts import TextToSpeech 8 | import os 9 | 10 | app = Flask(__name__) 11 | 12 | # Configure Flask-RESTX 13 | api = Api( 14 | app, 15 | version='1.0', 16 | title='StyleTTS2 Hebrew TTS API', 17 | description='Hebrew Text-to-Speech API using StyleTTS2 and Phonikud', 18 | doc='/api/', 19 | prefix='/api' 20 | ) 21 | 22 | # Initialize models 23 | phonikud_model = Phonikud('phonikud-1.0.int8.onnx') 24 | 25 | # Setup TTS model 26 | config_path = str(Path("StyleTTS2-lite") / "Configs" / "config.yaml") 27 | models_path = 'saspeech_automatic_stts2-light_epoch_00010.pth' 28 | tts = TextToSpeech(config_path, models_path) 29 | 30 | # Create samples directory 31 | samples_dir = Path("samples") 32 | samples_dir.mkdir(exist_ok=True) 33 | 34 | # API Models 35 | generate_model = api.model('GenerateRequest', { 36 | 'text': fields.String(required=True, description='The input text'), 37 | 'type': fields.String(required=False, description='Input type: phonemes, unvocalized, or vocalized (default: unvocalized)', 38 | enum=['phonemes', 'unvocalized', 'vocalized'], default='unvocalized'), 39 | 'ref_audio': fields.String(required=False, description='Reference audio filename (default: 10_michael.wav)', default='10_michael.wav') 40 | }) 41 | 42 | generate_response = api.model('GenerateResponse', { 43 | 'success': fields.Boolean(description='Whether generation was successful'), 44 | 'filename': fields.String(description='Generated audio filename'), 45 | 'phonemes': fields.String(description='Generated phonemes'), 46 | 'vocalized_text': fields.String(description='Vocalized Hebrew text (if applicable)') 47 | }) 48 | 49 | error_response = api.model('ErrorResponse', { 50 | 'error': fields.String(description='Error message') 51 | }) 52 | 53 | def get_reference_audio_files(): 54 | """Get all WAV files from StyleTTS2-lite/Demo/Audio/""" 55 | audio_dir = Path("StyleTTS2-lite/Demo/Audio") 56 | if audio_dir.exists(): 57 | return sorted([f.name for f in audio_dir.glob("*.wav")]) 58 | return [] 59 | 60 | def phonemize_text(text): 61 | """Convert text to phonemes""" 62 | vocalized = phonikud_model.add_diacritics(text) 63 | phonemes = phonikud.phonemize(vocalized) 64 | return phonemes 65 | 66 | def vocalize_and_phonemize(text): 67 | """Vocalize text and convert to phonemes""" 68 | vocalized = phonikud_model.add_diacritics(text) 69 | phonemes = phonikud.phonemize(vocalized) 70 | return phonemes 71 | 72 | @app.route('/') 73 | def index(): 74 | """Serve the main page""" 75 | audio_files = get_reference_audio_files() 76 | return render_template('index.html', audio_files=audio_files) 77 | 78 | # API namespace 79 | ns = api.namespace('tts', description='Text-to-Speech operations') 80 | 81 | @ns.route('/generate') 82 | class GenerateAudio(Resource): 83 | @api.expect(generate_model) 84 | @api.marshal_with(generate_response, code=200) 85 | @api.marshal_with(error_response, code=400) 86 | @api.marshal_with(error_response, code=500) 87 | def post(self): 88 | """Generate audio from text input""" 89 | try: 90 | data = request.json 91 | if not data: 92 | return {'error': 'No JSON data provided'}, 400 93 | 94 | input_text = data.get('text', '').strip() 95 | input_type = data.get('type', 'unvocalized') 96 | ref_audio = data.get('ref_audio', '10_michael.wav') 97 | 98 | if not input_text: 99 | return {'error': 'Text input is required'}, 400 100 | 101 | if not ref_audio: 102 | return {'error': 'Reference audio file is required'}, 400 103 | 104 | # Process input based on type 105 | vocalized_text = None 106 | if input_type == 'phonemes': 107 | phonemes = input_text 108 | elif input_type == 'unvocalized': 109 | vocalized_text = phonikud_model.add_diacritics(input_text) 110 | phonemes = phonikud.phonemize(vocalized_text) 111 | else: # vocalized 112 | vocalized_text = input_text 113 | phonemes = phonikud.phonemize(input_text) 114 | 115 | # Reference audio path 116 | ref_audio_path = str(Path("StyleTTS2-lite/Demo/Audio") / ref_audio) 117 | 118 | # Generate audio 119 | audio = tts.create( 120 | phonemes=phonemes, 121 | speaker_path=ref_audio_path, 122 | speed=0.82, 123 | denoise=0.2, 124 | avg_style=True, 125 | stabilize=True, 126 | alpha=18 127 | ) 128 | 129 | # Save audio 130 | output_filename = f"output_{input_type}.wav" 131 | output_path = samples_dir / output_filename 132 | sr = 24000 133 | sf.write(str(output_path), audio, sr) 134 | 135 | return { 136 | 'success': True, 137 | 'filename': output_filename, 138 | 'phonemes': phonemes, 139 | 'vocalized_text': vocalized_text 140 | } 141 | 142 | except Exception as e: 143 | return {'error': str(e)}, 500 144 | 145 | @ns.route('/voices') 146 | class GetVoices(Resource): 147 | @api.marshal_with(api.model('VoicesResponse', { 148 | 'voices': fields.List(fields.String, description='Available voice files') 149 | })) 150 | def get(self): 151 | """Get list of available reference voices""" 152 | voices = get_reference_audio_files() 153 | return {'voices': voices} 154 | 155 | # Keep the original route for the web interface 156 | @app.route('/generate', methods=['POST']) 157 | def generate_audio_web(): 158 | """Generate audio based on input type (for web interface)""" 159 | try: 160 | data = request.json 161 | if not data: 162 | return jsonify({'error': 'No JSON data provided'}), 400 163 | 164 | input_text = data.get('text', '').strip() 165 | input_type = data.get('type', 'vocalized') 166 | ref_audio = data.get('ref_audio', '') 167 | 168 | if not input_text: 169 | return jsonify({'error': 'Text input is required'}), 400 170 | 171 | if not ref_audio: 172 | return jsonify({'error': 'Reference audio file is required'}), 400 173 | 174 | # Process input based on type 175 | vocalized_text = None 176 | if input_type == 'phonemes': 177 | phonemes = input_text 178 | elif input_type == 'unvocalized': 179 | vocalized_text = phonikud_model.add_diacritics(input_text) 180 | phonemes = phonikud.phonemize(vocalized_text) 181 | else: # vocalized 182 | vocalized_text = input_text 183 | phonemes = phonikud.phonemize(input_text) 184 | 185 | # Reference audio path 186 | ref_audio_path = str(Path("StyleTTS2-lite/Demo/Audio") / ref_audio) 187 | 188 | # Generate audio 189 | audio = tts.create( 190 | phonemes=phonemes, 191 | speaker_path=ref_audio_path, 192 | speed=0.82, 193 | denoise=0.2, 194 | avg_style=True, 195 | stabilize=True, 196 | alpha=18 197 | ) 198 | 199 | # Save audio 200 | output_filename = f"output_{input_type}.wav" 201 | output_path = samples_dir / output_filename 202 | sr = 24000 203 | sf.write(str(output_path), audio, sr) 204 | 205 | return jsonify({ 206 | 'success': True, 207 | 'filename': output_filename, 208 | 'phonemes': phonemes, 209 | 'vocalized_text': vocalized_text 210 | }) 211 | 212 | except Exception as e: 213 | return jsonify({'error': str(e)}), 500 214 | 215 | @app.route('/audio/') 216 | def serve_audio(filename): 217 | """Serve audio file for playback""" 218 | file_path = samples_dir / filename 219 | if file_path.exists(): 220 | return send_file(file_path, mimetype='audio/wav') 221 | return jsonify({'error': 'File not found'}), 404 222 | 223 | if __name__ == '__main__': 224 | app.run(host='0.0.0.0', port=7860, debug=True) -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | StyleTTS2 - Hebrew TTS 7 | 101 | 102 | 103 |
104 |
105 | 106 | 🔗 See Phonikud Project 107 | 108 |
109 |

StyleTTS2 - Hebrew Text-to-Speech

110 | 111 |
112 | 113 | 118 |
119 | 120 |
121 | 122 | 123 | 126 |
127 | 128 |
129 | 130 | 131 | 134 |
135 | 136 |
137 | 138 | 139 | 142 |
143 | 144 |
145 | 146 |
147 | 148 | 📖 API Documentation (Swagger) 149 | 150 |
151 |
152 | 153 | 240 | 241 | --------------------------------------------------------------------------------