├── .gitignore ├── demo.mp4 ├── emotions.txt ├── finetune ├── config.yaml └── train.py ├── pretrain ├── config.yaml ├── readme.md └── train.py ├── realtime_streaming_example ├── client.html └── main.py ├── orpheus_wrapper.py ├── setup_orpheus.sh ├── README.md └── orpheus.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store -------------------------------------------------------------------------------- /demo.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Saganaki22/OrpheusTTS-WebUI/HEAD/demo.mp4 -------------------------------------------------------------------------------- /emotions.txt: -------------------------------------------------------------------------------- 1 | happy 2 | normal 3 | digust 4 | disgust 5 | longer 6 | sad 7 | frustrated 8 | slow 9 | excited 10 | whisper 11 | panicky 12 | curious 13 | surprise 14 | fast 15 | crying 16 | deep 17 | sleepy 18 | angry 19 | high 20 | shout -------------------------------------------------------------------------------- /finetune/config.yaml: -------------------------------------------------------------------------------- 1 | # CHANGE THIS TO YOUR OWN DATASET 2 | TTS_dataset: 3 | 4 | model_name: "canopylabs/orpheus-tts-0.1-pretrained" 5 | 6 | # Training Args 7 | epochs: 1 8 | batch_size: 1 9 | number_processes: 1 10 | pad_token: 128263 11 | save_steps: 5000 12 | learning_rate: 5.0e-5 13 | 14 | # Naming and paths 15 | save_folder: "checkpoints" 16 | project_name: "tuning-orpheus" 17 | run_name: "5e5-0" 18 | -------------------------------------------------------------------------------- /pretrain/config.yaml: -------------------------------------------------------------------------------- 1 | # Model 2 | model_name: "meta-llama/Llama-3.2-3B-Instruct" # Replace with your base model; it must be compatible with the tokenizer and the transformers library 3 | tokenizer_name: "meta-llama/Llama-3.2-3B-Instruct" 4 | 5 | # Training Args 6 | epochs: 1 7 | batch_size: 1 8 | number_processes: 8 9 | pad_token: 128263 10 | save_steps: 12000 11 | learning_rate: 5.0e-5 12 | ratio: 13 | 14 | # Datasets 15 | text_QA_dataset: 16 | TTS_dataset: 17 | 18 | # Naming and paths 19 | save_folder: "checkpoints" 20 | project_name: "pretrain-orpheus" 21 | run_name: "pretrain-orpheus" 22 | -------------------------------------------------------------------------------- /pretrain/readme.md: -------------------------------------------------------------------------------- 1 | # Pretraining 2 | ## Overview 3 | We find that preserving good semantic understanding of text boosts the model's ability to speak naturally and empathetically. We propose training the model on batches of speech and text. If you want the model to retain a large part of its text ability - i.e. to function as an end-to-end speech model - you should keep the ratio of text batches to speech batches at 2:1 to start and then gradually decrease it to 1:1 throughout training. If your model is trained just for TTS, start at 1:1 and gradually decrease to 0:1. 4 | 5 | 6 | ### Disclaimer 7 | 8 | This code was copied and pasted into this repo quickly, so there may be bugs. The general outline should be pretty straightforward. It's also set up for multinode training. 9 | 10 | Depending on how good you want the model's reasoning abilities to be (and what specifically you want to retain), you can choose which text-based dataset you use. Using simple datasets with QA pairs (for finetuning like ) works pretty well. You can also try using wikipedia - to boost the 11 |
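For concreteness, here is a minimal sketch (not part of the repo) of how a fixed text:speech ratio of 2 lays out batches. It mirrors the cycle logic of `BatchedRatioDataset` in `pretrain/train.py` below, which reads a fixed `ratio` from `pretrain/config.yaml`; the gradual 2:1 to 1:1 decay described above is not implemented there and would have to be added on top.

```python
# Minimal illustration of a fixed text:speech batch ratio (here 2:1).
# With ratio = 2, every cycle draws two text batches followed by one speech batch.

def batch_schedule(num_cycles: int, ratio: int = 2):
    """Yield 'text' / 'speech' markers in the order batches are drawn."""
    for _ in range(num_cycles):
        for _ in range(ratio):
            yield "text"
        yield "speech"

if __name__ == "__main__":
    # Two cycles at ratio 2 -> ['text', 'text', 'speech', 'text', 'text', 'speech']
    print(list(batch_schedule(num_cycles=2, ratio=2)))
```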
-------------------------------------------------------------------------------- /realtime_streaming_example/client.html: -------------------------------------------------------------------------------- [client.html: markup not reproduced here - the page is titled "Streaming Audio Playback" and provides a minimal browser client for the streaming /tts server in realtime_streaming_example/main.py below.]
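Since the client page cannot be reproduced, the streaming endpoint can also be exercised with a short script such as the sketch below. It assumes `main.py` (shown further down) is running locally on port 8080 and that the `requests` package - not part of this repo's dependencies - is installed.

```python
# Minimal sketch: save the streamed WAV response from the /tts endpoint to a file.
# Assumes realtime_streaming_example/main.py is running on localhost:8080 and that
# the third-party `requests` package is installed.
import requests

params = {"prompt": "Hey there, this is a quick streaming test."}
with requests.get("http://localhost:8080/tts", params=params, stream=True, timeout=300) as resp:
    resp.raise_for_status()
    with open("stream_test.wav", "wb") as f:
        # The server yields a WAV header first, then raw PCM chunks, so the bytes
        # can be written to disk exactly as they arrive.
        for chunk in resp.iter_content(chunk_size=4096):
            if chunk:
                f.write(chunk)
print("Saved stream_test.wav")
```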
14 | 15 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /finetune/train.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer 3 | import numpy as np 4 | import yaml 5 | import wandb 6 | 7 | config_file = "config.yaml" 8 | 9 | with open(config_file, "r") as file: 10 | config = yaml.safe_load(file) 11 | 12 | dsn = config["TTS_dataset"] 13 | 14 | model_name = config["model_name"] 15 | run_name = config["run_name"] 16 | project_name = config["project_name"] 17 | base_repo_id = config["save_folder"] 18 | epochs = config["epochs"] 19 | batch_size = config["batch_size"] 20 | save_steps = config["save_steps"] 21 | pad_token = config["pad_token"] 22 | number_processes = config["number_processes"] 23 | learning_rate = config["learning_rate"] 24 | 25 | tokenizer = AutoTokenizer.from_pretrained(model_name) 26 | model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation="flash_attention_2") 27 | 28 | 29 | ds = load_dataset(dsn, split="train") 30 | 31 | wandb.init(project=project_name, name = run_name) 32 | 33 | training_args = TrainingArguments( 34 | overwrite_output_dir=True, 35 | num_train_epochs=epochs, 36 | per_device_train_batch_size=batch_size, 37 | logging_steps=1, 38 | bf16=True, 39 | output_dir=f"./{base_repo_id}", 40 | report_to="wandb", 41 | save_steps=save_steps, 42 | remove_unused_columns=True, 43 | learning_rate=learning_rate, 44 | ) 45 | 46 | trainer = Trainer( 47 | model=model, 48 | args=training_args, 49 | train_dataset=ds, 50 | ) 51 | 52 | trainer.train() 53 | 54 | -------------------------------------------------------------------------------- /realtime_streaming_example/main.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, Response, request 2 | import struct 3 | from orpheus_tts import OrpheusModel 4 | 5 | app = Flask(__name__) 6 | engine = OrpheusModel(model_name="canopylabs/orpheus-tts-0.1-finetune-prod") 7 | 8 | def create_wav_header(sample_rate=24000, bits_per_sample=16, channels=1): 9 | byte_rate = sample_rate * channels * bits_per_sample // 8 10 | block_align = channels * bits_per_sample // 8 11 | 12 | data_size = 0 13 | 14 | header = struct.pack( 15 | '<4sI4s4sIHHIIHH4sI', 16 | b'RIFF', 17 | 36 + data_size, 18 | b'WAVE', 19 | b'fmt ', 20 | 16, 21 | 1, 22 | channels, 23 | sample_rate, 24 | byte_rate, 25 | block_align, 26 | bits_per_sample, 27 | b'data', 28 | data_size 29 | ) 30 | return header 31 | 32 | @app.route('/tts', methods=['GET']) 33 | def tts(): 34 | prompt = request.args.get('prompt', 'Hey there, looks like you forgot to provide a prompt!') 35 | 36 | def generate_audio_stream(): 37 | yield create_wav_header() 38 | 39 | syn_tokens = engine.generate_speech( 40 | prompt=prompt, 41 | voice="tara", 42 | repetition_penalty=1.1, 43 | stop_token_ids=[128258], 44 | max_tokens=2000, 45 | temperature=0.4, 46 | top_p=0.9 47 | ) 48 | for chunk in syn_tokens: 49 | yield chunk 50 | 51 | return Response(generate_audio_stream(), mimetype='audio/wav') 52 | 53 | if __name__ == '__main__': 54 | app.run(host='0.0.0.0', port=8080, threaded=True) 55 | -------------------------------------------------------------------------------- /orpheus_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Wrapper script for Orpheus TTS 
to enforce vLLM configuration. 4 | """ 5 | import os 6 | import sys 7 | import logging 8 | 9 | # Set environment variables to control vLLM 10 | os.environ["VLLM_MAX_MODEL_LEN"] = "100000" 11 | os.environ["VLLM_GPU_MEMORY_UTILIZATION"] = "0.9" 12 | os.environ["VLLM_DISABLE_LOGGING"] = "1" 13 | os.environ["VLLM_NO_USAGE_STATS"] = "1" 14 | os.environ["VLLM_DO_NOT_TRACK"] = "1" 15 | os.environ["GRADIO_ANALYTICS_ENABLED"] = "0" 16 | 17 | try: 18 | # Import the necessary modules 19 | from vllm.engine.arg_utils import EngineArgs 20 | from vllm.engine.async_llm_engine import AsyncLLMEngine 21 | from orpheus_tts.engine_class import OrpheusModel 22 | 23 | # Store the original from_engine_args method 24 | original_from_engine_args = AsyncLLMEngine.from_engine_args 25 | 26 | # Define a patched version that doesn't use disable_log_requests 27 | def patched_from_engine_args(engine_args, **kwargs): 28 | # Override the max_model_len in engine_args 29 | engine_args.max_model_len = 100000 30 | engine_args.gpu_memory_utilization = 0.9 31 | 32 | print(f"Patched from_engine_args called with max_model_len={engine_args.max_model_len}") 33 | 34 | # Call the original without any extra kwargs 35 | return original_from_engine_args(engine_args) 36 | 37 | # Replace the class method 38 | AsyncLLMEngine.from_engine_args = staticmethod(patched_from_engine_args) 39 | print("Successfully patched AsyncLLMEngine.from_engine_args") 40 | 41 | except Exception as e: 42 | print(f"Warning: Failed to patch AsyncLLMEngine: {e}") 43 | 44 | # Now import and run the Orpheus app 45 | print("Starting Orpheus TTS...") 46 | 47 | # Import the Gradio app 48 | import orpheus 49 | 50 | # Actually run the Gradio app 51 | if __name__ == "__main__": 52 | demo = orpheus.create_ui() 53 | demo.launch(share=False) 54 | -------------------------------------------------------------------------------- /setup_orpheus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e # Exit on error 3 | 4 | echo "=======================================" 5 | echo "OrpheusTTS-WebUI Setup Script" 6 | echo "=======================================" 7 | 8 | # Create virtual environment if it doesn't exist 9 | if [ ! -d "venv" ]; then 10 | echo "Creating virtual environment..." 11 | python3 -m venv venv 12 | echo "Virtual environment created." 13 | fi 14 | 15 | # Activate virtual environment 16 | echo "Activating virtual environment..." 17 | source venv/bin/activate 18 | 19 | # Install required packages 20 | echo "Installing required packages..." 21 | pip install --upgrade pip 22 | 23 | # Install other required packages 24 | echo "Installing other dependencies..." 25 | pip install orpheus-speech gradio vllm torch huggingface_hub 26 | 27 | # Create launch script 28 | echo "Creating launch script..." 29 | cat > launch_orpheus.sh << 'EOF' 30 | #!/bin/bash 31 | set -e 32 | 33 | # Activate virtual environment 34 | source venv/bin/activate 35 | 36 | # Execute the wrapper script 37 | python orpheus_wrapper.py 38 | EOF 39 | 40 | # Make the launch script executable 41 | chmod +x launch_orpheus.sh 42 | 43 | # Log in to Hugging Face 44 | echo "=======================================" 45 | echo "You need to log in to Hugging Face to access the model." 46 | echo "If you don't have an account, create one at https://huggingface.co/join" 47 | echo "=======================================" 48 | read -p "Press Enter to continue to Hugging Face login..." 
49 | huggingface-cli login 50 | 51 | # Remind about model access 52 | echo "=======================================" 53 | echo "IMPORTANT: The OrpheusTTS model is a gated model." 54 | echo "You need to request access at:" 55 | echo "https://huggingface.co/canopylabs/orpheus-tts-0.1-finetune-prod" 56 | echo "https://huggingface.co/canopylabs/orpheus-3b-0.1-pretrained" 57 | echo "=======================================" 58 | echo "Once approved, you'll be able to use the model." 59 | echo "=======================================" 60 | 61 | # Make the wrapper executable 62 | echo "Making the wrapper script executable..." 63 | chmod +x orpheus_wrapper.py 64 | 65 | echo "Setup complete! Run ./launch_orpheus.sh to start the application." 66 | echo "=======================================" 67 | -------------------------------------------------------------------------------- /pretrain/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from datasets import load_dataset 3 | from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer 4 | import numpy as np 5 | from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig 6 | from torch.distributed.fsdp import ( 7 | FullyShardedDataParallel as FSDP, FullStateDictConfig, StateDictType) 8 | from torch.utils.data import DataLoader, Dataset 9 | from torch.utils.data.distributed import DistributedSampler 10 | import yaml 11 | import wandb 12 | from huggingface_hub import HfApi 13 | 14 | config_file = "config.yaml" 15 | 16 | with open(config_file, "r") as file: 17 | config = yaml.safe_load(file) 18 | 19 | dsn1 = config["text_QA_dataset"] 20 | dsn2 = config["TTS_dataset"] 21 | 22 | model_name = config["model_name"] 23 | tokenizer_name = config["tokenizer_name"] 24 | 25 | run_name = config["run_name"] 26 | project_name = config["project_name"] 27 | base_repo_id = config["save_folder"] 28 | 29 | epochs = config["epochs"] 30 | batch_size = config["batch_size"] 31 | save_steps = config["save_steps"] 32 | pad_token = config["pad_token"] 33 | number_processes = config["number_processes"] 34 | learning_rate = config["learning_rate"] 35 | config_ratio = config["ratio"] 36 | 37 | 38 | 39 | 40 | class BatchedRatioDataset(Dataset): 41 | def __init__(self, dataset1, dataset2, batch_total, ratio=config_ratio): 42 | self.dataset1 = dataset1 43 | self.dataset2 = dataset2 44 | self.batch_total = batch_total 45 | self.ratio = ratio 46 | 47 | num_cycles_ds1 = len(dataset1) // (batch_total * ratio) 48 | num_cycles_ds2 = len(dataset2) // batch_total 49 | self.num_cycles = min(num_cycles_ds1, num_cycles_ds2) 50 | 51 | self.length = self.num_cycles * (ratio + 1) * batch_total 52 | 53 | def __len__(self): 54 | print("accessing length", self.length) 55 | return int(self.length) 56 | 57 | def __getitem__(self, index): 58 | # Compute the cycle length in terms of samples. 59 | cycle_length = (self.ratio + 1) * self.batch_total 60 | cycle = index // cycle_length 61 | pos_in_cycle = index % cycle_length 62 | 63 | if pos_in_cycle < self.ratio * self.batch_total: 64 | batch_in_cycle = pos_in_cycle // self.batch_total 65 | sample_in_batch = pos_in_cycle % self.batch_total 66 | ds1_index = cycle * self.ratio * self.batch_total + batch_in_cycle * self.batch_total + sample_in_batch 67 | return self.dataset1[ds1_index] 68 | else: 69 | # We are in the dataset2 batch for this cycle. 
70 | sample_in_batch = pos_in_cycle - self.ratio * self.batch_total 71 | ds2_index = cycle * self.batch_total + sample_in_batch 72 | return self.dataset2[ds2_index] 73 | 74 | 75 | 76 | class AlternatingDistributedSampler(DistributedSampler): 77 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=False): 78 | super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) 79 | self.shuffle = shuffle 80 | 81 | def __iter__(self): 82 | indices = list(range(len(self.dataset))) 83 | indices = indices[self.rank:self.total_size:self.num_replicas] 84 | return iter(indices) 85 | 86 | 87 | class FSDPTrainer(Trainer): 88 | def __init__(self, *args, log_ratio=config_ratio, **kwargs): 89 | super().__init__(*args, **kwargs) 90 | self.repo_id = base_repo_id 91 | self.api = HfApi() 92 | 93 | self.log_ratio = log_ratio 94 | self.text_step = 0 95 | self.audio_step = 0 96 | 97 | def get_train_dataloader(self): 98 | sampler = AlternatingDistributedSampler( 99 | self.train_dataset, 100 | num_replicas=torch.distributed.get_world_size(), 101 | rank=torch.distributed.get_rank(), 102 | shuffle=False, 103 | ) 104 | 105 | return DataLoader( 106 | self.train_dataset, 107 | batch_size=self.args.per_device_train_batch_size, 108 | sampler=sampler, 109 | collate_fn=self.data_collator, 110 | drop_last=self.args.dataloader_drop_last, 111 | num_workers=0, 112 | pin_memory=self.args.dataloader_pin_memory, 113 | ) 114 | 115 | def log(self, logs, start_time=None): 116 | super().log(logs, start_time) 117 | if self.is_world_process_zero(): 118 | global_step = self.state.global_step 119 | # Each cycle is (log_ratio + 1) steps: first log_ratio steps for text_loss, then one for audio_loss. 120 | cycle_length = self.log_ratio + 1 121 | if (global_step % cycle_length) + self.log_ratio - 1 < self.log_ratio: 122 | wandb.log({"audio_loss": logs["loss"], "audio_step": self.audio_step}) 123 | self.audio_step += 1 124 | else: 125 | wandb.log({"text_loss": logs["loss"], "text_step": self.text_step}) 126 | self.text_step += 1 127 | 128 | def save_model(self, output_dir=None, _internal_call=False): 129 | if output_dir is None: 130 | output_dir = self.args.output_dir 131 | self.save_and_push_model(output_dir) 132 | 133 | def save_and_push_model(self, output_dir): 134 | save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) 135 | with FSDP.state_dict_type(self.model, StateDictType.FULL_STATE_DICT, save_policy): 136 | cpu_state_dict = self.model.state_dict() 137 | self.model.save_pretrained(output_dir, state_dict=cpu_state_dict) 138 | 139 | 140 | def data_collator(features): 141 | # max_length = 2656 # set a crop based on vram - ideally you have stacked all sequences to the same length 142 | # from 3b on 8 h100s fsdp, at bf16, 8192 works well. 
143 | input_ids = [f["input_ids"] for f in features] 144 | 145 | if any("attention_mask" not in f for f in features): 146 | attention_mask = [[1]*len(ids) for ids in input_ids] 147 | else: 148 | attention_mask = [f["attention_mask"] for f in features] 149 | 150 | if any("labels" not in f for f in features): 151 | labels = input_ids 152 | else: 153 | labels = [f["labels"] for f in features] 154 | 155 | input_ids = torch.nn.utils.rnn.pad_sequence([torch.tensor( 156 | i, dtype=torch.long) for i in input_ids], batch_first=True, padding_value=pad_token) 157 | attention_mask = torch.nn.utils.rnn.pad_sequence([torch.tensor( 158 | m, dtype=torch.long) for m in attention_mask], batch_first=True, padding_value=0) 159 | labels = torch.nn.utils.rnn.pad_sequence([torch.tensor( 160 | l, dtype=torch.long) for l in labels], batch_first=True, padding_value=-100) 161 | 162 | return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels} 163 | 164 | 165 | wandb.init(project=project_name, name=run_name) 166 | 167 | 168 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) 169 | model = AutoModelForCausalLM.from_pretrained( 170 | model_name, attn_implementation="flash_attention_2") 171 | 172 | 173 | number_add_tokens = 7 * 4096 + 10 174 | new_tokens = [f"<custom_token_{i}>" for i in range(0, number_add_tokens + 1)] # assumed naming for the added audio tokens; adjust if your tokenizer uses a different scheme 175 | tokenizer.add_tokens(new_tokens) 176 | model.resize_token_embeddings(len(tokenizer)) 177 | 178 | 179 | ds1 = load_dataset(dsn1, split="train") 180 | ds2 = load_dataset(dsn2, split="train") 181 | 182 | 183 | batch_total = batch_size * number_processes 184 | train_dataset = BatchedRatioDataset(ds1, ds2, batch_total, ratio=config_ratio) 185 | 186 | 187 | training_args = TrainingArguments( 188 | overwrite_output_dir=True, 189 | num_train_epochs=epochs, 190 | per_device_train_batch_size=batch_size, 191 | logging_steps=1, 192 | bf16=True, 193 | output_dir=f"./{base_repo_id}", 194 | fsdp="auto_wrap", 195 | report_to="wandb", 196 | save_steps=save_steps, 197 | remove_unused_columns=True, 198 | learning_rate=learning_rate, 199 | lr_scheduler_type="cosine", 200 | ) 201 | 202 | 203 | trainer = FSDPTrainer( 204 | model=model, 205 | args=training_args, 206 | train_dataset=train_dataset, 207 | data_collator=data_collator, 208 | log_ratio=config_ratio 209 | ) 210 | 211 | trainer.train() 212 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OrpheusTTS-WebUI 2 | 3 | This is a fork of the [Orpheus TTS](https://github.com/canopyai/Orpheus-TTS) project, adding a Gradio WebUI that runs smoothly on WSL and CUDA.
4 | 5 | ![image](https://github.com/user-attachments/assets/4b738f1d-23ed-477b-ac84-db0d5b04c76c) 6 | 7 | https://github.com/user-attachments/assets/5e441285-b10f-4149-b691-df061c5ddcbb 8 | 9 | ## ✅ Latest Updates (20/03/2025) 10 | 11 | ### Long-Form Text Processing 12 | - **Tabbed Interface**: The UI now features a dedicated "Long Form Content" tab for processing larger text inputs 13 | - **Smart Text Chunking**: Automatically splits long text into smaller chunks at sentence boundaries 14 | - **Parallel Processing**: Processes multiple chunks simultaneously for faster generation 15 | - **Seamless Audio Stitching**: Combines multiple audio segments into one cohesive output file 16 | - **Progress Tracking**: Real-time progress indicators during the generation process 17 | 18 | ### Technical Improvements 19 | - **Enhanced Logging**: Better error handling and diagnostic information 20 | - **Memory Optimization**: Improved cleanup of temporary files 21 | - **Expanded Parameter Ranges**: Maximum tokens extended to 16384 for longer audio generation 22 | - **Batch Size Control**: Adjust the number of chunks processed in parallel to balance speed and resource usage 23 | 24 | ## Features 25 | 26 | - **Easy-to-use Web Interface**: Simple Gradio UI for text-to-speech generation 27 | - **WSL & CUDA Compatible**: Optimized for Windows Subsystem for Linux with CUDA support 28 | - **Memory Optimized**: Addresses common memory issues on consumer GPUs 29 | - **Voice Selection**: Access to all 8 voices from the original model 30 | - **Emotive Tags Support**: Full support for all emotion tags 31 | 32 | ## Quick Start (WSL/Linux) 33 | 34 | ```bash 35 | # Clone the repository 36 | git clone https://github.com/Saganaki22/OrpheusTTS-WebUI.git 37 | cd OrpheusTTS-WebUI 38 | 39 | # Run the setup script 40 | chmod +x setup_orpheus.sh 41 | ./setup_orpheus.sh 42 | 43 | # Launch the app 44 | ./launch_orpheus.sh 45 | ``` 46 | 47 | ## Requirements 48 | 49 | - Python 3.10+ 50 | - CUDA-capable GPU (tested on RTX 3090 / 4090) 51 | - WSL2 or Linux 52 | - PyTorch 2.6.0 with CUDA 53 | - Hugging Face account with access to the Orpheus TTS models 54 | 55 | ## Available Voices 56 | 57 | The WebUI provides access to all 8 voices in order of conversational realism: 58 | - tara 59 | - jess 60 | - leo 61 | - leah 62 | - dan 63 | - mia 64 | - zac 65 | - zoe 66 | 67 | ## Emotive Tags 68 | 69 | Add emotion to your speech with tags: 70 | - `` 71 | - `` 72 | - `` 73 | - `` 74 | - `` 75 | - `` 76 | - `` 77 | - `` 78 | 79 | ## Long Form Text Processing 80 | 81 | The new Long Form feature lets you generate speech for larger text inputs: 82 | 83 | 1. **Text Chunking**: Text is automatically split into manageable chunks at sentence boundaries 84 | 2. **Parallel Processing**: Process multiple chunks simultaneously based on the batch size setting 85 | 3. **Parameter Optimization**: The Long Form tab offers optimized default settings for extended content 86 | 4. **Simple Assembly**: All audio chunks are automatically combined into a single cohesive output file 87 | 88 | This is ideal for: 89 | - Articles and blog posts 90 | - Scripts and dialogues 91 | - Books and stories 92 | - Any text content that exceeds a few paragraphs 93 | 94 | ## Troubleshooting 95 | 96 | If you encounter "KV cache" errors, the setup script should address these automatically. 
If problems persist, try: 97 | - Reducing `max_model_len` in the `orpheus_wrapper.py` file 98 | - Ensuring your GPU has enough VRAM (recommended 12GB+) 99 | - Setting `gpu_memory_utilization` to a lower value (0.7-0.8) 100 | - For Long Form processing, try reducing the batch size to limit memory usage 101 | 102 | --- 103 | 104 | # Official Orpheus TTS Documentation 105 | 106 | ## Overview 107 | Orpheus TTS is an open-source text-to-speech system built on the Llama-3b backbone. Orpheus demonstrates the emergent capabilities of using LLMs for speech synthesis. We offer comparisons of the models below to leading closed models like Eleven Labs and PlayHT in our blog post. 108 | 109 | [Check out our blog post](https://canopylabs.ai/model-releases) 110 | 111 | 112 | https://github.com/user-attachments/assets/ce17dd3a-f866-4e67-86e4-0025e6e87b8a 113 | 114 | 115 | ## Abilities 116 | 117 | - **Human-Like Speech**: Natural intonation, emotion, and rhythm that is superior to SOTA closed source models 118 | - **Zero-Shot Voice Cloning**: Clone voices without prior fine-tuning 119 | - **Guided Emotion and Intonation**: Control speech and emotion characteristics with simple tags 120 | - **Low Latency**: ~200ms streaming latency for realtime applications, reducible to ~100ms with input streaming 121 | 122 | ## Models 123 | 124 | We provide three models in this release, and additionally we offer the data processing scripts and sample datasets to make it very straightforward to create your own finetune. 125 | 126 | 1. [**Finetuned Prod**](https://huggingface.co/canopylabs/orpheus-tts-0.1-finetune-prod) – A finetuned model for everyday TTS applications 127 | 128 | 2. [**Pretrained**](https://huggingface.co/canopylabs/orpheus-tts-0.1-pretrained) – Our base model trained on 100k+ hours of English speech data 129 | 130 | 131 | ### Inference 132 | #### Simple setup on colab 133 | 1. [Colab For Tuned Model](https://colab.research.google.com/drive/1KhXT56UePPUHhqitJNUxq63k-pQomz3N?usp=sharing) (not streaming, see below for realtime streaming) – A finetuned model for everyday TTS applications. 134 | 2. [Colab For Pretrained Model](https://colab.research.google.com/drive/10v9MIEbZOr_3V8ZcPAIh8MN7q2LjcstS?usp=sharing) – This notebook is set up for conditioned generation but can be extended to a range of tasks. 135 | 136 | #### Prompting 137 | 138 | 1. The `finetune-prod` models: for the primary model, your text prompt is formatted as `{name}: I went to the ...`. The options for name in order of conversational realism (subjective benchmarks) are "tara", "jess", "leo", "leah", "dan", "mia", "zac", "zoe". Our python package does this formatting for you, and the notebook also prepends the appropriate string. You can additionally add the following emotive tags: ``, ``, ``, ``, ``, ``, ``, ``. 139 | 140 | 2. The pretrained model: you can either generate speech just conditioned on text, or generate speech conditioned on one or more existing text-speech pairs in the prompt. Since this model hasn't been explicitly trained on the zero-shot voice cloning objective, the more text-speech pairs you pass in the prompt, the more reliably it will generate in the correct voice. 141 | 142 | Additionally, use regular LLM generation args like `temperature`, `top_p`, etc. as you expect for a regular LLM. `repetition_penalty>=1.1`is required for stable generations. Increasing `repetition_penalty` and `temperature` makes the model speak faster. 
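As a concrete, non-streaming example of the above, the snippet below uses the same `OrpheusModel` API that `orpheus.py` and the realtime example in this repo rely on; the prompt text and the `<sigh>` tag are purely illustrative.

```python
# Minimal sketch using the OrpheusModel API from the orpheus-speech package,
# mirroring how orpheus.py in this repo writes generated audio to a WAV file.
import wave
from orpheus_tts import OrpheusModel

model = OrpheusModel(model_name="canopylabs/orpheus-tts-0.1-finetune-prod")

# The python package prepends the "{name}: " formatting for the chosen voice;
# emotive tags such as <sigh> can be embedded directly in the text.
prompt = "I really thought I was done with this project <sigh> but here we are again."

syn_tokens = model.generate_speech(
    prompt=prompt,
    voice="tara",
    repetition_penalty=1.1,  # >= 1.1 is required for stable generations
    temperature=0.6,
    top_p=0.9,
    max_tokens=2000,
)

# 24 kHz, 16-bit mono PCM, matching the settings used elsewhere in this repo
with wave.open("example.wav", "wb") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(24000)
    for audio_chunk in syn_tokens:  # chunks of raw 16-bit PCM bytes
        wf.writeframes(audio_chunk)
```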
143 | 144 | 145 | ## Finetune Model 146 | 147 | Here is an overview of how to finetune your model on any text and speech. 148 | This is a very simple process analogous to tuning an LLM using Trainer and Transformers. 149 | 150 | You should start to see high quality results after ~50 examples, but for best results, aim for 300 examples/speaker. 151 | 152 | 1. Your dataset should be a huggingface dataset in [this format](https://huggingface.co/datasets/canopylabs/zac-sample-dataset) 153 | 2. We prepare the data using [this notebook](https://colab.research.google.com/drive/1wg_CPCA-MzsWtsujwy-1Ovhv-tn8Q1nD?usp=sharing). This pushes an intermediate dataset to your Hugging Face account which you can feed to the training script in finetune/train.py. Preprocessing should take less than 1 minute/thousand rows. 154 | 3. Modify the `finetune/config.yaml` file to include your dataset and training properties, and run the training script. You can additionally run any huggingface-compatible process such as LoRA to tune the model (see the sketch after the commands below). 155 | ```bash 156 | pip install transformers datasets wandb trl flash_attn torch 157 | huggingface-cli login 158 | wandb login 159 | accelerate launch train.py 160 | ``` 161 |
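For the LoRA route mentioned in step 3, a hedged sketch of how `finetune/train.py` could be adapted with the `peft` library is shown below. `peft` is an extra dependency beyond the install line above, the `target_modules` listed are typical for Llama-style models and may need adjusting, and the dataset id is a placeholder for the intermediate dataset pushed by the preprocessing notebook.

```python
# Sketch: LoRA finetuning variant of finetune/train.py using the peft library.
# Assumptions: `pip install peft`, and replace the placeholder dataset id below
# with the intermediate dataset produced by the preprocessing notebook.
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model

model_name = "canopylabs/orpheus-tts-0.1-pretrained"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # typical Llama attention projections
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

ds = load_dataset("your-username/your-processed-tts-dataset", split="train")  # placeholder id

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./checkpoints-lora",
        per_device_train_batch_size=1,
        num_train_epochs=1,
        learning_rate=5e-5,
        logging_steps=1,
        bf16=True,
    ),
    train_dataset=ds,
)
trainer.train()
```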
162 |
163 | 164 | 165 | 171 | 177 | Star History Chart 181 | 182 | -------------------------------------------------------------------------------- /orpheus.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | from orpheus_tts import OrpheusModel 3 | import wave 4 | import time 5 | import os 6 | import logging 7 | import re 8 | import asyncio 9 | from concurrent.futures import ThreadPoolExecutor 10 | 11 | # Configure logging 12 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 13 | logger = logging.getLogger(__name__) 14 | 15 | # Global variables 16 | model = None 17 | MODEL_SAMPLE_RATE = 24000 18 | 19 | model_path = None # Set this to your local model path if needed 20 | model_name = model_path if model_path else "canopylabs/orpheus-tts-0.1-finetune-prod" 21 | 22 | def load_model(model_name=model_name): 23 | """Load the Orpheus TTS model.""" 24 | global model 25 | try: 26 | logger.info(f"Loading model from: {model_name}") 27 | model = OrpheusModel(model_name=model_name) 28 | return True 29 | except Exception as e: 30 | logger.error(f"Error loading model: {str(e)}") 31 | return False 32 | 33 | def generate_speech(prompt, voice, temperature, top_p, repetition_penalty, max_tokens): 34 | """Generate speech for a single text input.""" 35 | if model is None: 36 | load_model() 37 | 38 | # Start timing 39 | start_time = time.monotonic() 40 | 41 | # Generate speech from the provided text 42 | syn_tokens = model.generate_speech( 43 | prompt=prompt, 44 | voice=voice, 45 | temperature=temperature, 46 | top_p=top_p, 47 | repetition_penalty=repetition_penalty, 48 | max_tokens=max_tokens 49 | ) 50 | 51 | # Create a unique output filename to avoid overwriting previous generations 52 | output_filename = f"output_{int(time.time())}.wav" 53 | 54 | # Write the audio to a WAV file 55 | with wave.open(output_filename, "wb") as wf: 56 | wf.setnchannels(1) 57 | wf.setsampwidth(2) 58 | wf.setframerate(MODEL_SAMPLE_RATE) 59 | 60 | total_frames = 0 61 | for audio_chunk in syn_tokens: 62 | frame_count = len(audio_chunk) // (wf.getsampwidth() * wf.getnchannels()) 63 | total_frames += frame_count 64 | wf.writeframes(audio_chunk) 65 | 66 | duration = total_frames / wf.getframerate() 67 | 68 | processing_time = time.monotonic() - start_time 69 | result_message = f"Generated {duration:.2f} seconds of audio in {processing_time:.2f} seconds" 70 | logger.info(result_message) 71 | 72 | return output_filename, result_message 73 | 74 | def chunk_text(text, max_chunk_size=300): 75 | """Split text into smaller chunks at sentence boundaries.""" 76 | # Replace multiple spaces with a single space 77 | text = re.sub(r"\s+", " ", text) 78 | 79 | # Split on sentence delimiters while preserving the delimiter 80 | delimiter_pattern = r'(?<=[.!?])\s+' 81 | segments = re.split(delimiter_pattern, text) 82 | 83 | # Process segments to ensure each has appropriate ending punctuation 84 | sentences = [] 85 | for segment in segments: 86 | segment = segment.strip() 87 | if not segment: 88 | continue 89 | 90 | # Check if segment already ends with a delimiter 91 | if not segment[-1] in ['.', '!', '?']: 92 | segment += '.' 
93 | 94 | sentences.append(segment) 95 | chunks = [] 96 | current_chunk = "" 97 | 98 | for sentence in sentences: 99 | # If adding this sentence would make the chunk too long, start a new chunk 100 | if len(current_chunk) + len(sentence) > max_chunk_size and current_chunk: 101 | chunks.append(current_chunk) 102 | current_chunk = sentence 103 | else: 104 | current_chunk += " " + sentence if current_chunk else sentence 105 | 106 | # Add the last chunk if there's anything left 107 | if current_chunk: 108 | chunks.append(current_chunk) 109 | 110 | logger.info(f"Text chunked into {len(chunks)} segments") 111 | return chunks 112 | 113 | async def process_chunk(chunk, voice, temperature, top_p, repetition_penalty, max_tokens, temp_dir, current_idx, total_chunks): 114 | """Process a single chunk asynchronously.""" 115 | # Run the model inference in a separate thread since it's blocking 116 | loop = asyncio.get_event_loop() 117 | 118 | def generate_for_chunk(): 119 | return model.generate_speech( 120 | prompt=chunk, 121 | voice=voice, 122 | temperature=temperature, 123 | top_p=top_p, 124 | repetition_penalty=repetition_penalty, 125 | max_tokens=max_tokens 126 | ) 127 | 128 | # Execute the model inference (this runs in a thread) 129 | syn_tokens = await loop.run_in_executor(None, generate_for_chunk) 130 | 131 | # Create a filename for this chunk 132 | chunk_filename = os.path.join(temp_dir, f"chunk_{current_idx}.wav") 133 | 134 | # Write the audio to a WAV file 135 | with wave.open(chunk_filename, "wb") as wf: 136 | wf.setnchannels(1) 137 | wf.setsampwidth(2) 138 | wf.setframerate(MODEL_SAMPLE_RATE) 139 | 140 | chunk_frames = 0 141 | for audio_chunk in syn_tokens: 142 | frame_count = len(audio_chunk) // (wf.getsampwidth() * wf.getnchannels()) 143 | chunk_frames += frame_count 144 | wf.writeframes(audio_chunk) 145 | 146 | chunk_duration = chunk_frames / wf.getframerate() 147 | 148 | return chunk_filename, chunk_duration 149 | 150 | async def generate_long_form_speech_async(long_text, voice, temperature, top_p, repetition_penalty, 151 | batch_size=4, max_tokens=4096, progress=None): 152 | """Async version of generate_long_form_speech.""" 153 | start_time = time.monotonic() 154 | if progress is not None: 155 | progress(0, desc="Preparing text chunks") 156 | 157 | # Chunk the text 158 | chunks = chunk_text(long_text) 159 | if progress is not None: 160 | progress(0.1, desc=f"Text split into {len(chunks)} chunks") 161 | 162 | # Create a directory for batch files 163 | temp_dir = f"longform_{int(time.time())}" 164 | os.makedirs(temp_dir, exist_ok=True) 165 | logger.info(f"Created temp directory: {temp_dir}") 166 | 167 | # Use a semaphore to limit concurrent processing to batch_size 168 | semaphore = asyncio.Semaphore(batch_size) 169 | total_chunks = len(chunks) 170 | all_audio_files = [] 171 | total_duration = 0 172 | processed_chunks = 0 173 | 174 | async def process_chunk_with_semaphore(chunk, idx): 175 | nonlocal processed_chunks 176 | async with semaphore: 177 | try: 178 | filename, duration = await process_chunk( 179 | chunk, voice, temperature, top_p, repetition_penalty, 180 | max_tokens, temp_dir, idx, total_chunks 181 | ) 182 | processed_chunks += 1 183 | if progress is not None: 184 | progress(processed_chunks / total_chunks, 185 | desc=f"Processed chunk {processed_chunks}/{total_chunks}") 186 | return filename, duration 187 | except Exception as e: 188 | logger.error(f"Error processing chunk {idx}: {str(e)}") 189 | raise # Re-raise to be caught by gather 190 | 191 | # Create tasks for ALL chunks 
and process them concurrently with semaphore limiting parallelism 192 | tasks = [process_chunk_with_semaphore(chunk, idx) for idx, chunk in enumerate(chunks)] 193 | results = await asyncio.gather(*tasks) 194 | 195 | # Process results 196 | for filename, duration in results: 197 | all_audio_files.append(filename) 198 | total_duration += duration 199 | # Combine all audio files 200 | if progress is not None: 201 | progress(0.9, desc="Combining audio files") 202 | 203 | combined_filename = f"longform_output_{int(time.time())}.wav" 204 | logger.info(f"Combining {len(all_audio_files)} audio chunks into {combined_filename}") 205 | 206 | # Use a simple concatenation approach 207 | data = [] 208 | for file in sorted(all_audio_files, key=lambda f: int(os.path.basename(f).split('_')[1].split('.')[0])): 209 | with wave.open(file, 'rb') as w: 210 | data.append([w.getparams(), w.readframes(w.getnframes())]) 211 | 212 | with wave.open(combined_filename, 'wb') as output: 213 | if data: 214 | output.setparams(data[0][0]) 215 | for i in range(len(data)): 216 | output.writeframes(data[i][1]) 217 | 218 | # Clean up temporary files 219 | for file in all_audio_files: 220 | try: 221 | os.remove(file) 222 | except Exception as e: 223 | logger.warning(f"Failed to delete temp file {file}: {e}") 224 | 225 | try: 226 | os.rmdir(temp_dir) 227 | except Exception as e: 228 | logger.warning(f"Failed to delete temp directory {temp_dir}: {e}") 229 | 230 | # Calculate processing time 231 | processing_time = time.monotonic() - start_time 232 | result_message = f"Generated {total_duration:.2f} seconds of audio from {total_chunks} chunks in {processing_time:.2f} seconds" 233 | logger.info(result_message) 234 | 235 | if progress is not None: 236 | progress(1.0, desc="Complete") 237 | 238 | return combined_filename, result_message 239 | 240 | def generate_long_form_speech(long_text, voice, temperature, top_p, repetition_penalty, batch_size=4, max_tokens=4096, progress=gr.Progress()): 241 | """Generate speech for long-form text by chunking and processing in parallel batches.""" 242 | if model is None: 243 | load_model() 244 | 245 | # Use asyncio to run the async function 246 | loop = asyncio.new_event_loop() 247 | asyncio.set_event_loop(loop) 248 | try: 249 | print("Running long form speech generation") 250 | return loop.run_until_complete( 251 | generate_long_form_speech_async( 252 | long_text, voice, temperature, top_p, 253 | repetition_penalty, batch_size, max_tokens, progress 254 | ) 255 | ) 256 | finally: 257 | loop.close() 258 | 259 | def cleanup_files(): 260 | """Clean up generated audio files.""" 261 | count = 0 262 | for file in os.listdir(): 263 | if (file.startswith("output_") or file.startswith("longform_output_")) and file.endswith(".wav"): 264 | try: 265 | os.remove(file) 266 | count += 1 267 | except Exception as e: 268 | logger.warning(f"Failed to delete file {file}: {e}") 269 | 270 | # Also clean up any leftover temporary directories 271 | for dir_name in os.listdir(): 272 | if dir_name.startswith("longform_") and os.path.isdir(dir_name): 273 | try: 274 | # Remove any files inside 275 | for file in os.listdir(dir_name): 276 | os.remove(os.path.join(dir_name, file)) 277 | os.rmdir(dir_name) 278 | count += 1 279 | except Exception as e: 280 | logger.warning(f"Failed to delete directory {dir_name}: {e}") 281 | 282 | logger.info(f"Cleanup completed. 
Removed {count} files/directories.") 283 | 284 | # Create the Gradio interface 285 | def create_ui(): 286 | """Create the Gradio user interface.""" 287 | with gr.Blocks(title="OrpheusTTS-WebUI", theme=gr.themes.Default()) as demo: 288 | # Title and description 289 | gr.Markdown("
OrpheusTTS-WebUI
") 290 | 291 | gr.Markdown("""
Generate realistic speech from text using the OrpheusTTS model. 292 | **Available voices:** tara, jess, leo, leah, dan, mia, zac, zoe (in order of conversational realism) 293 | 294 | **Available emotive tags:** `<laugh>`, `<chuckle>`, `<sigh>`, `<cough>`, `<sniffle>`, `<groan>`, `<yawn>`, `<gasp>` 295 | 296 | **Note:** Increasing repetition_penalty and temperature makes the model speak faster. Increasing Max Tokens extends the maximum duration of generated audio. 297 |
298 | """) 299 | 300 | # Create tabs container 301 | with gr.Tabs(selected=0) as tabs: 302 | # Tab 1: Single Text Generation 303 | with gr.Tab("Single Text"): 304 | with gr.Row(): 305 | with gr.Column(scale=2): 306 | # Text input area 307 | prompt = gr.Textbox( 308 | label="Text Input", 309 | placeholder="Enter text to convert to speech...", 310 | lines=3 311 | ) 312 | 313 | with gr.Row(): 314 | voice = gr.Dropdown( 315 | choices=["tara", "jess", "leo", "leah", "dan", "mia", "zac", "zoe"], 316 | label="Voice", 317 | value="tara" 318 | ) 319 | 320 | with gr.Row(): 321 | max_tokens = gr.Slider( 322 | label="Max Tokens", 323 | value=2048, 324 | minimum=128, 325 | maximum=16384, 326 | step=128 327 | ) 328 | 329 | temperature = gr.Slider( 330 | minimum=0.1, 331 | maximum=2.0, 332 | value=1.0, 333 | step=0.1, 334 | label="Temperature" 335 | ) 336 | top_p = gr.Slider( 337 | minimum=0.1, 338 | maximum=1.0, 339 | value=0.9, 340 | step=0.05, 341 | label="Top P" 342 | ) 343 | rep_penalty = gr.Slider( 344 | minimum=1.1, 345 | maximum=2.0, 346 | value=1.2, 347 | step=0.1, 348 | label="Repetition Penalty" 349 | ) 350 | max_tokens = gr.Slider( 351 | minimum=1200, 352 | maximum=3600, 353 | value=1200, 354 | step=100, 355 | label="Max Tokens" 356 | ) 357 | 358 | submit_btn = gr.Button("Generate Speech", variant="primary") 359 | gr.Examples( 360 | examples=[ 361 | "Man, the way social media has, um, completely changed how we interact is just wild, right?", 362 | "I just got back from my vacation and I'm already feeling stressed about work.", 363 | "Did you hear what happened at the party last night? It was absolutely ridiculous!", 364 | "I've been working on this project for hours and I still have so much to do.", 365 | "The concert was amazing! You should have seen the light show!" 
366 | ], 367 | inputs=prompt, 368 | label="Example Prompts" 369 | ) 370 | 371 | with gr.Column(scale=1): 372 | audio_output = gr.Audio(label="Generated Speech") 373 | result_text = gr.Textbox(label="Generation Stats", interactive=False) 374 | 375 | # Connect the generate_speech function to the interface 376 | submit_btn.click( 377 | fn=generate_speech, 378 | inputs=[prompt, voice, temperature, top_p, rep_penalty, max_tokens], 379 | outputs=[audio_output, result_text] 380 | ) 381 | 382 | # Tab 2: Long Form Content 383 | with gr.Tab("Long Form Content"): 384 | with gr.Row(): 385 | with gr.Column(scale=2): 386 | long_form_prompt = gr.Textbox( 387 | label="Long Form Text Input", 388 | placeholder="Enter long text to convert to speech...", 389 | lines=15 390 | ) 391 | 392 | with gr.Row(): 393 | lf_voice = gr.Dropdown( 394 | choices=["tara", "jess", "leo", "leah", "dan", "mia", "zac", "zoe"], 395 | label="Voice", 396 | value="tara" 397 | ) 398 | 399 | with gr.Row(): 400 | lf_max_tokens = gr.Slider( 401 | label="Max Tokens", 402 | value=4096, 403 | minimum=128, 404 | maximum=16384, 405 | step=128 406 | ) 407 | 408 | lf_temperature = gr.Slider( 409 | minimum=0.1, 410 | maximum=2.0, 411 | value=0.6, 412 | step=0.1, 413 | label="Temperature" 414 | ) 415 | 416 | lf_top_p = gr.Slider( 417 | minimum=0.1, 418 | maximum=1.0, 419 | value=0.8, 420 | step=0.05, 421 | label="Top P" 422 | ) 423 | 424 | lf_rep_penalty = gr.Slider( 425 | minimum=1.0, 426 | maximum=2.0, 427 | value=1.1, 428 | step=0.1, 429 | label="Repetition Penalty" 430 | ) 431 | 432 | batch_size = gr.Slider( 433 | minimum=1, 434 | maximum=10, 435 | value=4, 436 | step=1, 437 | label="Batch Size (chunks processed in parallel)" 438 | ) 439 | 440 | lf_submit_btn = gr.Button("Generate Long Form Speech", variant="primary") 441 | 442 | gr.Examples( 443 | examples=[ 444 | """How Long Form Processing Works: 445 | Text is automatically split into chunks at sentence boundaries. 446 | Chunks are processed in batches based on the batch size. 447 | Higher batch sizes may be faster but require more memory. 448 | Finally, all chunks are combined into a single audio file. 449 | """, 450 | """I just got back from my vacation and I'm already feeling stressed about work. 451 | Did you hear what happened at the party last night? It was absolutely ridiculous! 452 | I've been working on this project for hours and I still have so much to do. 453 | The concert was amazing! You should have seen the light show! 454 | """ 455 | ], 456 | inputs=long_form_prompt, 457 | label="Example Prompts" 458 | ) 459 | 460 | 461 | with gr.Column(scale=1): 462 | lf_audio_output = gr.Audio(label="Generated Long Form Speech") 463 | lf_result_text = gr.Textbox(label="Generation Stats", interactive=False) 464 | 465 | # Connect the long form generation function 466 | lf_submit_btn.click( 467 | fn=generate_long_form_speech, 468 | inputs=[long_form_prompt, lf_voice, lf_temperature, lf_top_p, lf_rep_penalty, batch_size, lf_max_tokens], 469 | outputs=[lf_audio_output, lf_result_text] 470 | ) 471 | 472 | # Add footer with links 473 | gr.Markdown("""
474 | Hugging Face | 475 | WebUI GitHub | 476 | Official GitHub 477 |
""") 478 | 479 | # Register cleanup for when the interface closes 480 | demo.load(cleanup_files) 481 | 482 | return demo 483 | 484 | # Main function to run the app 485 | if __name__ == "__main__": 486 | # Initialize the app 487 | logger.info("Starting OrpheusTTS-WebUI") 488 | 489 | # Create and launch the UI 490 | demo = create_ui() 491 | demo.launch(share=False) # Set share=False to disable public URL --------------------------------------------------------------------------------