├── .gitignore ├── requirements.txt ├── README.md └── app.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | tmpdir* 3 | .env -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2022.12.7 2 | charset-normalizer==3.0.1 3 | click==8.1.3 4 | docopt==0.6.2 5 | ffmpeg-python==0.2.0 6 | filelock==3.9.0 7 | Flask==2.2.3 8 | future==0.18.3 9 | huggingface-hub==0.12.1 10 | HyperPyYAML==1.1.0 11 | idna==3.4 12 | importlib-metadata==6.0.0 13 | itsdangerous==2.1.2 14 | Jinja2==3.1.6 15 | joblib==1.2.0 16 | MarkupSafe==2.1.2 17 | more-itertools==9.0.0 18 | num2words==0.5.12 19 | numpy==1.24.2 20 | openai-whisper==20230124 21 | packaging==23.0 22 | pydub==0.25.1 23 | PyYAML==6.0 24 | regex==2022.10.31 25 | requests==2.28.2 26 | ruamel.yaml==0.17.21 27 | ruamel.yaml.clib==0.2.7 28 | scipy==1.10.1 29 | sentencepiece==0.1.97 30 | speechbrain==0.5.13 31 | tokenizers==0.13.2 32 | torch==1.13.1 33 | torchaudio==0.13.1 34 | tqdm==4.64.1 35 | transformers==4.26.1 36 | typing_extensions==4.5.0 37 | urllib3==1.26.14 38 | Werkzeug==2.2.3 39 | zipp==3.14.0 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Speech REST API 2 | 3 | This project provides a simple Flask API to transcribe speech from an audio file using the Whisper speech recognition library. 4 | The API loads a pre-trained deep learning model to detect the spoken language and transcribe the speech to text. 5 | 6 | It also provides an endpoint to generate speech from text using the Tacotron2 and HiFiGAN models. 
7 | 8 | ## Requirements 9 | 10 | - Python 3.9 or later 11 | 12 | ## Installation 13 | 14 | ```bash 15 | # Clone the repository 16 | git clone https://github.com/askrella/speech-rest-api.git 17 | 18 | # Navigate to the project directory 19 | cd speech-rest-api 20 | 21 | # Install ffmpeg (Ubuntu & Debian) 22 | sudo apt update && sudo apt install ffmpeg -y 23 | 24 | # Install the dependencies 25 | pip install -r requirements.txt 26 | 27 | # (Optional) Set PORT environment variable 28 | export PORT=3000 29 | 30 | # Run the REST API 31 | python app.py 32 | ``` 33 | 34 | ## Documentation 35 | 36 | The `/tts` endpoint generates speech from text using the Tacotron2 and HiFiGAN models. 37 | 38 | ```bash 39 | POST http://localhost:3000/tts 40 | ``` 41 | 42 | The request body must be a JSON object with the following field: 43 | 44 | - text (required): The text you want to hear spoken in the audio file. 45 | 46 | Example: 47 | 48 | ```json 49 | { 50 | "text": "Hello, I can do some text to speech. That's awesome!" 51 | } 52 | ``` 53 | 54 | 55 | The response is an audio file in Opus format (Ogg container) containing the generated speech. 56 | 57 | 58 | Here's an example curl command that generates speech from a text containing several sentences: 59 | 60 | ```sh 61 | curl -X POST \ 62 | -H "Content-Type: application/json" \ 63 | --data '{"text": "Hello, how are you? My name is Shubh. It is a pleasure to meet you."}' \ 64 | http://localhost:3000/tts \ 65 | --output output.opus 66 | ``` 67 | 68 | This curl command sends a JSON payload whose text contains three sentences to the /tts endpoint, and saves the resulting audio file to output.opus. 69 | 70 | 71 | The `/transcribe` endpoint transcribes audio files using the Whisper model. 72 | 73 | 74 | ```bash 75 | POST http://localhost:3000/transcribe 76 | ``` 77 | 78 | 79 | The request body must be a form data object containing an audio file. 
80 | 81 | Example: 82 | 83 | ```form 84 | audio=@path/to/your/audio/file.wav 85 | ``` 86 | 87 | 88 | The response is a JSON object containing the detected language and the transcribed text. 89 | 90 | Example: 91 | 92 | ```json 93 | { 94 | "language": "en", 95 | "text": "Hello, how are you?" 96 | } 97 | ``` 98 | 99 | 100 | Here's an example curl command that transcribes an audio file: 101 | 102 | ```sh 103 | curl -X POST \ 104 | -F "audio=@path/to/your/audio/file.wav" \ 105 | http://localhost:3000/transcribe 106 | ``` 107 | 108 | This curl command sends an HTTP POST request to the /transcribe endpoint with form data containing the audio file. The detected language and transcribed text are returned as a JSON object. 109 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import tempfile 4 | import uuid 5 | from flask import Flask, jsonify, request, send_file 6 | from num2words import num2words 7 | from pydub import AudioSegment 8 | import torchaudio 9 | from speechbrain.pretrained import HIFIGAN, Tacotron2 10 | import whisper 11 | 12 | # Flask app 13 | app = Flask(__name__) 14 | 15 | # Load TTS model 16 | tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir_tts") 17 | hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder") 18 | 19 | # TTS file prefix 20 | speech_tts_prefix = "speech-tts-" 21 | wav_suffix = ".wav" 22 | opus_suffix = ".opus" 23 | 24 | # Load transcription model 25 | model = whisper.load_model("base") 26 | 27 | # Clean temporary files (called every 5 minutes) 28 | def clean_tmp(): 29 | tmp_dir = tempfile.gettempdir() 30 | for file in os.listdir(tmp_dir): 31 | if file.startswith(speech_tts_prefix): 32 | os.remove(os.path.join(tmp_dir, file)) 33 | print("[Speech REST API] Temporary files cleaned!") 34 | 35 | # 
# Preprocess text to replace numerals with words
def preprocess_text(text):
    """Return ``text`` with every run of digits spelled out as words.

    Each maximal digit sequence is parsed as an integer and converted with
    ``num2words``; all other characters are left untouched.
    """
    return re.sub(r'\d+', lambda m: num2words(int(m.group(0))), text)


# Run TTS and save file
# Returns the path to the file
def run_tts_and_save_file(text):
    """Synthesize ``text`` and write the waveform to a temporary WAV file.

    The file name starts with ``speech_tts_prefix`` so that ``clean_tmp`` can
    find it later. The caller owns the returned path and must delete it.
    """
    # Running the TTS (only the mel spectrogram is needed downstream)
    mel_outputs, _mel_length, _alignment = tacotron2.encode_batch([text])

    # Running Vocoder (spectrogram-to-waveform)
    waveforms = hifi_gan.decode_batch(mel_outputs)

    # Save wav to a uniquely named file in the temporary directory
    tmp_path_wav = os.path.join(
        tempfile.gettempdir(),
        speech_tts_prefix + str(uuid.uuid4()) + wav_suffix,
    )
    # 22050 is passed as the sample rate of the saved file
    torchaudio.save(tmp_path_wav, waveforms.squeeze(1), 22050)
    return tmp_path_wav


def _remove_files(paths):
    """Delete every path in ``paths``, ignoring files that are already gone."""
    for path in paths:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Best-effort cleanup: the file was never created or a concurrent
            # /clean request removed it first.
            pass


# TTS endpoint
@app.route('/tts', methods=['POST'])
def generate_tts():
    """Generate speech for the ``text`` field of the JSON request body.

    Returns the synthesized audio as an Opus file, or a JSON error with
    status 400 when the body is not a JSON object with a string ``text``
    or when the text contains nothing to synthesize.
    """
    # silent=True yields None for a missing/invalid JSON body instead of
    # aborting, so the client always receives the JSON error below.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or not isinstance(payload.get('text'), str):
        return jsonify({'error': 'Invalid input: text missing'}), 400

    # Remove ' and " from text
    text = payload['text'].replace("'", "").replace('"', "")

    # Preprocess text to replace numerals with words
    text = preprocess_text(text)

    # Split text by . ? ! then trim and drop empty sentences
    sentences = [
        part.strip()
        for part in re.split(r' *[\.\?!][\'"\)\]]* *', text)
        if part.strip()
    ]

    # Logging
    print("[Speech REST API] Got request: length (" + str(len(text)) + "), sentences (" + str(len(sentences)) + ")")

    if not sentences:
        return jsonify({'error': 'Invalid input: text contains no sentences'}), 400

    # Run TTS for each sentence
    output_files = []

    try:
        for sentence in sentences:
            print("[Speech REST API] Generating TTS: " + sentence)
            output_files.append(run_tts_and_save_file(sentence))

        # Concatenate all files
        audio = AudioSegment.empty()
        for file in output_files:
            audio += AudioSegment.from_wav(file)

        # Save audio to file; it keeps the TTS prefix so clean_tmp removes it
        tmp_path_opus = os.path.join(
            tempfile.gettempdir(),
            speech_tts_prefix + str(uuid.uuid4()) + opus_suffix,
        )
        # export() hands back the open output file; close it to avoid a leak
        audio.export(tmp_path_opus, format="opus").close()
    finally:
        # Delete the per-sentence tmp files even when synthesis fails
        _remove_files(output_files)

    # Send file response (media type parameters are separated by ';')
    return send_file(tmp_path_opus, mimetype='audio/ogg; codecs=opus')


# Transcribe endpoint
@app.route('/transcribe', methods=['POST'])
def transcribe():
    """Transcribe the uploaded ``audio`` form file with Whisper.

    Returns a JSON object with the detected ``language`` and the stripped
    transcription ``text``, or a JSON error with status 400 when the
    ``audio`` form field is missing.
    """
    if 'audio' not in request.files:
        return jsonify({'error': 'Invalid input, form-data: audio'}), 400

    # Audio file
    audio_file = request.files['audio']

    # Save audio file into tmp folder
    tmp_path = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))

    try:
        audio_file.save(tmp_path)

        # Load audio and pad/trim it to fit 30 seconds
        audio = whisper.load_audio(tmp_path)
        audio = whisper.pad_or_trim(audio)

        # Make log-Mel spectrogram and move to the same device as the model
        mel = whisper.log_mel_spectrogram(audio).to(model.device)

        # Detect the spoken language
        _, probs = model.detect_language(mel)
        language = max(probs, key=probs.get)

        # Decode the audio
        result = whisper.transcribe(model, tmp_path)
    finally:
        # Delete the uploaded tmp file even when decoding fails
        _remove_files([tmp_path])

    return jsonify({
        'language': language,
        'text': result["text"].strip()
    }), 200


# Health endpoint
@app.route('/health', methods=['GET'])
def health():
    """Report that the service is up."""
    return jsonify({'status': 'ok'}), 200


@app.route('/clean', methods=['GET'])
def clean():
    """Remove leftover TTS files from the temporary directory."""
    clean_tmp()
    return jsonify({'status': 'ok'}), 200


# Entry point
if __name__ == '__main__':
    port = int(os.environ.get('PORT', 3000))

    # Start server
    print("[Speech REST API] Starting server on port " + str(port))

    # Bind to the configured port instead of a hard-coded 3000
    app.run(host='0.0.0.0', port=port)