├── .gitignore
├── requirements.txt
├── README.md
└── app.py


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | tmpdir*
3 | .env


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | certifi==2022.12.7
 2 | charset-normalizer==3.0.1
 3 | click==8.1.3
 4 | docopt==0.6.2
 5 | ffmpeg-python==0.2.0
 6 | filelock==3.9.0
 7 | Flask==2.2.3
 8 | future==0.18.3
 9 | huggingface-hub==0.12.1
10 | HyperPyYAML==1.1.0
11 | idna==3.4
12 | importlib-metadata==6.0.0
13 | itsdangerous==2.1.2
14 | Jinja2==3.1.6
15 | joblib==1.2.0
16 | MarkupSafe==2.1.2
17 | more-itertools==9.0.0
18 | num2words==0.5.12
19 | numpy==1.24.2
20 | openai-whisper==20230124
21 | packaging==23.0
22 | pydub==0.25.1
23 | PyYAML==6.0
24 | regex==2022.10.31
25 | requests==2.28.2
26 | ruamel.yaml==0.17.21
27 | ruamel.yaml.clib==0.2.7
28 | scipy==1.10.1
29 | sentencepiece==0.1.97
30 | speechbrain==0.5.13
31 | tokenizers==0.13.2
32 | torch==1.13.1
33 | torchaudio==0.13.1
34 | tqdm==4.64.1
35 | transformers==4.26.1
36 | typing_extensions==4.5.0
37 | urllib3==1.26.14
38 | Werkzeug==2.2.3
39 | zipp==3.14.0
40 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Speech REST API
  2 | 
  3 | This project provides a simple Flask API to transcribe speech from an audio file using the Whisper speech recognition library.
  4 | The API loads a pre-trained deep learning model to detect the spoken language and transcribe the speech to text.
  5 | 
  6 | It also provides an endpoint to generate speech from text using the Tacotron2 and HiFiGAN models.
  7 | 
  8 | ## Requirements
  9 | 
 10 | - Python 3.9 or later
 11 | 
 12 | ## Installation
 13 | 
 14 | ```bash
 15 | # Clone the repository
 16 | git clone https://github.com/askrella/speech-rest-api.git
 17 | 
 18 | # Navigate to the project directory
 19 | cd speech-rest-api
 20 | 
 21 | # Install ffmpeg (Ubuntu & Debian)
 22 | sudo apt update && sudo apt install ffmpeg -y
 23 | 
 24 | # Install the dependencies
 25 | pip install -r requirements.txt
 26 | 
 27 | # (Optional) Set PORT environment variable
 28 | export PORT=3000
 29 | 
 30 | # Run the REST API
 31 | python app.py
 32 | ```
 33 | 
 34 | ## Documentation
 35 | 
 36 | This endpoint generates speech from a text using the Tacotron2 and HiFiGAN models.
 37 | 
 38 | ```bash
 39 | POST http://localhost:3000/tts
 40 | ```
 41 | 
 42 | The request body must be a JSON object with the following field:
 43 | 
 44 | - text (required): The text you want to hear spoken in the audio file.
 45 | 
 46 | Example:
 47 | 
 48 | ```json
 49 | {
 50 |     "text": "Hello, I can do some text to speech. That's awesome!"
 51 | }
 52 | ```
 53 | 
 54 | 
 55 | The response is an Ogg audio file (Opus codec) containing the generated speech.
 56 | 
 57 | 
 58 | Here's an example curl command that generates speech from a text containing several sentences:
 59 | 
 60 | ```sh
 61 | curl -X POST \
 62 |   -H "Content-Type: application/json" \
 63 |   --data '{"text": "Hello, how are you? My name is Shubh. It is a pleasure to meet you."}' \
 64 |   http://localhost:3000/tts \
 65 |   --output output.opus
 66 | ```
 67 | 
 68 | This curl command sends a JSON payload whose text contains three sentences to the /tts endpoint, and saves the resulting audio file to output.opus.
 69 | 
 70 | 
 71 | This endpoint transcribes audio files using the Whisper model.
 72 | 
 73 | 
 74 | ```bash
 75 | POST http://localhost:3000/transcribe
 76 | ```
 77 | 
 78 | 
 79 | The request body must be a form data object containing an audio file.
 80 | 
 81 | Example:
 82 | 
 83 | ```form
 84 | audio=@path/to/your/audio/file.wav
 85 | ```
 86 | 
 87 | 
 88 | The response is a JSON object containing the detected language and the transcribed text.
 89 | 
 90 | Example:
 91 | 
 92 | ```json
 93 | {
 94 |     "language": "en",
 95 |     "text": "Hello, how are you?"
 96 | }
 97 | ```
 98 | 
 99 | 
100 | Here's an example curl command that transcribes an audio file:
101 | 
102 | ```sh
103 | curl -X POST \
104 |   -F "audio=@path/to/your/audio/file.wav" \
105 |   http://localhost:3000/transcribe
106 | ```
107 | 
108 | This curl command sends an HTTP POST request to the /transcribe endpoint with a form data containing the audio file. The detected language and transcribed text are returned as a JSON object.
109 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | import tempfile
  4 | import uuid
  5 | from flask import Flask, jsonify, request, send_file
  6 | from num2words import num2words
  7 | from pydub import AudioSegment
  8 | import torchaudio
  9 | from speechbrain.pretrained import HIFIGAN, Tacotron2
 10 | import whisper
 11 | 
 12 | # Flask app
 13 | app = Flask(__name__)
 14 | 
 15 | # Load TTS model
 16 | tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir_tts")
 17 | hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")
 18 | 
 19 | # TTS file prefix
 20 | speech_tts_prefix = "speech-tts-"
 21 | wav_suffix = ".wav"
 22 | opus_suffix = ".opus"
 23 | 
 24 | # Load transcription model
 25 | model = whisper.load_model("base")
 26 | 
 27 | # Clean temporary files (called every 5 minutes)
 28 | def clean_tmp():
 29 |     tmp_dir = tempfile.gettempdir()
 30 |     for file in os.listdir(tmp_dir):
 31 |         if file.startswith(speech_tts_prefix):
 32 |             os.remove(os.path.join(tmp_dir, file))
 33 |     print("[Speech REST API] Temporary files cleaned!")
 34 | 
 35 | # Preprocess text to replace numerals with words
 36 | def preprocess_text(text):
 37 |     text = re.sub(r'\d+', lambda m: num2words(int(m.group(0))), text)
 38 |     return text
 39 | 
 40 | # Run TTS and save file
 41 | # Returns the path to the file
 42 | def run_tts_and_save_file(text):
 43 |     # Running the TTS
 44 |     mel_outputs, mel_length, alignment = tacotron2.encode_batch([text])
 45 | 
 46 |     # Running Vocoder (spectrogram-to-waveform)
 47 |     waveforms = hifi_gan.decode_batch(mel_outputs)
 48 | 
 49 |     # Get temporary directory
 50 |     tmp_dir = tempfile.gettempdir()
 51 | 
 52 |     # Save wav to temporary file
 53 |     tmp_path_wav = os.path.join(tmp_dir, speech_tts_prefix + str(uuid.uuid4()) + wav_suffix)
 54 |     torchaudio.save(tmp_path_wav, waveforms.squeeze(1), 22050)
 55 |     return tmp_path_wav
 56 | 
 57 | # TTS endpoint
 58 | @app.route('/tts', methods=['POST'])
 59 | def generate_tts():
 60 |     if not request.json or 'text' not in request.json:
 61 |         return jsonify({'error': 'Invalid input: text missing'}), 400
 62 | 
 63 |     # Sentences to generate
 64 |     text = request.json['text']
 65 | 
 66 |     # Remove ' and " and  from text
 67 |     text = text.replace("'", "")
 68 |     text = text.replace('"', "")
 69 | 
 70 |     # Preprocess text to replace numerals with words
 71 |     text = preprocess_text(text)
 72 | 
 73 |     # Split text by . ? !
 74 |     sentences = re.split(r' *[\.\?!][\'"\)\]]* *', text)
 75 | 
 76 |     # Trim sentences
 77 |     sentences = [sentence.strip() for sentence in sentences]
 78 | 
 79 |     # Remove empty sentences
 80 |     sentences = [sentence for sentence in sentences if sentence]
 81 | 
 82 |     # Logging
 83 |     print("[Speech REST API] Got request: length (" + str(len(text)) + "), sentences (" + str(len(sentences)) + ")")
 84 | 
 85 |     # Run TTS for each sentence
 86 |     output_files = []
 87 | 
 88 |     for sentence in sentences:
 89 |         print("[Speech REST API] Generating TTS: " + sentence)
 90 |         tmp_path_wav = run_tts_and_save_file(sentence)
 91 |         output_files.append(tmp_path_wav)
 92 | 
 93 |     # Concatenate all files
 94 |     audio = AudioSegment.empty()
 95 | 
 96 |     for file in output_files:
 97 |         audio += AudioSegment.from_wav(file)
 98 | 
 99 |     # Save audio to file
100 |     tmp_dir = tempfile.gettempdir()
101 |     tmp_path_opus = os.path.join(tmp_dir, speech_tts_prefix + str(uuid.uuid4()) + opus_suffix)
102 |     audio.export(tmp_path_opus, format="opus")
103 | 
104 |     # Delete tmp files
105 |     for file in output_files:
106 |         os.remove(file)
107 | 
108 |     # Send file response
109 |     return send_file(tmp_path_opus, mimetype='audio/ogg, codecs=opus')
110 | 
# Transcribe endpoint
@app.route('/transcribe', methods=['POST'])
def transcribe():
    """Transcribe the uploaded ``audio`` form file with the Whisper model.

    The upload is written to a uniquely named file in the system temporary
    directory. The spoken language is detected from the audio after it has
    been padded or trimmed by ``whisper.pad_or_trim``, and the whole file is
    then transcribed. The temporary file is removed afterwards, also when
    loading or transcription raises.

    Returns:
        A JSON body with ``language`` and the whitespace-trimmed ``text`` and
        status 200, or a JSON ``{"error": ...}`` body with status 400 when
        the request has no ``audio`` file part.
    """
    if 'audio' not in request.files:
        return jsonify({'error': 'Invalid input, form-data: audio'}), 400

    # Audio file
    audio_file = request.files['audio']

    # Save audio file into tmp folder
    tmp_path = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    try:
        audio_file.save(tmp_path)

        # Load audio and pad/trim it to fit 30 seconds
        audio = whisper.load_audio(tmp_path)
        audio = whisper.pad_or_trim(audio)

        # Make log-Mel spectrogram and move to the same device as the model
        mel = whisper.log_mel_spectrogram(audio).to(model.device)

        # Detect the spoken language: pick the most probable candidate
        _, probs = model.detect_language(mel)
        language = max(probs, key=probs.get)

        # Decode the audio
        result = whisper.transcribe(model, tmp_path)
        text_result_trim = result["text"].strip()
    finally:
        # Delete tmp file even if decoding failed, so bad uploads do not
        # accumulate in the temp directory
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            # save() failed before the file was created
            pass

    return jsonify({
        'language': language,
        'text': text_result_trim
    }), 200
148 | 
# Health endpoint
@app.route('/health', methods=['GET'])
def health():
    """Answer every request with status 200 and the JSON body ``{"status": "ok"}``."""
    status_body = {'status': 'ok'}
    return jsonify(status_body), 200
153 | 
# Cleanup endpoint
@app.route('/clean', methods=['GET'])
def clean():
    """Run ``clean_tmp`` and then answer 200 with the JSON body ``{"status": "ok"}``."""
    clean_tmp()
    status_body = {'status': 'ok'}
    return jsonify(status_body), 200
158 | 
# Entry point
if __name__ == '__main__':
    # Port comes from the PORT environment variable; 3000 when it is not set
    port = int(os.environ.get('PORT', 3000))

    # Start server
    print("[Speech REST API] Starting server on port " + str(port))

    # Listen on all interfaces, on the configured port so that PORT is honored
    app.run(host='0.0.0.0', port=port)


--------------------------------------------------------------------------------