├── requirements.txt
├── static
│   ├── audio_mobile.css
│   ├── audio_desktop.css
│   └── audio.js
├── README.md
├── templates
│   ├── inputs.html
│   └── output.html
├── translation.py
└── translation_GOAT.py
/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask==2.2.3
2 | Flask-Bootstrap==3.3.7.1
3 | Flask-SocketIO==5.3.2
4 | Jinja2==3.1.2
5 | moviepy==1.0.3
6 | ngrok==0.1.6
7 | nltk==3.8.1
8 | openai==0.27.1
9 | pytube==12.1.2
10 | requests==2.28.2
11 |
--------------------------------------------------------------------------------
/static/audio_mobile.css:
--------------------------------------------------------------------------------
1 | body {
2 |     padding-bottom: 150px;
3 | }
4 |
5 | .container {
6 |     display: flex;
7 |     flex-direction: column;
8 |     align-items: center;
9 | }
10 |
11 |
12 | .user-label {
13 |     background-color: #f5f5f5;
14 | }
15 |
16 | .userInput-label {
17 |     font-weight: bold;
18 |     color: blue;
19 |     font-size: 20px;
20 | }
21 |
22 | .user-message {
23 |     background-color: #FAFAFA;
24 |     padding: 25px;
25 |     width: 83%;
26 |
27 | }
28 |
29 | .bot-message {
30 |     background-color: #f5f5f5;
31 |     padding: 25px;
32 | }
33 |
34 |
35 | .audio-container {
36 |     top: 20px;
37 |
38 |     position: fixed;
39 |     justify-content: center;
40 |     align-items: center;
41 | }
42 |
43 | .audio-container audio {
44 |     top: 20px;
45 |     justify-content: center;
46 |     align-items: center;
47 | }
48 |
49 |
50 | .video-container {
51 |     position: relative;
52 |     margin: 0 auto;
53 | }
54 |
55 | .video-container video {
56 |     display: block;
57 |     width: 100%;
58 |     height: auto;
59 |     width: 320px;
60 |     height: 240px;
61 | }
62 | .convo-heading {
63 |     background-color: #f5f5f5;
64 |     padding: 10px;
65 | }
66 |
67 | .conversation {
68 |     background-color: #FAFAFA;
69 |     align-items: center;
70 |     margin-right: 40px;
71 |     margin-left: 40px;
72 | }
73 |
74 | .lang-heading {
75 |     background-color: #f5f5f5;
76 |     padding: 10px;
77 | }
78 |
79 | .form-grouping {
80 |     position: fixed;
81 |
82 | }
83 |
84 | .mt-3 {
85 |     position: fixed;
86 |     bottom: 10px;
87 |     background-color: #FFFFFF;
88 |     width: 83%;
89 | }
90 |
91 | .centered {
92 |     display: flex;
93 |     flex-direction: column;
94 |     align-items: center;
95 |
96 | }
97 |
98 | .col-md-6 {
99 |     width: 80%;
100 | }
101 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Speech-to-Speech Translation App
2 |
3 | This is a Flask web app that lets you translate audio and video files into an audio output in any language of your choice using OpenAI's Whisper, gpt-3.5-turbo, and ElevenLabs. It uses the pytube, moviepy, and pydub libraries to download and process the videos, and nltk for tokenizing the text.
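Under the hood the app chains three services: Whisper transcribes the uploaded media, gpt-3.5-turbo translates the transcript, and ElevenLabs synthesizes the translation as speech (see `translation.py`). Below is a minimal sketch of that pipeline, assuming the legacy `openai==0.27` SDK pinned in `requirements.txt` and the same environment variables the app reads (`OPENAI_API_KEY` for OpenAI, `user` for the ElevenLabs key); `translate_clip` and the hard-coded `VOICE_ID` are illustrative names, not part of this repo:

```python
import os
import openai
import requests

openai.api_key = os.environ["OPENAI_API_KEY"]
ELEVEN_API_KEY = os.environ["user"]        # ElevenLabs key, same env var translation.py reads
VOICE_ID = "pNInz6obpgDQGcFmaJgB"          # default voice id used in translation.py

def translate_clip(audio_path: str, target_language: str) -> bytes:
    # 1. Speech-to-text with Whisper
    with open(audio_path, "rb") as audio_file:
        text = openai.Audio.transcribe("whisper-1", audio_file).text

    # 2. Translate the transcript with gpt-3.5-turbo
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You're a professional language translator"},
            {"role": "user", "content": f"Translate {text} to {target_language}"},
        ],
    )
    translated = completion.choices[0].message.content

    # 3. Text-to-speech with ElevenLabs
    response = requests.post(
        f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}",
        headers={"xi-api-key": ELEVEN_API_KEY, "Content-Type": "application/json"},
        json={"text": translated},
    )
    return response.content  # MP3 bytes, ready to save or stream
```

The Flask app wraps this same flow behind its `/upload` route and streams the synthesized audio back to the browser over Socket.IO.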
4 |
5 | ## Watch a simple demo of the project:
6 |
7 | https://user-images.githubusercontent.com/96517814/229331997-4525a982-a482-462a-8a3a-559073905151.mp4
8 |
9 | ## Installation
10 | Clone this repository and navigate to the project directory:
11 |
12 | ```bash
13 | git clone https://github.com//.git
14 | cd 
15 | ```
16 |
17 | Create a virtual environment and install the required packages:
18 |
19 | ```bash
20 | python -m venv venv
21 | source venv/bin/activate  # on Windows, use "venv\Scripts\activate"
22 | pip install -r requirements.txt
23 | ```
24 |
25 | ## Usage
26 | Run the app locally using Flask:
27 |
28 | ```bash
29 | export FLASK_APP=translation.py
30 | export FLASK_ENV=development
31 | flask run
32 | ```
33 | Navigate to `http://localhost:5000/` in your web browser to access the app.
34 |
35 | ## Features
36 | - Upload audio or video files or links.
37 | - Transcribe audio or video and generate a text output.
38 | - Chunk transcripts of over 3000 tokens before translation.
39 | - Use the OpenAI API to translate the transcript.
40 | - Use the ElevenLabs API to convert the translated text to audio.
41 |
42 | ## Dependencies
43 | ```
44 | - Flask
45 | - Flask-SocketIO
46 | - Flask-Bootstrap
47 | - pytube
48 | - moviepy
49 | - pydub
50 | - nltk
51 | - OpenAI API key
52 | - ElevenLabs API key
53 | ```
54 |
55 | ## Contributing
56 | Contributions to this project are welcome. To contribute, please follow these steps:
57 | 1. Fork this repository.
58 | 2. Create a new branch: `git checkout -b my-new-branch`
59 | 3. Make your changes and commit them: `git commit -m "Add some feature"`
60 | 4. Push to the branch: `git push origin my-new-branch`
61 | 5. Create a new pull request.
62 | 6. Please include a clear description of your changes and their purpose.
63 |
--------------------------------------------------------------------------------
/static/audio_desktop.css:
--------------------------------------------------------------------------------
1 | body {
2 |     padding-bottom: 150px;
3 | }
4 |
5 | .user-label {
6 |     background-color: #f5f5f5;
7 | }
8 |
9 | .userInput-label {
10 |     font-weight: bold;
11 |     color: blue;
12 |     font-size: 20px;
13 | }
14 |
15 | .user-message {
16 |     background-color: #FAFAFA;
17 |     padding: 25px;
18 | }
19 |
20 | .bot-message {
21 |     background-color: #f5f5f5;
22 |     padding: 25px;
23 | }
24 |
25 | .audio-container {
26 |     top: 50%;
27 |
28 |     position: fixed;
29 |     width: 45%;
30 |     justify-content: center;
31 |     align-items: center;
32 | }
33 |
34 | .audio-container audio {
35 |     top: 50%;
36 |     justify-content: center;
37 |     align-items: center;
38 | }
39 |
40 | .video-container {
41 |     position: fixed;
42 |     width: 45%;
43 |     top: 10px;
44 |     left: 30px;
45 | }
46 |
47 | .video-container video {
48 |     position: absolute;
49 |     top: 0;
50 |     left: 0;
51 |     width: 100%;
52 | }
53 |
54 |
55 | .col-md-4 {
56 |     bottom: 10px;
57 | }
58 |
59 | .container {
60 |     display: flex;
61 |     flex-direction: row;
62 | }
63 |
64 | .col-md-6 {
65 |     width: 70%;
66 |     padding: 10px;
67 |     box-sizing: border-box;
68 | }
69 |
70 | .col-md-7 {
71 |     width: 30%;
72 |     padding: 10px;
73 |     box-sizing: border-box;
74 | }
75 |
76 |
77 | .centered {
78 |     left: 50%;
79 | }
80 |
81 | .conversation {
82 |     background-color: #FAFAFA;
83 |     padding: 50px;
84 |     padding-top: 120px;
85 |     height: 100%;
86 | }
87 |
88 | .lang-heading {
89 |     background-color: #f5f5f5;
90 |     padding: 10px;
91 | }
92 |
93 |
94 | .convo-heading {
95 |     background-color: #f5f5f5;
96 |     padding: 10px;
97 | }
98 |
99 | .form-grouping {
100 |     position: fixed;
101 |
102 | }
103 |
104 | .mt-3 {
105 |     position: fixed;
106 |     bottom: 10px;
107 |     background-color: #FFFFFF;
108 |     width: 47%;
109 |     left: 51%;
110 | }
111 |
--------------------------------------------------------------------------------
/templates/inputs.html:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | Video Transcription App 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 16 | 17 | 18 |
19 |

Audio to Audio Translation

20 |


Translate audio to any language within minutes, and break language barriers.

21 |
22 |
23 |
Video file?
24 |
25 | 26 |
27 | 28 |
29 |
30 |
31 |
YouTube video?
32 |
33 | 34 | 35 |
36 | 37 |
38 |
39 | 40 | 41 | 42 |
43 |
Audio file?
44 |
45 | 46 |
47 | 48 |
49 | 50 |
51 | 52 |
53 |
Audio link?
54 |
55 | 56 | 57 | 58 |
59 | 60 |
61 | 62 |
63 | 64 | {% for message in get_flashed_messages() %} 65 |
{{ message }}
66 | {% endfor %} 67 | 68 | 69 | -------------------------------------------------------------------------------- /templates/output.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Video Transcription App 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | {% if request.user_agent.platform == 'android' or request.user_agent.platform == 'iphone' %} 13 | 14 | {% else %} 15 | 16 | {% endif %} 17 | 18 | 19 | 21 | 22 | 23 |
24 | 25 |
26 |
27 | 28 | {% if video_url %} 29 |
30 |

Watch, Ask

31 | 32 |
33 | 34 | {% endif %} 35 | 36 | {% if filename %} 37 |
38 | 41 |
42 | 43 | {% endif %} 44 | 45 |
46 |
47 | 48 |
49 | 50 |
51 | 52 |
53 |
54 |
55 | 56 | 57 |
58 | 59 |
60 | 61 | 62 | 63 |
64 | Choose thy tongue of choice, and it shall be granted! 65 |
66 |
67 | 68 | 73 |
74 | 75 |
76 | 77 |
78 |
79 |

80 |

81 |
82 |
83 |
84 |
85 |
86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /static/audio.js: -------------------------------------------------------------------------------- 1 | socket = io(); 2 | // Connect to the WebSocket server 3 | socket.connect('https://divineux23-code50-96517814-445597p7x7f5rx6-5000.preview.app.github.dev/'); 4 | // When the SocketIO connection is established 5 | socket.on('connect', function() { 6 | console.log('Connected to server'); 7 | }); 8 | 9 | // Send user input to the server when the form is submitted 10 | const form = document.getElementById('chat-form'); 11 | console.log(form); 12 | const input = document.getElementById('user_input'); 13 | const chat = document.getElementById('conversation-history'); 14 | 15 | const botAudio = document.getElementById('bot-audio'); 16 | 17 | form.addEventListener('submit', function(event) { 18 | event.preventDefault(); 19 | console.log('Form submitted'); 20 | 21 | // Handle submission of user input to server 22 | console.log(input.value); 23 | const message = document.createElement('div'); 24 | message.innerHTML = `LANGUAGE: ${input.value}`; 25 | 26 | message.classList.add('user-message'); 27 | chat.appendChild(message); 28 | socket.emit('user_input', input.value); 29 | input.value = ''; 30 | 31 | 32 | // Handle submission of voice to server 33 | const voiceSelect = document.getElementById('choice'); 34 | const selectedVoice = voiceSelect.value; 35 | console.log(selectedVoice); 36 | socket.emit('voice_id', selectedVoice); 37 | }); 38 | 39 | 40 | // Listen for incoming bot responses 41 | socket.on('bot_response', function(data) { 42 | 43 | // Update the chat UI with the bot response 44 | console.log(data); 45 | const chat = document.getElementById('conversation-history'); 46 | const message = document.createElement('div'); 47 | 48 | const ai = document.createElement('span'); 49 | ai.innerText = 'TEXT: '; 50 | ai.style.color = 'blue'; 51 | ai.style.fontWeight = 'meduim'; 52 | ai.style.fontSize = '14px'; 53 | 54 | message.appendChild(ai); 55 | 56 | const response = document.createElement('span'); 57 | response.innerText = data; 58 | response.style.color = 'black'; 59 | 60 | message.appendChild(response); 61 | 62 | message.classList.add('bot-message'); 63 | 64 | chat.appendChild(message); 65 | 66 | }); 67 | 68 | 69 | // Handle the new_audio event 70 | socket.on('new_audio', data => { 71 | 72 | console.log(data); 73 | 74 | const key = new Blob([data.data], { type: data.type }); 75 | 76 | const url = URL.createObjectURL(key); 77 | 78 | const audio = new Audio(url); 79 | 80 | audio.controls = true; 81 | 82 | const chat = document.getElementById('conversation-history'); 83 | 84 | chat.appendChild(audio); 85 | 86 | }); 87 | 88 | 89 | 90 | //Deleting the video 91 | 92 | window.addEventListener("beforeunload", function(event) { 93 | var xhr = new XMLHttpRequest(); 94 | xhr.open("POST", "/delete_video", true); 95 | xhr.send(); 96 | }); 97 | 98 | 99 | var timeout; 100 | 101 | function deleteVideoFile() { 102 | var xhr = new XMLHttpRequest(); 103 | xhr.open('POST', '/delete_video', true); 104 | xhr.send(); 105 | } 106 | 107 | function startTimeout() { 108 | timeout = setTimeout(deleteVideoFile, 1800000); 109 | } 110 | 111 | function clearTimeoutIfInteracted() { 112 | clearTimeout(timeout); 113 | document.removeEventListener('mousemove', clearTimeoutIfInteracted); 114 | } 115 | 116 | function clearTimeIfInteracted(){ 117 | clearTimeout(timeout); 118 | document.removeEventListener('click', 
clearTimeoutIfInteracted);
119 | }
120 | function clearAndDelete() {
121 |     clearTimeout(timeout);
122 |     deleteVideoFile();
123 | }
124 |
125 | startTimeout();
126 |
127 | document.addEventListener('click', clearTimeoutIfInteracted);
128 |
129 | document.addEventListener('mousemove', clearTimeoutIfInteracted);
130 |
131 | window.addEventListener('beforeunload', clearAndDelete);
132 |
--------------------------------------------------------------------------------
/translation.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, render_template, request, send_file, Response
2 | from flask_bootstrap import Bootstrap
3 | from pytube import YouTube
4 | import openai
5 |
6 | # Using SocketIO for JS interaction:
7 | from flask_socketio import SocketIO, emit
8 | from flask import session
9 | import os
10 | import io
11 |
12 | # ElevenLabs:
13 | import requests
14 |
15 | # chunking video into segments:
16 | from moviepy.video.io.VideoFileClip import VideoFileClip
17 | from pydub import AudioSegment
18 | import math
19 |
20 | # for playing video:
21 | import uuid
22 | from moviepy.audio.io.AudioFileClip import AudioFileClip
23 |
24 | # chunking text of over 3000 tokens:
25 | import nltk
26 | from nltk.tokenize import sent_tokenize
27 | from nltk.tokenize import word_tokenize
28 |
29 |
30 | # Use your own API key
31 | openai.api_key = os.environ["OPENAI_API_KEY"]
32 |
33 | # ElevenLabs API key
34 | user = os.environ["user"]
35 |
36 |
37 | transcript = []
38 |
39 | conversation_history = []
40 |
41 | bot_response = None
42 |
43 | prompt = None
44 |
45 | filepath = None
46 |
47 | current_filepath = None
48 |
49 | voice = None
50 |
51 | app = Flask(__name__)
52 | app.config['SECRET_KEY'] = 'divine'
53 | app.config['UPLOAD_FOLDER'] = 'static'
54 |
55 | socketio = SocketIO(app)
56 | Bootstrap(app)
57 |
58 |
59 | @app.route('/')
60 | def index():
61 |     return render_template('inputs.html')
62 |
63 | @socketio.on('connect')
64 | def handle_connect():
65 |     print('Client connected')
66 |
67 |
68 | # Upload video page
69 | @app.route('/upload', methods=['GET', 'POST'])
70 | def upload():
71 |     global transcript
72 |     global prompt
73 |     global bot_response
74 |     global conversation_history
75 |     global filepath
76 |     global current_filepath
77 |
78 |
79 |     if request.method == 'POST':
80 |         if 'file' in request.files:
81 |             file = request.files['file']
82 |             filename = file.filename
83 |             filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
84 |             file.save(filepath)
85 |
86 |             # Transcribe video and generate transcript
87 |             transcript = transcribe_video(filepath)
88 |             print(transcript)
89 |             current_filepath = filepath
90 |
91 |             return render_template('output.html', video_url=filepath, transcript=transcript)
92 |
93 |
94 |         elif 'youtube_link' in request.form:
95 |             youtube_link = request.form['youtube_link']
96 |
97 |             # Use pytube to download the YouTube video
98 |             yt = YouTube(youtube_link)
99 |             stream = yt.streams.get_highest_resolution()
100 |             file = stream.download(output_path='static', filename='my_video.mp4')
101 |             filepath = os.path.join('static', 'my_video.mp4')
102 |
103 |             # Transcribe video and generate transcript
104 |             transcript = transcribe_video(filepath)
105 |             print(transcript)
106 |             current_filepath = filepath
107 |
108 |             return render_template('output.html', video_url=filepath, transcript=transcript)
109 |
110 |
111 |
112 |         elif 'audio' in request.files:
113 |             file = request.files['audio']
114 |             filename = str(uuid.uuid4()) + '.' + file.filename.split('.')[-1]
115 |             filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
116 |             file.save(filepath)
117 |
118 |             # Transcribe audio and generate transcript
119 |             transcript = transcribe_audio(filepath)
120 |             print(transcript)
121 |             current_filepath = filepath
122 |
123 |             # Return the path to the uploaded audio file
124 |             return render_template('output.html', filename=filepath, transcript=transcript)
125 |
126 |
127 |         elif 'link' in request.form:
128 |             link = request.form['link']
129 |             response = requests.get(link)
130 |             filename = str(uuid.uuid4()) + '.mp3'
131 |             filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
132 |             with open(filepath, 'wb') as f:
133 |                 f.write(response.content)
134 |
135 |             # Transcribe audio and generate transcript
136 |             transcript = transcribe_audio(filepath)
137 |             print(transcript)
138 |             current_filepath = filepath
139 |
140 |             # Return the path to the downloaded audio file
141 |             return render_template('output.html', filename=filepath, transcript=transcript)
142 |
143 |         return render_template('output.html')
144 |     else:
145 |
146 |         return render_template('output.html')
147 |
148 |
149 |
150 | # Play video on page
151 | @app.route('/play/<path:video_url>')
152 | def play(video_url):
153 |     # Remove the extra 'static' directory from the file path
154 |     file_path = os.path.join(app.config['UPLOAD_FOLDER'], video_url.replace('static/', '', 1))
155 |     return send_file(file_path, mimetype='video/mp4')
156 |
157 |
158 | # Play audio on page
159 | @app.route('/play_file/<path:filename>')
160 | def play_file(filename):
161 |     file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename.replace('static/', '', 1))
162 |     return send_file(file_path, mimetype='audio/mp3')
163 |
164 |
165 |
166 | # Generate the video transcript with Whisper
167 | def transcribe_video(filepath):
168 |
169 |     # Load the video file
170 |     video = VideoFileClip(filepath)
171 |     segment_duration = 10 * 60  # seconds
172 |     transcripts = []
173 |     num_segments = math.ceil(video.duration / segment_duration)
174 |
175 |     # Loop through the segments
176 |     for i in range(num_segments):
177 |
178 |         start_time = i * segment_duration
179 |         end_time = min((i + 1) * segment_duration, video.duration)
180 |         segment = video.subclip(start_time, end_time)
181 |         segment_name = f"segment_{i+1}.mp3"
182 |         segment.audio.write_audiofile(segment_name)
183 |
184 |         # Pass the audio segment to Whisper for speech recognition
185 |         audio = open(segment_name, "rb")
186 |         transcripting = openai.Audio.transcribe("whisper-1", audio).text
187 |         transcripts.append(transcripting)
188 |         os.remove(segment_name)
189 |
190 |     transcript = "\n".join(transcripts)
191 |     return transcript
192 |
193 |
194 | # Generate the audio transcript with Whisper
195 | def transcribe_audio(filepath):
196 |
197 |     audio = AudioFileClip(filepath)
198 |     segment_duration = 10 * 60  # seconds
199 |     transcripts = []
200 |     num_segments = math.ceil(audio.duration / segment_duration)
201 |
202 |     # Loop through the segments
203 |     for i in range(num_segments):
204 |
205 |         start_time = i * segment_duration
206 |         end_time = min((i + 1) * segment_duration, audio.duration)
207 |         segment = audio.subclip(start_time, end_time)
208 |         segment_name = f"segment_{i+1}.mp3"
209 |         segment.write_audiofile(segment_name)
210 |
211 |         # Pass the audio segment to Whisper for speech recognition
212 |         audio_file = open(segment_name, "rb")  # separate name so the clip in `audio` is not clobbered
213 |         transcripting = openai.Audio.transcribe("whisper-1", audio_file).text
214 | 
transcripts.append(transcripting) 215 | 216 | os.remove(segment_name) 217 | transcript = "\n".join(transcripts) 218 | 219 | return transcript 220 | 221 | 222 | # Getting users to choose a voice 223 | 224 | @socketio.on('voice_id') 225 | def get_audio(voice_id): 226 | 227 | global voice 228 | 229 | print(f"Voice ID = {voice_id}") 230 | 231 | word2 = "Jane" 232 | 233 | if set(voice_id) == set(word2): 234 | voice = 'EXAVITQu4vr4xnSDxMaL' 235 | 236 | else: 237 | voice = 'pNInz6obpgDQGcFmaJgB' 238 | 239 | print(f"Voice ID = {voice}") 240 | 241 | 242 | 243 | #opeanAI for the chat converation: 244 | nltk.download('punkt') 245 | 246 | @socketio.on('user_input') 247 | 248 | def handle_conversation(user_input): 249 | 250 | 251 | print(f"Voice ID 2 = {voice}") 252 | 253 | global bot_response 254 | 255 | 256 | if len(word_tokenize(transcript)) <= 3000: 257 | 258 | print("Token count less = ", len(word_tokenize(str(transcript)))) 259 | 260 | bot_response = generate_response(transcript, user_input) 261 | 262 | print(f"less than 3000 tokens = {bot_response}\n") 263 | 264 | else: 265 | 266 | print("Token count more = ", len(word_tokenize(transcript))) 267 | chunk_size = 3000 268 | chunks = [] 269 | sentences = sent_tokenize(transcript) 270 | current_chunk = "" 271 | 272 | for sentence in sentences: 273 | tokens = nltk.word_tokenize(sentence) 274 | 275 | if len(current_chunk.split()) + len(tokens) <= chunk_size: 276 | current_chunk += " " + sentence 277 | 278 | else: 279 | chunks.append(current_chunk.strip()) 280 | current_chunk = sentence 281 | 282 | print(f"TOKEN LENT OF unsent CHUNK = \n\n{len(word_tokenize(str(current_chunk.strip())))}\n\n\n") 283 | 284 | if current_chunk: 285 | chunks.append(current_chunk.strip()) 286 | 287 | 288 | responses = [] 289 | for chunk in chunks: 290 | response = generate_response(chunk, user_input) 291 | 292 | print(f"TOKEN LENT OF CHUNK = \n\n{len(word_tokenize(str(response)))}\n\n\n") 293 | 294 | responses.append(response) 295 | 296 | joined_response = ' '.join(responses) 297 | bot_response = joined_response 298 | 299 | new_audio = audio_output(bot_response, voice) 300 | 301 | # Create a Flask response object with the mp3 data and appropriate headers 302 | response = Response(new_audio, mimetype='audio/mpeg') 303 | response.headers.set('Content-Disposition', 'attachment', filename='responding.mp3') 304 | 305 | # Emit the audio data to the client-side 306 | socketio.emit('new_audio', {'data': new_audio, 'type': 'audio/mpeg'}) 307 | socketio.emit('bot_response', bot_response) 308 | 309 | 310 | 311 | 312 | #passing transcript or each chucks to chatgpt 313 | def generate_response(transcript, user_input): 314 | 315 | prompt = f"Translate {transcript} to {user_input}, don't say anything else except the translation," 316 | 317 | completion = openai.ChatCompletion.create( 318 | model="gpt-3.5-turbo", 319 | messages=[ 320 | {"role": "system", "content": "You're a proffesional language translator"}, 321 | {"role": "user", "content": prompt} 322 | ] 323 | ) 324 | bot_first_response = completion.choices[0].message.content 325 | 326 | return bot_first_response 327 | 328 | 329 | 330 | #Eleven-labs: Text to audio for new lang 331 | def audio_output(bot_response, voice): 332 | 333 | print(voice) 334 | 335 | CHUNK_SIZE = 1024 336 | 337 | 338 | url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice}/stream" 339 | 340 | headers = { 341 | "Accept": "audio/mpeg", 342 | "Content-Type": "application/json", 343 | "xi-api-key": user 344 | } 345 | 346 | data = { 347 | "text": bot_response, 348 
| "voice_settings": { 349 | "stability": 0, 350 | "similarity_boost": 0 351 | } 352 | } 353 | response = requests.post(url, json=data, headers=headers, stream=True) 354 | 355 | 356 | audio_data = io.BytesIO() 357 | for chunk in response.iter_content(chunk_size=CHUNK_SIZE): 358 | if chunk: 359 | audio_data.write(chunk) 360 | 361 | return audio_data.getvalue() 362 | 363 | 364 | 365 | #Automatic delete video/audio 366 | @app.route('/delete_video', methods=['POST']) 367 | def delete_video(): 368 | global current_filepath 369 | print("Dead & Gone") 370 | 371 | if os.path.exists(current_filepath): 372 | os.remove(current_filepath) 373 | print("Dead & Gone") 374 | 375 | return "Ooops! Time out" 376 | 377 | 378 | if __name__ == '__main__': 379 | app.run(host='0.0.0.0', port=5000) 380 | 381 | 382 | -------------------------------------------------------------------------------- /translation_GOAT.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request, send_file, Response 2 | from flask_bootstrap import Bootstrap 3 | from pytube import YouTube 4 | import openai 5 | 6 | #Using socketIO to for js interaction: 7 | from flask_socketio import SocketIO, emit 8 | from flask import session 9 | import os 10 | import io 11 | 12 | #Elevenlabs: 13 | import requests 14 | 15 | #chucking video: 16 | from moviepy.video.io.VideoFileClip import VideoFileClip 17 | from pydub import AudioSegment 18 | import math 19 | 20 | #for playing vidoe: 21 | import uuid 22 | from moviepy.audio.io.AudioFileClip import AudioFileClip 23 | 24 | #chucking words of over 3000 tokens: 25 | import nltk 26 | from nltk.tokenize import sent_tokenize 27 | from nltk.tokenize import word_tokenize 28 | 29 | 30 | # Use your own API key 31 | openai.api_key = os.environ["OPENAI_API_KEY"] 32 | 33 | #Elevenlabs API key 34 | user = os.environ["user"] 35 | 36 | transcript = [] 37 | 38 | conversation_history = [] 39 | 40 | bot_response = None 41 | 42 | prompt = None 43 | 44 | filepath = None 45 | 46 | current_filepath = None 47 | 48 | voice = None 49 | 50 | app = Flask(__name__) 51 | app.config['SECRET_KEY'] = 'divine' 52 | app.config['UPLOAD_FOLDER'] = 'static' 53 | 54 | socketio = SocketIO(app) 55 | Bootstrap(app) 56 | 57 | @app.route('/') 58 | def index(): 59 | return render_template('inputpage.html') 60 | 61 | @socketio.on('connect') 62 | def handle_connect(): 63 | print('Client connected') 64 | 65 | 66 | # Upload video page 67 | @app.route('/upload', methods=['GET', 'POST']) 68 | def upload(): 69 | global transcript 70 | global prompt 71 | global bot_response 72 | global conversation_history 73 | global filepath 74 | global current_filepath 75 | 76 | 77 | if request.method == 'POST': 78 | if 'file' in request.files: 79 | file = request.files['file'] 80 | filename = file.filename 81 | filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename) 82 | file.save(filepath) 83 | 84 | # Transcribe video and generate timestamped transcript 85 | transcript = transcribe_video(filepath) 86 | print(transcript) 87 | current_filepath = filepath 88 | return render_template('audio.html', video_url=filepath, transcript=transcript) 89 | 90 | 91 | # Check if a YouTube link was provided 92 | elif 'youtube_link' in request.form: 93 | youtube_link = request.form['youtube_link'] 94 | # Use pytube to download the YouTube video 95 | 96 | yt = YouTube(youtube_link) 97 | stream = yt.streams.get_highest_resolution() 98 | file = stream.download(output_path='static', filename='my_video.mp4') 99 
| filepath = os.path.join('static', 'my_video.mp4') 100 | 101 | # Transcribe video and generate timestamped transcript 102 | transcript = transcribe_video(filepath) 103 | print(transcript) 104 | current_filepath = filepath 105 | return render_template('audio.html', video_url=filepath, transcript=transcript) 106 | 107 | 108 | elif 'audio' in request.files: 109 | file = request.files['audio'] 110 | filename = str(uuid.uuid4()) + '.' + file.filename.split('.')[-1] 111 | filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename) 112 | file.save(filepath) 113 | 114 | # Transcribe video 115 | transcript = transcribe_audio(filepath) 116 | print(transcript) 117 | current_filepath = filepath 118 | return render_template('audio.html', filename=filepath, transcript=transcript) 119 | 120 | 121 | elif 'link' in request.form: 122 | link = request.form['link'] 123 | response = requests.get(link) 124 | filename = str(uuid.uuid4()) + '.mp3' 125 | filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename) 126 | with open(filepath, 'wb') as f: 127 | f.write(response.content) 128 | 129 | # Transcribe video and generate timestamped transcript 130 | transcript = transcribe_audio(filepath) 131 | print(transcript) 132 | current_filepath = filepath 133 | return render_template('audio.html', filename=filepath, transcript=transcript) 134 | 135 | return render_template('audio.html') 136 | else: 137 | 138 | return render_template('audio.html') 139 | 140 | 141 | # Play video page 142 | @app.route('/play/') 143 | def play(video_url): 144 | # Remove the extra 'static' directory from the file path 145 | file_path = os.path.join(app.config['UPLOAD_FOLDER'], video_url.replace('static/', '', 1)) 146 | return send_file(file_path, mimetype='video/mp4') 147 | 148 | 149 | # Play audio on page 150 | @app.route('/play_file/') 151 | def play_file(filename): 152 | file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename.replace('static/', '', 1)) 153 | return send_file(file_path, mimetype='audio/mp3') 154 | 155 | 156 | 157 | #For generating the transcript with wisper 158 | def transcribe_video(filepath): 159 | 160 | video = VideoFileClip(filepath) 161 | segment_duration = 10 * 60 # seconds 162 | transcripts = [] 163 | num_segments = math.ceil(video.duration / segment_duration) 164 | 165 | # Loop through the segments 166 | for i in range(num_segments): 167 | # Calculate the start and end times for the current segment 168 | start_time = i * segment_duration 169 | end_time = min((i + 1) * segment_duration, video.duration) 170 | segment = video.subclip(start_time, end_time) 171 | segment_name = f"segment_{i+1}.mp3" 172 | segment.audio.write_audiofile(segment_name) 173 | 174 | # Pass the audio segment to WISPR for speech recognition 175 | audio = open(segment_name, "rb") 176 | transcripting = openai.Audio.transcribe("whisper-1", audio).text 177 | transcripts.append(transcripting) 178 | os.remove(segment_name) 179 | transcript = "\n".join(transcripts) 180 | 181 | return transcript 182 | 183 | 184 | #For generating of audio the transcript with wisper 185 | def transcribe_audio(filepath): 186 | 187 | audio = AudioFileClip(filepath) 188 | segment_duration = 10 * 60 # seconds 189 | transcripts = [] 190 | num_segments = math.ceil(audio.duration / segment_duration) 191 | 192 | # Loop through the segments 193 | for i in range(num_segments): 194 | start_time = i * segment_duration 195 | end_time = min((i + 1) * segment_duration, audio.duration) 196 | segment = audio.subclip(start_time, end_time) 197 | segment_name = 
f"segment_{i+1}.mp3"
198 |         segment.write_audiofile(segment_name)
199 |
200 |         # Pass the audio segment to Whisper for speech recognition
201 |         audio_file = open(segment_name, "rb")  # separate name so the clip in `audio` is not clobbered
202 |         transcripting = openai.Audio.transcribe("whisper-1", audio_file).text
203 |         transcripts.append(transcripting)
204 |         os.remove(segment_name)
205 |     transcript = "\n".join(transcripts)
206 |
207 |     return transcript
208 |
209 |
210 |
211 |
212 | # Train on the speaker's voice (this second definition overrides transcribe_video above)
213 | def transcribe_video(filepath):
214 |     global voice
215 |     video = VideoFileClip(filepath)
216 |     segment_duration = video.duration / 2
217 |     num_segments = 2
218 |
219 |     # Loop through the segments
220 |     for i in range(num_segments):
221 |         start_time = i * segment_duration
222 |         end_time = min((i + 1) * segment_duration, video.duration)
223 |         segment = video.subclip(start_time, end_time)
224 |         segment_name = f"segment_{i+1}.mp3"
225 |         segment.audio.write_audiofile(segment_name)
226 |
227 |     voice = get_audio("segment_1.mp3", "segment_2.mp3")
228 |
229 |     # Delete the segment MP3 files
230 |     os.remove("segment_1.mp3")
231 |     os.remove("segment_2.mp3")
232 |     print(f"Voice ID = {voice}")
233 |
234 |
235 | # Upload the two voice samples to ElevenLabs and return the new voice id
236 | def get_audio(voice_i, voice_ii):
237 |
238 |     add_voice_url = "https://api.elevenlabs.io/v1/voices/add"
239 |
240 |     headers = {
241 |         "Accept": "application/json",
242 |         "xi-api-key": user
243 |     }
244 |
245 |     data = {
246 |         'name': 'Voice name',
247 |         'labels': '{"accent": "American", "gender": "Female"}'
248 |     }
249 |
250 |     files = [
251 |         ('files', ('sample1.mp3', open(voice_i, 'rb'), 'audio/mpeg')),
252 |         ('files', ('sample2.mp3', open(voice_ii, 'rb'), 'audio/mpeg'))
253 |     ]
254 |
255 |     response = requests.post(add_voice_url, headers=headers, data=data, files=files)
256 |     voice_id = response.json()["voice_id"]
257 |
258 |     return voice_id
259 |
260 |
261 |
262 |
263 | # OpenAI for the chat conversation:
264 | nltk.download('punkt')
265 |
266 | @socketio.on('user_input')
267 |
268 | def handle_conversation(user_input):
269 |
270 |     print(f"Voice ID 2 = {voice}")
271 |
272 |     global bot_response
273 |
274 |
275 |     if len(word_tokenize(transcript)) <= 3000:
276 |
277 |         print("Token count less = ", len(word_tokenize(str(transcript))))
278 |         bot_response = generate_response(transcript, user_input)
279 |         print(f"less than 3000 tokens = {bot_response}\n")
280 |
281 |     else:
282 |
283 |         print("Token count more = ", len(word_tokenize(transcript)))
284 |         chunk_size = 3000
285 |         chunks = []
286 |         sentences = sent_tokenize(transcript)
287 |         current_chunk = ""
288 |
289 |         for sentence in sentences:
290 |             tokens = nltk.word_tokenize(sentence)
291 |
292 |             if len(current_chunk.split()) + len(tokens) <= chunk_size:
293 |                 current_chunk += " " + sentence
294 |
295 |             else:
296 |                 chunks.append(current_chunk.strip())
297 |                 current_chunk = sentence
298 |
299 |                 print(f"TOKEN LENGTH OF unsent CHUNK = \n\n{len(word_tokenize(str(current_chunk.strip())))}\n\n\n")
300 |
301 |         if current_chunk:
302 |             chunks.append(current_chunk.strip())
303 |
304 |
305 |         responses = []
306 |         for chunk in chunks:
307 |             response = generate_response(chunk, user_input)
308 |
309 |             print(f"TOKEN LENGTH OF CHUNK = \n\n{len(word_tokenize(str(response)))}\n\n\n")
310 |
311 |             responses.append(response)
312 |
313 |         joined_response = ' '.join(responses)
314 |         bot_response = joined_response
315 |
316 |     new_audio = audio_output(bot_response, voice)
317 |
318 |     # Create a Flask response object with the mp3 data and appropriate headers
319 |     response = 
Response(new_audio, mimetype='audio/mpeg') 320 | response.headers.set('Content-Disposition', 'attachment', filename='responding.mp3') 321 | 322 | # Emit the audio data to the client-side 323 | socketio.emit('new_audio', {'data': new_audio, 'type': 'audio/mpeg'}) 324 | socketio.emit('bot_response', bot_response) 325 | 326 | 327 | 328 | 329 | #passing transcript or each chucks to chatgpt 330 | def generate_response(transcript, user_input): 331 | 332 | prompt = f"Translate {transcript} to {user_input}" 333 | 334 | completion = openai.ChatCompletion.create( 335 | model="gpt-3.5-turbo", 336 | messages=[ 337 | {"role": "system", "content": "You're a proffesional language translator"}, 338 | {"role": "user", "content": prompt} 339 | ] 340 | ) 341 | bot_first_response = completion.choices[0].message.content 342 | 343 | return bot_first_response 344 | 345 | 346 | 347 | 348 | #Eleven-labs: Text to audio for new lang 349 | def audio_output(bot_response, voice): 350 | 351 | print(voice) 352 | 353 | CHUNK_SIZE = 1024 354 | 355 | 356 | url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice}/stream" 357 | 358 | headers = { 359 | "Accept": "audio/mpeg", 360 | "Content-Type": "application/json", 361 | "xi-api-key": user 362 | } 363 | 364 | data = { 365 | "text": bot_response, 366 | "voice_settings": { 367 | "stability": 0, 368 | "similarity_boost": 0 369 | } 370 | } 371 | response = requests.post(url, json=data, headers=headers, stream=True) 372 | 373 | 374 | audio_data = io.BytesIO() 375 | for chunk in response.iter_content(chunk_size=CHUNK_SIZE): 376 | if chunk: 377 | audio_data.write(chunk) 378 | 379 | return audio_data.getvalue() 380 | 381 | 382 | 383 | #Automatic delele 384 | @app.route('/delete_video', methods=['POST']) 385 | def delete_video(): 386 | global current_filepath 387 | 388 | # Check if a video file path has been set 389 | if os.path.exists(current_filepath): 390 | os.remove(current_filepath) 391 | print("Dead & Gone") 392 | return "Ooops! Time out" 393 | 394 | 395 | if __name__ == '__main__': 396 | app.run(host='0.0.0.0', port=5050) 397 | 398 | 399 | --------------------------------------------------------------------------------