├── .DS_Store ├── README.md ├── branch.py ├── frequency_analyzer.py └── skinnyvideos.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crystal-zq-wang/VATT/028d8f342b2740174bbf0ca34587688a1a376a7c/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VATT 2 | Video-Audio Translation Tool 3 | -------------------------------------------------------------------------------- /branch.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import cv2 4 | import moviepy 5 | import crepe 6 | import numpy 7 | import csv 8 | import srt 9 | 10 | from time import sleep 11 | from google.cloud import translate_v2, speech_v1, texttospeech 12 | from moviepy.editor import * 13 | 14 | 15 | #This function centers the text vertically in the terminal 16 | def enters(n): 17 | row= os.get_terminal_size().lines 18 | enter= (row-n)/2 19 | for i in range(int(enter)): 20 | print() 21 | 22 | #This function prints out the text centered horizontally and at the specified speed 23 | def prints(str, n, input_size=0): 24 | col= os.get_terminal_size().columns 25 | length_sen= len(str) 26 | space= (col-length_sen)/2-input_size 27 | 28 | for i in range(int(space)): 29 | print(" ", end='', flush=True) 30 | 31 | for i in str: 32 | sleep(n) 33 | print(i, end='', flush=True) 34 | 35 | 36 | class VideoTranslator: 37 | 38 | def __init__(self): 39 | 40 | #BEGIN TRANSLATION SETUP 41 | self.translate_client = translate_v2.Client() 42 | #END TRANSLATION SETUP 43 | 44 | #BEGIN GENERAL SETUP 45 | self.languages = {} 46 | for lng in self.translate_client.get_languages(): 47 | self.languages[lng['name'].lower()] = lng['language'] 48 | #END GENERAL SETUP 49 | 50 | #BEGIN AUDIO-TEXT SETUP 51 | self.audio_text_client = speech_v1.SpeechClient() 52 | self.audio_channel_count = 2 53 | self.enable_separate_recognition_per_channel = True 54 | #END AUDIO-TEXT SETUP 55 | 56 | #START TEXT-AUDIO SETUP 57 | self.text_audio_client = texttospeech.TextToSpeechClient() 58 | #END TEXT-AUDIO SETUP 59 | 60 | def translate(self, text, lng="english"): 61 | translation = self.translate_client.translate(text, 62 | target_language=self.languages[lng.lower()]) 63 | return self.edit_transcript(translation['translatedText']) 64 | 65 | def get_text(self, transcript): 66 | return transcript[0] 67 | 68 | def get_audio(self, local_file_path): 69 | with io.open(local_file_path, "rb") as f: 70 | content = f.read() 71 | return content 72 | 73 | 74 | def translate_video(self, url, native_lng, lng="english"): 75 | #video, audio = self.retrieve_video_and_audio(url) 76 | audio = {"content": self.get_audio(url)} 77 | 78 | full_transcript = self.split_transcript(self.get_transcript(audio, native_lng)) 79 | translated_transcript = [] 80 | for line in full_transcript: 81 | translated_transcript.append(self.translate(line, lng)) 82 | 83 | # print(translated_transcript) 84 | 85 | translated_audio = None 86 | for i in range(len(full_transcript)): 87 | native_line = full_transcript[i] 88 | translated_line = translated_transcript[i] 89 | #speed_factor = 4 90 | if not translated_audio: 91 | translated_audio = self.text_to_audio(translated_line, lng, speed_factor=1) 92 | else: 93 | translated_audio = translated_audio + self.text_to_audio(translated_line, lng, speed_factor=1) 94 | 95 | 96 | with open("cache.mp3", "wb") as out: 97 | out.write(translated_audio) 98 | 99 | cache_audio = AudioFileClip("cache.mp3") 100 | new_duration = cache_audio.duration 101 | old_duration = videoclip.duration 102 | factor = new_duration/old_duration 103 | if factor > 4: 104 | factor = 4 105 | if factor < 0.25: 106 | factor = 0.25 107 | 108 | 109 | translated_audio2 = None 110 | for i in range(len(full_transcript)): 111 | native_line = full_transcript[i] 112 | translated_line = translated_transcript[i] 113 | speed_factor = factor 114 | if not translated_audio2: 115 | translated_audio2 = self.text_to_audio(translated_line, lng, speed_factor=factor) 116 | else: 117 | translated_audio2 = translated_audio2 + self.text_to_audio(translated_line, lng, speed_factor=factor) 118 | 119 | os.system("clear") 120 | 121 | display_text= self.get_text(full_transcript) 122 | display_text= "This is a transcript of the input video file in the language " + s1.upper() + " provided by the user. \n \n" + display_text+ "\n \n \n" 123 | 124 | 125 | display_text2= self.get_text(translated_transcript) 126 | display_text2= "This is a copy of the transcript in the language " +s2.upper()+ " auto-generated by google cloud api. \n \n" + display_text2 127 | 128 | display_text= display_text+ display_text2 129 | 130 | with open("transcript.txt", "w") as out: 131 | out.write(display_text) 132 | 133 | with open("output.mp3", "wb") as out: 134 | out.write(translated_audio2) 135 | 136 | audio_background = AudioFileClip("output.mp3") 137 | 138 | final_audio = CompositeAudioClip([audio_background]) 139 | final_clip = videoclip.set_audio(audio_background) 140 | final_clip.write_videofile("result.mp4") 141 | 142 | os.system("clear") 143 | 144 | clip_resized = final_clip.fx(vfx.resize, newsize=(h, w)) 145 | 146 | os.system("clear") 147 | 148 | clip_resized.write_videofile("result.mp4") 149 | 150 | os.system("clear") 151 | 152 | def edit_transcript(self, transcript): 153 | return transcript.replace("'", "'") 154 | 155 | def split_transcript(self, transcript): 156 | return [transcript] 157 | 158 | def get_transcript(self, audio, native_lng): 159 | config = { 160 | "audio_channel_count": self.audio_channel_count, 161 | "enable_separate_recognition_per_channel": self.enable_separate_recognition_per_channel, 162 | "language_code": self.languages[native_lng], 163 | } 164 | response = self.audio_text_client.recognize(config, audio) 165 | for result in response.results: 166 | alternative = result.alternatives[0] 167 | 168 | return format(alternative.transcript) 169 | 170 | def get_speed_factor(self, native_line, translated_line): # incomplete 171 | return len(translated_line)/len(native_line) 172 | 173 | def determine_gender(self, frequency): 174 | return frequency > 170 and "female" or "male" 175 | 176 | def text_to_audio(self, text, lng, speed_factor, gender=None): 177 | 178 | gender = self.determine_gender(frequency) 179 | 180 | if gender == "female": 181 | ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE 182 | elif gender == "male": 183 | ssml_gender=texttospeech.enums.SsmlVoiceGender.MALE 184 | else: 185 | ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL 186 | 187 | # print(speed_factor) 188 | synthesis_input = texttospeech.types.SynthesisInput(text=text) 189 | voice = texttospeech.types.VoiceSelectionParams(language_code=self.languages[lng], ssml_gender=ssml_gender) 190 | audio_config = texttospeech.types.AudioConfig(audio_encoding=texttospeech.enums.AudioEncoding.MP3, speaking_rate=speed_factor, pitch=frequency/100) 191 | response = self.text_audio_client.synthesize_speech(synthesis_input, voice, audio_config) 192 | return response.audio_content 193 | 194 | 195 | os.system("clear") 196 | 197 | # Takes in the name of the input file, which is a .mov file 198 | enters(1) 199 | prints("Specify Filename: ", 0.01, 10) 200 | s= input() 201 | 202 | # Getting the dimensions of the original video 203 | vid = cv2.VideoCapture(s) 204 | h = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT)) 205 | w = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)) 206 | 207 | # Making an audiofile out of the the video file, named "trying.wav" 208 | videoclip = VideoFileClip(s) 209 | audioclip = videoclip.audio 210 | audioclip.write_audiofile("trying.wav", verbose=False) 211 | os.system("clear") 212 | 213 | # taking inputs of native language and changin language 214 | enters(1) 215 | prints("Input Language: ", 0.01, 8) 216 | s1 = str(input()).strip() 217 | 218 | print() 219 | print() 220 | 221 | prints("Output Language: ", 0.01, 8) 222 | s2 = str(input()).strip() 223 | 224 | #finsing average frequency of the input audio 225 | lst = list() 226 | os.system("clear") 227 | os.system("crepe trying.wav --step-size 100") 228 | 229 | with open ('trying.f0.csv',newline='') as csvfile: 230 | data = csv.reader(csvfile, delimiter=',') 231 | for row in data: 232 | lst.append(row[1]) 233 | 234 | lst = lst[1:] 235 | lst = [float(s) for s in lst] 236 | lst.sort() 237 | length = len(lst) 238 | 239 | DELTA = 0.1 240 | lst = lst[int(length*DELTA):-int(length*DELTA)] 241 | frequency = sum(lst) / len(lst) 242 | 243 | os.system("clear") 244 | 245 | """ Ideas: Randomized language """ 246 | 247 | vt = VideoTranslator() 248 | os.system("clear") 249 | vt.translate_video("trying.wav", s1, s2) 250 | os.system("clear") 251 | os.remove("trying.wav") 252 | os.remove("cache.mp3") 253 | os.remove("output.mp3") 254 | os.remove("trying.f0.csv") 255 | os.system("clear") 256 | 257 | enters(1) 258 | prints("Task completed. Please check directory for new video.", 0.02) 259 | for _ in range(5): 260 | print() -------------------------------------------------------------------------------- /frequency_analyzer.py: -------------------------------------------------------------------------------- 1 | import crepe 2 | import numpy 3 | import csv 4 | import os 5 | 6 | lst = list() 7 | filename = 'Recording' 8 | cmd = 'crepe ' + filename + '.wav' 9 | os.system(cmd) 10 | 11 | 12 | with open (filename+'.f0.csv',newline='') as csvfile: 13 | data = csv.reader(csvfile, delimiter=',') 14 | for row in data: 15 | lst.append(row[1]) 16 | 17 | lst = lst[1:] 18 | lst = [float(s) for s in lst] 19 | # print(type(lst[1])) 20 | 21 | lst.sort() 22 | 23 | length = len(lst) 24 | 25 | DELTA = 0.1 26 | assert DELTA < 0.5, "not gonna work bro" 27 | 28 | lst = lst[int(length*DELTA):-int(length*DELTA)] 29 | average = sum(lst) / len(lst) 30 | print(average) 31 | 32 | 33 | -------------------------------------------------------------------------------- /skinnyvideos.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import crepe 4 | 5 | from google.cloud import translate_v2, speech_v1, texttospeech 6 | from moviepy.editor import * 7 | from scipy.io import wavfile 8 | 9 | class VideoTranslator: 10 | 11 | def __init__(self): 12 | 13 | #BEGIN TRANSLATION SETUP 14 | self.translate_client = translate_v2.Client() 15 | #END TRANSLATION SETUP 16 | 17 | #BEGIN GENERAL SETUP 18 | self.languages = {} 19 | for lng in self.translate_client.get_languages(): 20 | self.languages[lng['name'].lower()] = lng['language'] 21 | #END GENERAL SETUP 22 | 23 | #BEGIN AUDIO-TEXT SETUP 24 | self.audio_text_client = speech_v1.SpeechClient() 25 | self.audio_channel_count = 2 26 | self.enable_separate_recognition_per_channel = True 27 | #END AUDIO-TEXT SETUP 28 | 29 | #START TEXT-AUDIO SETUP 30 | self.text_audio_client = texttospeech.TextToSpeechClient() 31 | #END TEXT-AUDIO SETUP 32 | 33 | def translate(self, text, lng="english"): 34 | translation = self.translate_client.translate(text, 35 | target_language=self.languages[lng.lower()]) 36 | return self.edit_transcript(translation['translatedText']) 37 | 38 | def get_audio(self, local_file_path): 39 | with io.open(local_file_path, "rb") as f: 40 | content = f.read() 41 | return content 42 | 43 | 44 | def translate_video(self, url, native_lng, lng="english"): 45 | #video, audio = self.retrieve_video_and_audio(url) 46 | audio = {"content": self.get_audio(url)} 47 | 48 | full_transcript = self.split_transcript(self.get_transcript(audio, native_lng)) 49 | translated_transcript = [] 50 | for line in full_transcript: 51 | translated_transcript.append(self.translate(line, lng)) 52 | 53 | translated_audio = None 54 | for i in range(len(full_transcript)): 55 | native_line = full_transcript[i] 56 | translated_line = translated_transcript[i] 57 | speed_factor = self.get_speed_factor(native_line, translated_line) 58 | if not translated_audio: 59 | translated_audio = self.text_to_audio(translated_line, lng, speed_factor=speed_factor) 60 | else: 61 | translated_audio = translated_audio + self.text_to_audio(translated_line, lng, speed_factor=speed_factor) 62 | 63 | 64 | with open("output.mp3", "wb") as out: 65 | out.write(translated_audio) 66 | 67 | audio_background = AudioFileClip("output.mp3") 68 | final_audio = CompositeAudioClip([audio_background]) 69 | final_clip = videoclip.set_audio(final_audio) 70 | final_clip.write_videofile("result.mp4") 71 | 72 | # new_clip = videoclip.set_audio(AudioFileClip("output.mp3")) 73 | # videoclip.write_videofile("output.mp4") 74 | 75 | #videoclip.write_videofile("output.mp4", audio="trying.mp3") 76 | #return self.splice_video_and_audio(video, translated_audio) 77 | 78 | def edit_transcript(self, transcript): 79 | return transcript.replace("'", "'") 80 | 81 | def split_transcript(self, transcript): 82 | return transcript.split(' ') 83 | 84 | 85 | def retrieve_video_and_audio(self, url): #ARUSHI HAS THIS CODE 86 | return None 87 | 88 | def get_transcript(self, audio, native_lng): #CRYSTAL HAS THIS CODE 89 | config = { 90 | "audio_channel_count": self.audio_channel_count, 91 | "enable_separate_recognition_per_channel": self.enable_separate_recognition_per_channel, 92 | "language_code": self.languages[native_lng], 93 | } 94 | response = self.audio_text_client.recognize(config, audio) 95 | for result in response.results: 96 | alternative = result.alternatives[0] 97 | 98 | return format(alternative.transcript) 99 | 100 | def get_speed_factor(self, native_line, translated_line): #CAN EDIT THIS LATER, FUNCTIONAL FOR NOW 101 | return len(translated_line)/len(native_line) 102 | 103 | def determine_gender(frequency): 104 | return frequency > 170 and "female" or "male" 105 | 106 | def text_to_audio(self, text, lng, speed_factor=1, gender=None): #CRYSTAL IS WORKING ON THIS CODE 107 | 108 | gender = determine_gender(frequency) 109 | if gender == "female": 110 | ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE 111 | elif gender == "male": 112 | ssml_gender=texttospeech.enums.SsmlVoiceGender.MALE 113 | else: 114 | ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL 115 | 116 | synthesis_input = texttospeech.types.SynthesisInput(text=text) 117 | voice = texttospeech.types.VoiceSelectionParams(language_code=self.languages[lng], ssml_gender=ssml_gender) 118 | audio_config = texttospeech.types.AudioConfig(audio_encoding=texttospeech.enums.AudioEncoding.MP3) 119 | response = self.text_audio_client.synthesize_speech(synthesis_input, voice, audio_config) 120 | return response.audio_content 121 | 122 | s = input("Specify Filename: ") 123 | 124 | sr, audio = wavfile.read('nani.mov') 125 | time, frequency, confidence, activation = crepe.predict(audio, sr, viterbi= True) 126 | 127 | """ 128 | if the frequency is between upto 170, then it is a male voice. 129 | if the frequency is between 171 and infinity, then it is a female voice. 130 | """ 131 | 132 | videoclip = VideoFileClip(s) 133 | audioclip = videoclip.audio 134 | audioclip.write_audiofile("trying.wav", verbose=True) 135 | 136 | s1 = str(input("Input Language: ")) 137 | s2 = str(input("Output Language: ")) 138 | 139 | vt = VideoTranslator() 140 | vt.translate_video("trying.wav", s1, s2) 141 | 142 | os.remove("trying.wav") 143 | --------------------------------------------------------------------------------