├── .DS_Store
├── README.md
├── branch.py
├── frequency_analyzer.py
└── skinnyvideos.py


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crystal-zq-wang/VATT/028d8f342b2740174bbf0ca34587688a1a376a7c/.DS_Store


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # VATT
2 | Video-Audio Translation Tool
3 | 


--------------------------------------------------------------------------------
/branch.py:
--------------------------------------------------------------------------------
  1 | import io
  2 | import os
  3 | import cv2
  4 | import moviepy
  5 | import crepe
  6 | import numpy
  7 | import csv
  8 | import srt
  9 | 
 10 | from time import sleep
 11 | from google.cloud import translate_v2, speech_v1, texttospeech
 12 | from moviepy.editor import *
 13 | 
 14 | 
 15 | #This function centers the text vertically in the terminal
 16 | def enters(n):
 17 |     row= os.get_terminal_size().lines
 18 |     enter= (row-n)/2
 19 |     for i in range(int(enter)):
 20 |         print()
 21 | 
 22 | #This function prints out the text centered horizontally and at the specified speed
 23 | def prints(str, n, input_size=0):
 24 |     col= os.get_terminal_size().columns
 25 |     length_sen= len(str)
 26 |     space= (col-length_sen)/2-input_size
 27 | 
 28 |     for i in range(int(space)):
 29 |         print(" ", end='', flush=True)
 30 |         
 31 |     for i in str:
 32 |         sleep(n)
 33 |         print(i, end='', flush=True)
 34 | 
 35 | 
 36 | class VideoTranslator:
 37 | 
 38 |     def __init__(self):
 39 | 
 40 |         #BEGIN TRANSLATION SETUP
 41 |         self.translate_client = translate_v2.Client()
 42 |         #END TRANSLATION SETUP
 43 | 
 44 |         #BEGIN GENERAL SETUP
 45 |         self.languages = {}
 46 |         for lng in self.translate_client.get_languages():
 47 |             self.languages[lng['name'].lower()] = lng['language']
 48 |         #END GENERAL SETUP
 49 | 
 50 |         #BEGIN AUDIO-TEXT SETUP
 51 |         self.audio_text_client = speech_v1.SpeechClient()
 52 |         self.audio_channel_count = 2
 53 |         self.enable_separate_recognition_per_channel = True
 54 |         #END AUDIO-TEXT SETUP
 55 | 
 56 |         #START TEXT-AUDIO SETUP
 57 |         self.text_audio_client = texttospeech.TextToSpeechClient()
 58 |         #END TEXT-AUDIO SETUP
 59 | 
 60 |     def translate(self, text, lng="english"):
 61 |         translation = self.translate_client.translate(text,
 62 |             target_language=self.languages[lng.lower()])
 63 |         return self.edit_transcript(translation['translatedText'])
 64 | 
 65 |     def get_text(self, transcript):
 66 |         return transcript[0] 
 67 |         
 68 |     def get_audio(self, local_file_path):
 69 |         with io.open(local_file_path, "rb") as f:
 70 |             content = f.read()
 71 |         return content
 72 | 
 73 | 
 74 |     def translate_video(self, url, native_lng, lng="english"):
 75 |         #video, audio = self.retrieve_video_and_audio(url)
 76 |         audio = {"content": self.get_audio(url)}
 77 |         
 78 |         full_transcript = self.split_transcript(self.get_transcript(audio, native_lng))
 79 |         translated_transcript = []
 80 |         for line in full_transcript:
 81 |             translated_transcript.append(self.translate(line, lng))
 82 | 
 83 |         # print(translated_transcript)
 84 | 
 85 |         translated_audio = None 
 86 |         for i in range(len(full_transcript)):
 87 |             native_line = full_transcript[i]
 88 |             translated_line = translated_transcript[i]
 89 |             #speed_factor = 4
 90 |             if not translated_audio:
 91 |                 translated_audio = self.text_to_audio(translated_line, lng, speed_factor=1)
 92 |             else:
 93 |                 translated_audio = translated_audio + self.text_to_audio(translated_line, lng, speed_factor=1)
 94 | 
 95 | 
 96 |         with open("cache.mp3", "wb") as out:
 97 |             out.write(translated_audio)
 98 | 
 99 |         cache_audio = AudioFileClip("cache.mp3")
100 |         new_duration = cache_audio.duration
101 |         old_duration = videoclip.duration
102 |         factor = new_duration/old_duration
103 |         if factor > 4:
104 |             factor = 4
105 |         if factor < 0.25:
106 |             factor = 0.25
107 | 
108 | 
109 |         translated_audio2 = None
110 |         for i in range(len(full_transcript)):
111 |             native_line = full_transcript[i]
112 |             translated_line = translated_transcript[i]
113 |             speed_factor = factor
114 |             if not translated_audio2:
115 |                 translated_audio2 = self.text_to_audio(translated_line, lng, speed_factor=factor)
116 |             else:
117 |                 translated_audio2 = translated_audio2 + self.text_to_audio(translated_line, lng, speed_factor=factor)
118 | 
119 |         os.system("clear")
120 | 
121 |         display_text= self.get_text(full_transcript)
122 |         display_text= "This is a transcript of the input video file in the language " + s1.upper() + " provided by the user. \n \n" + display_text+ "\n \n \n"
123 | 
124 | 
125 |         display_text2= self.get_text(translated_transcript)
126 |         display_text2= "This is a copy of the transcript in the language " +s2.upper()+ " auto-generated by google cloud api. \n \n" + display_text2
127 | 
128 |         display_text= display_text+ display_text2
129 | 
130 |         with open("transcript.txt", "w") as out:
131 |             out.write(display_text)
132 | 
133 |         with open("output.mp3", "wb") as out:
134 |             out.write(translated_audio2)
135 | 
136 |         audio_background = AudioFileClip("output.mp3")
137 | 
138 |         final_audio = CompositeAudioClip([audio_background])
139 |         final_clip = videoclip.set_audio(audio_background)
140 |         final_clip.write_videofile("result.mp4")
141 | 
142 |         os.system("clear")
143 | 
144 |         clip_resized = final_clip.fx(vfx.resize, newsize=(h, w))
145 | 
146 |         os.system("clear")
147 | 
148 |         clip_resized.write_videofile("result.mp4")
149 | 
150 |         os.system("clear")
151 | 
152 |     def edit_transcript(self, transcript):
153 |         return transcript.replace("&#39;", "'")
154 | 
155 |     def split_transcript(self, transcript):
156 |         return [transcript]
157 | 
158 |     def get_transcript(self, audio, native_lng):
159 |         config = {
160 |             "audio_channel_count": self.audio_channel_count,
161 |             "enable_separate_recognition_per_channel": self.enable_separate_recognition_per_channel,
162 |             "language_code": self.languages[native_lng],
163 |         }
164 |         response = self.audio_text_client.recognize(config, audio)
165 |         for result in response.results:
166 |             alternative = result.alternatives[0]
167 | 
168 |         return format(alternative.transcript)
169 | 
170 |     def get_speed_factor(self, native_line, translated_line): # incomplete
171 |         return len(translated_line)/len(native_line)
172 | 
173 |     def determine_gender(self, frequency):
174 |         return frequency > 170 and "female" or "male"
175 | 
176 |     def text_to_audio(self, text, lng, speed_factor, gender=None): 
177 |         
178 |         gender = self.determine_gender(frequency)
179 | 
180 |         if gender == "female":
181 |             ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE
182 |         elif gender == "male":
183 |             ssml_gender=texttospeech.enums.SsmlVoiceGender.MALE
184 |         else:
185 |             ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL
186 | 
187 |         # print(speed_factor)
188 |         synthesis_input = texttospeech.types.SynthesisInput(text=text)
189 |         voice = texttospeech.types.VoiceSelectionParams(language_code=self.languages[lng], ssml_gender=ssml_gender)
190 |         audio_config = texttospeech.types.AudioConfig(audio_encoding=texttospeech.enums.AudioEncoding.MP3, speaking_rate=speed_factor, pitch=frequency/100)
191 |         response = self.text_audio_client.synthesize_speech(synthesis_input, voice, audio_config)
192 |         return response.audio_content
193 | 
194 | 
195 | os.system("clear")
196 | 
197 | # Takes in the name of the input file, which is a .mov file
198 | enters(1)
199 | prints("Specify Filename: ", 0.01, 10)
200 | s= input()
201 | 
202 | # Getting the dimensions of the original video
203 | vid = cv2.VideoCapture(s)
204 | h = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
205 | w = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
206 | 
207 | # Making an audiofile out of the the video file, named "trying.wav"
208 | videoclip = VideoFileClip(s)
209 | audioclip = videoclip.audio
210 | audioclip.write_audiofile("trying.wav", verbose=False)
211 | os.system("clear")
212 | 
213 | # taking inputs of native language and changin language
214 | enters(1)
215 | prints("Input Language: ", 0.01, 8)
216 | s1 = str(input()).strip()
217 | 
218 | print()
219 | print()
220 | 
221 | prints("Output Language: ", 0.01, 8)
222 | s2 = str(input()).strip()
223 | 
224 | #finsing average frequency of the input audio
225 | lst = list()
226 | os.system("clear")
227 | os.system("crepe trying.wav --step-size 100")
228 | 
229 | with open ('trying.f0.csv',newline='') as csvfile:
230 |     data = csv.reader(csvfile, delimiter=',')
231 |     for row in data:
232 |         lst.append(row[1])
233 | 
234 | lst = lst[1:]
235 | lst = [float(s) for s in lst]
236 | lst.sort()
237 | length = len(lst)
238 | 
239 | DELTA = 0.1
240 | lst = lst[int(length*DELTA):-int(length*DELTA)]
241 | frequency = sum(lst) / len(lst)
242 | 
243 | os.system("clear")
244 | 
245 | """ Ideas: Randomized language """
246 | 
247 | vt = VideoTranslator()
248 | os.system("clear")
249 | vt.translate_video("trying.wav", s1, s2)
250 | os.system("clear")
251 | os.remove("trying.wav")
252 | os.remove("cache.mp3")
253 | os.remove("output.mp3")
254 | os.remove("trying.f0.csv")
255 | os.system("clear")
256 | 
257 | enters(1)
258 | prints("Task completed. Please check directory for new video.", 0.02)
259 | for _ in range(5):
260 |     print()


--------------------------------------------------------------------------------
/frequency_analyzer.py:
--------------------------------------------------------------------------------
 1 | import crepe
 2 | import numpy
 3 | import csv
 4 | import os
 5 | 
 6 | lst = list()
 7 | filename = 'Recording'
 8 | cmd = 'crepe ' + filename + '.wav'
 9 | os.system(cmd)
10 | 
11 | 
12 | with open (filename+'.f0.csv',newline='') as csvfile:
13 |     data = csv.reader(csvfile, delimiter=',')
14 |     for row in data:
15 |         lst.append(row[1])
16 | 
17 | lst = lst[1:]
18 | lst = [float(s) for s in lst]
19 | # print(type(lst[1]))
20 | 
21 | lst.sort()
22 | 
23 | length = len(lst)
24 | 
25 | DELTA = 0.1
26 | assert DELTA < 0.5, "not gonna work bro"
27 | 
28 | lst = lst[int(length*DELTA):-int(length*DELTA)]
29 | average = sum(lst) / len(lst)
30 | print(average)
31 | 
32 |     
33 | 


--------------------------------------------------------------------------------
/skinnyvideos.py:
--------------------------------------------------------------------------------
  1 | import io
  2 | import os
  3 | import crepe
  4 | 
  5 | from google.cloud import translate_v2, speech_v1, texttospeech
  6 | from moviepy.editor import *
  7 | from scipy.io import wavfile
  8 | 
  9 | class VideoTranslator:
 10 | 
 11 |     def __init__(self):
 12 | 
 13 |         #BEGIN TRANSLATION SETUP
 14 |         self.translate_client = translate_v2.Client()
 15 |         #END TRANSLATION SETUP
 16 | 
 17 |         #BEGIN GENERAL SETUP
 18 |         self.languages = {}
 19 |         for lng in self.translate_client.get_languages():
 20 |             self.languages[lng['name'].lower()] = lng['language']
 21 |         #END GENERAL SETUP
 22 | 
 23 |         #BEGIN AUDIO-TEXT SETUP
 24 |         self.audio_text_client = speech_v1.SpeechClient()
 25 |         self.audio_channel_count = 2
 26 |         self.enable_separate_recognition_per_channel = True
 27 |         #END AUDIO-TEXT SETUP
 28 | 
 29 |         #START TEXT-AUDIO SETUP
 30 |         self.text_audio_client = texttospeech.TextToSpeechClient()
 31 |         #END TEXT-AUDIO SETUP
 32 | 
 33 |     def translate(self, text, lng="english"):
 34 |         translation = self.translate_client.translate(text,
 35 |             target_language=self.languages[lng.lower()])
 36 |         return self.edit_transcript(translation['translatedText'])
 37 |         
 38 |     def get_audio(self, local_file_path):
 39 |         with io.open(local_file_path, "rb") as f:
 40 |             content = f.read()
 41 |         return content
 42 | 
 43 | 
 44 |     def translate_video(self, url, native_lng, lng="english"):
 45 |         #video, audio = self.retrieve_video_and_audio(url)
 46 |         audio = {"content": self.get_audio(url)}
 47 |         
 48 |         full_transcript = self.split_transcript(self.get_transcript(audio, native_lng))
 49 |         translated_transcript = []
 50 |         for line in full_transcript:
 51 |             translated_transcript.append(self.translate(line, lng))
 52 | 
 53 |         translated_audio = None 
 54 |         for i in range(len(full_transcript)):
 55 |             native_line = full_transcript[i]
 56 |             translated_line = translated_transcript[i]
 57 |             speed_factor = self.get_speed_factor(native_line, translated_line)
 58 |             if not translated_audio:
 59 |                 translated_audio = self.text_to_audio(translated_line, lng, speed_factor=speed_factor)
 60 |             else:
 61 |                 translated_audio = translated_audio + self.text_to_audio(translated_line, lng, speed_factor=speed_factor)
 62 | 
 63 | 
 64 |         with open("output.mp3", "wb") as out:
 65 |             out.write(translated_audio)
 66 | 
 67 |         audio_background = AudioFileClip("output.mp3")
 68 |         final_audio = CompositeAudioClip([audio_background])
 69 |         final_clip = videoclip.set_audio(final_audio)
 70 |         final_clip.write_videofile("result.mp4")
 71 | 
 72 |         # new_clip = videoclip.set_audio(AudioFileClip("output.mp3"))
 73 |         # videoclip.write_videofile("output.mp4")
 74 | 
 75 |         #videoclip.write_videofile("output.mp4", audio="trying.mp3")
 76 |         #return self.splice_video_and_audio(video, translated_audio)
 77 | 
 78 |     def edit_transcript(self, transcript):
 79 |         return transcript.replace("&#39;", "'")
 80 | 
 81 |     def split_transcript(self, transcript):
 82 |         return transcript.split(' ')
 83 | 
 84 | 
 85 |     def retrieve_video_and_audio(self, url): #ARUSHI HAS THIS CODE
 86 |         return None
 87 | 
 88 |     def get_transcript(self, audio, native_lng): #CRYSTAL HAS THIS CODE
 89 |         config = {
 90 |             "audio_channel_count": self.audio_channel_count,
 91 |             "enable_separate_recognition_per_channel": self.enable_separate_recognition_per_channel,
 92 |             "language_code": self.languages[native_lng],
 93 |         }
 94 |         response = self.audio_text_client.recognize(config, audio)
 95 |         for result in response.results:
 96 |             alternative = result.alternatives[0]
 97 | 
 98 |         return format(alternative.transcript)
 99 | 
100 |     def get_speed_factor(self, native_line, translated_line): #CAN EDIT THIS LATER, FUNCTIONAL FOR NOW
101 |         return len(translated_line)/len(native_line)
102 | 
103 |     def determine_gender(frequency):
104 |         return frequency > 170 and "female" or "male"
105 | 
106 |     def text_to_audio(self, text, lng, speed_factor=1, gender=None): #CRYSTAL IS WORKING ON THIS CODE
107 |         
108 |         gender = determine_gender(frequency)
109 |         if gender == "female":
110 |             ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE
111 |         elif gender == "male":
112 |             ssml_gender=texttospeech.enums.SsmlVoiceGender.MALE
113 |         else:
114 |             ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL
115 | 
116 |         synthesis_input = texttospeech.types.SynthesisInput(text=text)
117 |         voice = texttospeech.types.VoiceSelectionParams(language_code=self.languages[lng], ssml_gender=ssml_gender)
118 |         audio_config = texttospeech.types.AudioConfig(audio_encoding=texttospeech.enums.AudioEncoding.MP3)
119 |         response = self.text_audio_client.synthesize_speech(synthesis_input, voice, audio_config)
120 |         return response.audio_content
121 | 
122 | s = input("Specify Filename: ")
123 | 
124 | sr, audio = wavfile.read('nani.mov')
125 | time, frequency, confidence, activation = crepe.predict(audio, sr, viterbi= True)
126 | 
127 | """
128 | if the frequency is between upto 170, then it is a male voice.
129 | if the frequency is between 171 and infinity, then it is a female voice.
130 | """
131 | 
132 | videoclip = VideoFileClip(s)
133 | audioclip = videoclip.audio
134 | audioclip.write_audiofile("trying.wav", verbose=True)
135 | 
136 | s1 = str(input("Input Language: "))
137 | s2 = str(input("Output Language: "))
138 | 
139 | vt = VideoTranslator()
140 | vt.translate_video("trying.wav", s1, s2)
141 | 
142 | os.remove("trying.wav")
143 | 


--------------------------------------------------------------------------------