"""Compress and transcode videos with ffmpeg (through the ffmpeg-python bindings).

Repository layout::

    ├── README.MD
    ├── compressor.py
    ├── requirements.txt
    └── srt_generator.py

README
------
compressor.py is a suite of functions leveraging the ffmpeg framework for
compressing and transcoding videos.

srt_generator.py generates the subtitle file (.srt) for a video using the
Google Speech API.

Install the following before getting started:

    Install Python 3.6
    Install ffmpeg (provides a free CLI for multimedia editing)
    Install Google Cloud services

Additionally, I use the following terminal commands to rescale videos and
images. For a fixed width and height:

    ffmpeg -i input.avi -vf scale="720:480" output.avi

If you want to retain the aspect ratio, give the height as -1 and it will be
resized automatically based on the width:

    ffmpeg -i input.avi -vf scale="720:-1" output.avi
"""

import os

import ffmpeg

# Bitrate limits used when planning a compression, all in bits per second.
# Video quality settings, for reference:
#   SD:      1,000 kbps video, 128 kbps audio
#   HD:      2,000 kbps video, 128 kbps audio (recommended for Vidiren)
#   Full HD: 4,500 kbps video, 256 kbps audio
TOTAL_BITRATE_LOWER_BOUND = 100 * 1000
MIN_AUDIO_BITRATE = 64 * 1000
MAX_AUDIO_BITRATE = 128 * 1000
MIN_VIDEO_BITRATE = 1000 * 1000

# Scaling factor carried over unchanged from the original size <-> bitrate
# formulas (numerically equal to 1024**3 / 1000**3).
_SIZE_FACTOR = 1.073741824

video_path = "myvideo.mp4"


def _first_stream(probe, codec_type):
    """Return the first stream of ``codec_type`` in an ffprobe result, or None."""
    return next(
        (s for s in probe.get("streams", []) if s.get("codec_type") == codec_type),
        None,
    )


def _frame_rate(video_stream, duration):
    """Return the frames per second of ``video_stream``, or None if unknown.

    The frame count divided by the duration is preferred (as in the original
    implementation); when the stream reports no frame count, the
    ``avg_frame_rate`` / ``r_frame_rate`` ratio strings are used instead.
    """
    nb_frames = video_stream.get("nb_frames")
    if nb_frames is not None and duration > 0:
        try:
            return float(nb_frames) / duration
        except ValueError:
            pass  # not a number; fall through to the ratio fields

    ratio = video_stream.get("avg_frame_rate") or video_stream.get("r_frame_rate")
    if not ratio:
        return None
    numerator, _, denominator = str(ratio).partition("/")
    try:
        numerator = float(numerator)
        denominator = float(denominator) if denominator else 1.0
    except ValueError:
        return None
    return numerator / denominator if denominator else None


def get_video_info(video_path):
    """Probe ``video_path`` with ffprobe and collect its key properties.

    Returns a dict with the keys ``probe``, ``video_path``, ``size``,
    ``width``, ``height``, ``duration``, ``video_bitrate``, ``fps``,
    ``audio_bitrate``, ``video_codec``, ``audio_codec`` and
    ``audio_channels``. The three audio values are None when the file has no
    audio stream; ``audio_bitrate`` and ``fps`` are also None when the probe
    does not report them.

    Raises:
        ValueError: if the file contains no video stream.
        ffmpeg.Error: if ffprobe fails.
    """
    probe = ffmpeg.probe(video_path)
    container = probe["format"]
    duration = float(container["duration"])

    # Resolution, codec and frame rate all come from the same (first) video
    # stream. Audio streams can carry "nb_frames" too, so the frame rate must
    # not be taken from an arbitrary stream.
    video_stream = _first_stream(probe, "video")
    if video_stream is None:
        raise ValueError("No video stream found in {!r}".format(video_path))

    # If a video has no sound, it won't have audio data values.
    audio_stream = _first_stream(probe, "audio")
    if audio_stream is None:
        audio_bitrate = None
        audio_channels = None
        audio_codec = None
    else:
        raw_audio_bitrate = audio_stream.get("bit_rate")
        audio_bitrate = (
            float(raw_audio_bitrate) if raw_audio_bitrate is not None else None
        )
        audio_channels = audio_stream.get("channels")
        audio_codec = audio_stream.get("codec_name")

    video_info = {
        "probe": probe,
        "video_path": video_path,
        "size": container["size"],
        "width": video_stream.get("width"),
        "height": video_stream.get("height"),
        "duration": duration,
        "video_bitrate": container["bit_rate"],
        "fps": _frame_rate(video_stream, duration),
        "audio_bitrate": audio_bitrate,
        "video_codec": video_stream.get("codec_name"),
        "audio_codec": audio_codec,
        "audio_channels": audio_channels,
    }
    # TODO: extract the caption file if it exists, e.g.
    #   ffmpeg -i my_file.mkv -f webvtt outfile
    return video_info


def get_precompression_settings(video_info, target_size):
    """Derive target bitrates from a size budget.

    Args:
        video_info: dict as returned by ``get_video_info``; ``duration`` and
            ``audio_bitrate`` are read (``audio_codec`` too, if present).
        target_size: size budget in MB per minute of video.

    Returns:
        dict with ``target_total_bitrate``, ``video_bitrate`` and
        ``audio_bitrate`` in bps, and ``min_size`` in KB. ``audio_bitrate`` is
        0 when the source has no audio stream.

    Raises:
        ValueError: if the duration is not positive.
        SystemExit: if the budget is too small for an acceptable result.
    """
    duration = video_info["duration"]
    if duration <= 0:
        raise ValueError("Video duration must be positive, got {}".format(duration))

    size_upper_bound = target_size * 1000 * (duration / 60)  # max video size in KB

    # Target total bitrate, in bps.
    target_total_bitrate = (size_upper_bound * 1024 * 8) / (_SIZE_FACTOR * duration)
    if target_total_bitrate < TOTAL_BITRATE_LOWER_BOUND:
        raise SystemExit("Bitrate is extremely low! Stop compress!")

    # Minimum size, in KB.
    min_size = (
        (MIN_AUDIO_BITRATE + MIN_VIDEO_BITRATE) * (_SIZE_FACTOR * duration) / (8 * 1024)
    )
    if size_upper_bound < min_size:
        raise SystemExit(
            "Quality not good! Recommended minimum size: {:,} KB.".format(
                int(min_size)
            )
        )

    # Target audio bitrate, in bps.
    audio_bitrate = video_info["audio_bitrate"]
    if audio_bitrate is None:
        # No audio stream -> spend nothing on audio. Audio present but with an
        # unreported bitrate -> plan for the cap and let the rules below trim it.
        has_audio = video_info.get("audio_codec") is not None
        audio_bitrate = MAX_AUDIO_BITRATE if has_audio else 0
    if 10 * audio_bitrate > target_total_bitrate:
        # Audio may use at most a tenth of the total budget...
        audio_bitrate = target_total_bitrate / 10
        # ...but is not squeezed below the minimum while the budget allows it.
        if audio_bitrate < MIN_AUDIO_BITRATE < target_total_bitrate:
            audio_bitrate = MIN_AUDIO_BITRATE
    if audio_bitrate > MAX_AUDIO_BITRATE:
        audio_bitrate = MAX_AUDIO_BITRATE

    # Target video bitrate, in bps.
    video_bitrate = target_total_bitrate - audio_bitrate
    if video_bitrate < 1000:
        print("Bitrate {} is extremely low! Stop compress.".format(video_bitrate))

    precompression_settings = {
        "target_total_bitrate": target_total_bitrate,
        "min_size": min_size,
        "video_bitrate": video_bitrate,
        "audio_bitrate": audio_bitrate,
    }

    return precompression_settings


def compress_video(video_path, video_bitrate, audio_bitrate, *, two_pass=False):
    """Re-encode ``video_path`` to H.264/AAC and return the output path.

    The output is written next to the input as ``<name>_compressed.mp4``.

    One pass is faster than two passes. Two-pass encoding does not give better
    quality or a smaller file: it only lets you set the output file size (but
    not the quality), whereas -crf lets you choose the quality (but not the
    file size).

    Args:
        video_path: path of the source video.
        video_bitrate: target video bitrate in bps.
        audio_bitrate: target audio bitrate in bps; a falsy value (None or 0)
            means no audio encoding options are passed to ffmpeg.
        two_pass: run a two-pass encode instead of a single pass.

    Raises:
        ffmpeg.Error: if ffmpeg fails; no usable output exists in that case.
    """
    filename_suffix = "_compressed"
    # Derive the output name from the argument itself, not from a global.
    filename, _ = os.path.splitext(video_path)
    compressed_video = filename + filename_suffix + ".mp4"

    video_options = {"c:v": "libx264", "b:v": video_bitrate}
    audio_options = {"c:a": "aac", "b:a": audio_bitrate} if audio_bitrate else {}

    try:
        stream = ffmpeg.input(video_path)
        # stream = ffmpeg.filter(stream, 'fps', fps=30) this filter kills the audio
        if two_pass:
            # The first pass only gathers statistics: discard its output and
            # skip audio ("an" with a None value is emitted as a bare -an flag).
            first_pass_options = dict(video_options)
            first_pass_options.update({"pass": 1, "f": "mp4", "an": None})
            ffmpeg.output(
                stream, os.devnull, **first_pass_options
            ).overwrite_output().run()

            second_pass_options = dict(video_options)
            second_pass_options["pass"] = 2
            second_pass_options.update(audio_options)
            ffmpeg.output(
                stream, compressed_video, **second_pass_options
            ).overwrite_output().run()
        else:
            single_pass_options = dict(video_options)
            single_pass_options.update(audio_options)
            ffmpeg.output(
                stream, compressed_video, **single_pass_options
            ).overwrite_output().run()
    except ffmpeg.Error as e:
        # stderr is only populated when run() captures it; avoid printing None.
        if e.stderr:
            print(e.stderr)
        print("Compression of {} failed.".format(video_path))
        raise
    print("\nAUDIO BITRATE USED FOR COMPRESSION: ", audio_bitrate)
    return compressed_video


def _format_video_data(info):
    """Render a ``get_video_info`` dict as the multi-line report text."""

    def kbps(value):
        # Bitrates arrive as strings (container) or floats (audio), or None.
        return "n/a" if value is None else round(float(value) / 1000)

    fps = info["fps"]
    return (
        "\nSize: {} MB \nResolution: {}x{} pixels \nDuration: {} sec \n"
        "Video bitrate: {} Kbits per sec \nAudio bitrate {} Kbits per sec \n"
        "Frames per second: {} \nVideo codec: {} \n"
        "Audio codec: {} \nAudio channels: {}"
    ).format(
        round(int(info["size"]) / 1000000, 1),
        info["width"],
        info["height"],
        int(info["duration"]),
        kbps(info["video_bitrate"]),
        kbps(info["audio_bitrate"]),
        "n/a" if fps is None else int(fps),
        info["video_codec"],
        info["audio_codec"],
        info["audio_channels"],
    )


def print_data(video_info, precompression_settings, compressed_video):
    """Print the planned targets and a before/after comparison.

    ``compressed_video`` is probed again with ``get_video_info`` so that every
    "after" value, including the audio bitrate, describes the new file.
    """
    print(
        "\nMinimum size threshold: {} kb".format(
            round(precompression_settings["min_size"])
        )
    )
    print(
        "Target total bitrate: {} kbps".format(
            round(precompression_settings["target_total_bitrate"] / 1000)
        )
    )
    print(
        "Target audio bitrate: {} kbps".format(
            round(precompression_settings["audio_bitrate"] / 1000)
        )
    )
    print(
        "Target video bitrate: {} kbps".format(
            round(precompression_settings["video_bitrate"] / 1000)
        )
    )
    print("\nVideo successfully compressed and saved as {}".format(compressed_video))
    print("\nData before compression:")
    print(_format_video_data(video_info))
    print("\nData after compression:")
    print(_format_video_data(get_video_info(compressed_video)))


def main():
    """Compress the module-level ``video_path`` and print a report."""
    video_info = get_video_info(video_path)
    precompression_settings = get_precompression_settings(
        video_info, target_size=48
    )  # target size in MB per min of video
    compressed_video = compress_video(
        video_path,
        precompression_settings["video_bitrate"],
        precompression_settings["audio_bitrate"],
    )
    print_data(video_info, precompression_settings, compressed_video)


if __name__ == "__main__":
    main()
# requirements.txt (as shipped with the repository):
#   SpeechRecognition==3.8.1
#   ffmpeg_python==0.2.0
#   pydub==0.24.1
#   moviepy==1.0.3
#   ffmpeg==1.4
# NOTE(review): this module imports `srt` and `google.cloud.speech` /
# `google.cloud.storage`, none of which are listed above -- confirm and pin them.
# NOTE(review): `ffmpeg_python` and `ffmpeg` presumably both install a top-level
# `ffmpeg` module; verify which one `import ffmpeg` resolves to.

"""Generate an .srt subtitle file for a video with Google Cloud Speech-to-Text.

Pipeline: extract the audio track to FLAC, upload it to a Cloud Storage
bucket, run long-running speech recognition with word time offsets, group the
recognised words into fixed-length time bins and write them as subtitles.
"""

import datetime
import os

import ffmpeg
import srt
from google.cloud import speech
from google.cloud import storage

file_name = "beam"

bucket_name = "srt_file_generator"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "rapid-stage-289208-23915bd205ba.json"
video_path = file_name + ".mp4"
flac_filepath = file_name + ".flac"
storage_object_name = file_name + ".flac"
# Derived from the bucket and object names so the three cannot drift apart.
gcs_uri = "gs://{}/{}".format(bucket_name, storage_object_name)
# As an improvement we need to generate a unique set of phrases for each store.
# Phrases can include:
#   - brand name
#   - products
#   - main SEO keywords
speech_context = speech.SpeechContext(phrases=["EMK", "EMK products", "Beam Eye Gel"])


def transcode_to_flac(video_path, flac_filepath):
    """Transcode a video file to a single-channel FLAC audio file.

    An existing ``flac_filepath`` is overwritten, so re-running the pipeline
    for the same file does not depend on ffmpeg's overwrite prompt.
    """
    ffmpeg.input(video_path).output(flac_filepath, ac=1).overwrite_output().run()


def upload_to_bucket(bucket_name, flac_filepath, storage_object_name):
    """Upload ``flac_filepath`` to the bucket as ``storage_object_name``."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(storage_object_name)

    blob.upload_from_filename(flac_filepath)

    print("File {} uploaded to bucket: {}.".format(flac_filepath, bucket_name))


def delete_from_bucket(bucket_name, flac_filepath, storage_object_name):
    """Delete ``storage_object_name`` from the bucket.

    ``flac_filepath`` is not used; it is kept so the signature mirrors
    ``upload_to_bucket``.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(storage_object_name)

    blob.delete()


def long_running_recognize(gcs_uri):
    """Run long-running speech recognition on the audio at ``gcs_uri``.

    Uses the module-level ``speech_context`` as phrase hints and requests word
    time offsets, which ``subtitle_generation`` relies on. Blocks until the
    operation finishes and returns its response.
    """
    client = speech.SpeechClient()

    config = speech.RecognitionConfig(
        language_code="en-US",
        speech_contexts=[speech_context],
        enable_word_time_offsets=True,
        model="video",
        enable_automatic_punctuation=True,
    )

    audio = {"uri": gcs_uri}
    operation = client.long_running_recognize(
        request={"config": config, "audio": audio}
    )
    print("Waiting for operation to complete...")
    response = operation.result()
    return response


def _offset(seconds, microseconds):
    """Build the timedelta used for a subtitle boundary."""
    return datetime.timedelta(0, seconds, microseconds)


def subtitle_generation(speech_to_text_response, bin_size=3):
    """Group recognised words into time bins and compose them as SRT text.

    Each bin spans ``bin_size`` seconds starting at its first word
    (``bin_size=3`` means 3-second bins). A word joins the current bin while
    its end second is below the bin's end second; otherwise the bin is emitted
    as a subtitle ending at the previous word and a new bin starts at this
    word. Results without alternatives or words are skipped.

    Args:
        speech_to_text_response: recognition response whose ``results`` carry
            word-level ``start_time`` / ``end_time`` offsets.
        bin_size: length of a bin in seconds.

    Returns:
        The subtitles as a single SRT-formatted string.
    """
    transcriptions = []

    for result in speech_to_text_response.results:
        if not result.alternatives:
            continue
        words = list(result.alternatives[0].words)
        if not words:
            continue

        first_word = words[0]
        if first_word.start_time.seconds:
            # Bin start -> first word of this result.
            start_sec = first_word.start_time.seconds
            start_microsec = first_word.start_time.microseconds
        else:
            # A word starting within the first second (the first word of the
            # response) is shown from 0:00, as in the original behaviour.
            start_sec = 0
            start_microsec = 0
        end_sec = start_sec + bin_size
        transcript = first_word.word

        for previous_word, word in zip(words, words[1:]):
            if word.end_time.seconds < end_sec:
                transcript = transcript + " " + word.word
                continue

            # The current bin is full: emit it, ending at the previous word.
            transcriptions.append(
                srt.Subtitle(
                    len(transcriptions) + 1,
                    _offset(start_sec, start_microsec),
                    _offset(
                        previous_word.end_time.seconds,
                        previous_word.end_time.microseconds,
                    ),
                    transcript,
                )
            )

            # Reset bin parameters; the new bin starts at the current word.
            start_sec = word.start_time.seconds
            start_microsec = word.start_time.microseconds
            end_sec = start_sec + bin_size
            transcript = word.word

        # Emit whatever is left in the last bin of this result.
        last_word = words[-1]
        transcriptions.append(
            srt.Subtitle(
                len(transcriptions) + 1,
                _offset(start_sec, start_microsec),
                _offset(last_word.end_time.seconds, last_word.end_time.microseconds),
                transcript,
            )
        )

    # Turn the transcription list into subtitles.
    subtitles = srt.compose(transcriptions)
    return subtitles


def main():
    """Run the whole pipeline for the module-level ``file_name``."""
    transcode_to_flac(video_path, flac_filepath)
    upload_to_bucket(bucket_name, flac_filepath, storage_object_name)
    try:
        response = long_running_recognize(gcs_uri)
        subtitles = subtitle_generation(response)
        with open(file_name + ".srt", "w", encoding="utf-8") as f:
            f.write(subtitles)
    finally:
        # Always remove the uploaded audio, even if recognition or writing fails.
        delete_from_bucket(bucket_name, flac_filepath, storage_object_name)
    print("Subtitle file generated")


if __name__ == "__main__":
    main()