├── README.MD
├── compressor.py
├── requirements.txt
└── srt_generator.py


/README.MD:
--------------------------------------------------------------------------------
 1 | compressor.py is a suite of functions that use the ffmpeg framework to compress and transcode videos.
 2 | 
 3 | srt_generator.py generates a subtitle file (.srt) for a video using the Google Cloud Speech-to-Text API.
 4 | 
 5 | Install the following before getting started:
 6 | 
 7 |     Install python 3.6
 8 |     Install ffmpeg (provides free CLI for multimedia editing)
 9 |     Install google cloud services
10 | 
11 | 
12 | Additionally, I use the following terminal commands to rescale videos and images. For a fixed width and height:
13 | 
14 | ffmpeg -i input.avi -vf scale="720:480" output.avi
15 | 
16 | 
17 | To retain the aspect ratio, set the height to -1 and it will be calculated automatically from the width:
18 | 
19 | ffmpeg -i input.avi -vf scale="720:-1" output.avi
20 | 
21 | 


--------------------------------------------------------------------------------
/compressor.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import ffmpeg
  3 | 
  4 | 
  5 | def get_video_info(video_path):
  6 | 
  7 |     probe = ffmpeg.probe(video_path)
  8 | 
  9 |     size = probe["format"]["size"]
 10 |     duration = float(probe["format"]["duration"])
 11 |     video_bitrate = probe["format"]["bit_rate"]
 12 |     for i in probe["streams"]:
 13 |         if "width" in i:
 14 |             width = i["width"]
 15 |             height = i["height"]
 16 |         if "nb_frames" in i:
 17 |             fps = float(i["nb_frames"]) / float(duration)
 18 |     # if a video has no sound, it won't have audio data values
 19 |     audio_stream = next(
 20 |         (s for s in probe["streams"] if s["codec_type"] == "audio"), None
 21 |     )
 22 |     for i in probe["streams"]:
 23 |         if audio_stream:
 24 |             audio_bitrate = float(audio_stream["bit_rate"])
 25 |             audio_channels = audio_stream["channels"]
 26 |             audio_codec = audio_stream["codec_name"]
 27 |         else:
 28 |             audio_bitrate = None
 29 |             audio_channels = None
 30 |             audio_codec = None
 31 |     # audio_bitrate = float(next((s for s in probe['streams'] if s['codec_type'] == 'audio'), None)['bit_rate'])
 32 |     # audio_channels = next((s for s in probe['streams'] if s['codec_type'] == 'audio'), None)['channels']
 33 |     # audio_codec = next((s for s in probe['streams'] if s['codec_type'] == 'audio'), None)['codec_name']
 34 |     video_codec = next(
 35 |         (s for s in probe["streams"] if s["codec_type"] == "video"), None
 36 |     )["codec_name"]
 37 | 
 38 |     video_info = {
 39 |         "probe": probe,
 40 |         "video_path": video_path,
 41 |         "size": size,
 42 |         "width": width,
 43 |         "height": height,
 44 |         "duration": duration,
 45 |         "video_bitrate": video_bitrate,
 46 |         "fps": fps,
 47 |         "audio_bitrate": audio_bitrate,
 48 |         "video_codec": video_codec,
 49 |         "audio_codec": audio_codec,
 50 |         "audio_channels": audio_channels,
 51 |     }
 52 |     # improvement needed: need to extract caption file if it exists ffmpeg -i my_file.mkv -f webvtt outfile
 53 |     return video_info
 54 | 
 55 | 
 56 | def get_precompression_settings(video_info, target_size):
 57 | 
 58 |     duration = video_info["duration"]
 59 |     audio_bitrate = video_info["audio_bitrate"]
 60 |     size_upper_bound = target_size * 1000 * (duration / 60)  # max video size in KB
 61 | 
 62 |     total_bitrate_lower_bound = 100 * 1000  # in bps
 63 |     min_audio_bitrate = 64 * 1000  # in bps
 64 |     max_audio_bitrate = 128 * 1000  # in bps
 65 |     min_video_bitrate = 1000 * 1000  # in bps
 66 | 
 67 |     """Video quality settings:
 68 |     SD: 1,000 kbps video, 128 kbps audio
 69 |     HD: 2,000 kbps video, 128 kbps audio (recommended for Vidiren)
 70 |     Full HD: 4,500 kbps video, 256 kbps audio"""
 71 | 
 72 |     # Target total bitrate, in bps.
 73 |     target_total_bitrate = (size_upper_bound * 1024 * 8) / (1.073741824 * duration)
 74 |     if target_total_bitrate < total_bitrate_lower_bound:
 75 |         print("Bitrate is extremely low! Stop compress!")
 76 |         exit()
 77 | 
 78 |     # Mininmum size, in kb.
 79 |     min_size = (
 80 |         (min_audio_bitrate + min_video_bitrate) * (1.073741824 * duration) / (8 * 1024)
 81 |     )
 82 |     if size_upper_bound < min_size:
 83 |         print(
 84 |             "Quality not good! Recommended minimum size:",
 85 |             "{:,}".format(int(min_size)),
 86 |             "KB.",
 87 |         )
 88 |         exit()
 89 | 
 90 |     # target audio bitrate, in bps
 91 |     if 10 * audio_bitrate > target_total_bitrate:
 92 |         audio_bitrate = target_total_bitrate / 10
 93 |     if audio_bitrate < min_audio_bitrate < target_total_bitrate:
 94 |         audio_bitrate = min_audio_bitrate
 95 |     elif audio_bitrate > max_audio_bitrate:
 96 |         audio_bitrate = max_audio_bitrate
 97 | 
 98 |     # Target video bitrate, in bps.
 99 |     video_bitrate = target_total_bitrate - audio_bitrate
100 |     if video_bitrate < 1000:
101 |         print("Bitrate {} is extremely low! Stop compress.".format(video_bitrate))
102 | 
103 |     precompression_settings = {
104 |         "target_total_bitrate": target_total_bitrate,
105 |         "min_size": min_size,
106 |         "video_bitrate": video_bitrate,
107 |         "audio_bitrate": audio_bitrate,
108 |     }
109 | 
110 |     return precompression_settings
111 | 
112 | 
def compress_video(video_path, video_bitrate, audio_bitrate, two_pass=False):
    """Re-encode a video to H.264/AAC at the given bitrates.

    The output is written next to the input as "<name>_compressed.mp4" and an
    existing file of that name is overwritten.

    Args:
        video_path: Path of the source video.
        video_bitrate: Target video bitrate passed to ffmpeg as "b:v".
        audio_bitrate: Target audio bitrate passed to ffmpeg as "b:a".  When
            it is None or 0 the option is omitted.
        two_pass: Run a two-pass encode instead of a single pass.  One pass
            is faster; two passes do not give better quality or a smaller
            file, they only let you set the output file size (but not the
            quality), whereas -crf lets you choose the quality (but not the
            file size).

    Returns:
        The path of the compressed file.  An ffmpeg failure is printed, not
        raised, so the path is returned even if encoding did not succeed.
    """
    # Derive the output name from the argument; the previous code read an
    # undefined global ``video_info`` here.
    filename, _ = os.path.splitext(video_path)
    compressed_video = filename + "_compressed" + ".mp4"

    video_options = {"c:v": "libx264", "b:v": video_bitrate}
    audio_options = {"c:a": "aac"}
    if audio_bitrate:
        # Skip "b:a" when no bitrate is known, so ffmpeg never receives a
        # flag without a value.
        audio_options["b:a"] = audio_bitrate

    try:
        stream = ffmpeg.input(video_path)
        # stream = ffmpeg.filter(stream, 'fps', fps=30) this filter kills the audio
        if two_pass:
            # First pass only gathers statistics; its output is discarded.
            ffmpeg.output(
                stream, os.devnull, **dict(video_options, **{"pass": 1, "f": "mp4"})
            ).overwrite_output().run()
            ffmpeg.output(
                stream,
                compressed_video,
                **dict(video_options, **dict(audio_options, **{"pass": 2}))
            ).overwrite_output().run()
        else:
            ffmpeg.output(
                stream, compressed_video, **dict(video_options, **audio_options)
            ).overwrite_output().run()
    except ffmpeg.Error as e:
        print(e.stderr)
    print("\nAUDIO BITRATE USED FOR COMPRESSION: ", audio_bitrate)
    return compressed_video
159 | 
160 | 
def _to_kbps(bitrate):
    """Round a bits-per-second value to whole kbps, or return "n/a" when unknown."""
    if bitrate is None:
        return "n/a"
    return round(int(bitrate) / 1000)


def _format_video_report(info):
    """Render the size/resolution/bitrate summary for one ``get_video_info`` result."""
    template = (
        "\nSize: {} MB \nResolution: {}x{} pixels \nDuration: {} sec \n"
        "Video bitrate: {} Kbits per sec \nAudio bitrate {} Kbits per sec \n"
        "Frames per second: {} \nVideo codec: {} \n"
        "Audio codec: {} \nAudio channels: {}"
    )
    fps = info["fps"]
    return template.format(
        round(int(info["size"]) / 1000000, 1),
        info["width"],
        info["height"],
        int(info["duration"]),
        _to_kbps(info["video_bitrate"]),
        _to_kbps(info["audio_bitrate"]),
        "n/a" if fps is None else int(fps),
        info["video_codec"],
        info["audio_codec"],
        info["audio_channels"],
    )


def print_data(video_info, precompression_settings, compressed_video):
    """Print the compression targets and a before/after comparison.

    Args:
        video_info: Result of ``get_video_info`` for the source video.
        precompression_settings: Result of ``get_precompression_settings``.
        compressed_video: Path of the compressed file; it is probed again
            with ``get_video_info`` to produce the "after" report.
    """
    print(
        "\nMinimum size threshold: {} kb".format(
            round(precompression_settings["min_size"])
        )
    )
    for label, key in (
        ("total", "target_total_bitrate"),
        ("audio", "audio_bitrate"),
        ("video", "video_bitrate"),
    ):
        print(
            "Target {} bitrate: {} kbps".format(
                label, round(precompression_settings[key] / 1000)
            )
        )
    print("\nVideo successfully compressed and saved as {}".format(compressed_video))

    print("\nData before compression:")
    print(_format_video_report(video_info))

    print("\nData after compression:")
    # Every value, including the audio bitrate, comes from the compressed
    # file; the previous code reported the source's audio bitrate here.
    print(_format_video_report(get_video_info(compressed_video)))
220 | 
221 | 
# Video compressed when this module is run as a script.
video_path = "myvideo.mp4"


def main():
    """Probe ``video_path``, compress it to the size budget and print a report."""
    video_info = get_video_info(video_path)
    precompression_settings = get_precompression_settings(
        video_info, target_size=48
    )  # target size in MB per min of video
    compressed_video = compress_video(
        video_path,
        precompression_settings["video_bitrate"],
        precompression_settings["audio_bitrate"],
    )
    print_data(video_info, precompression_settings, compressed_video)


# Without this guard main() was never called and running the file did nothing.
if __name__ == "__main__":
    main()
236 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | SpeechRecognition==3.8.1
2 | ffmpeg_python==0.2.0
3 | pydub==0.24.1
4 | moviepy==1.0.3
5 | ffmpeg==1.4
6 | 
7 | 


--------------------------------------------------------------------------------
/srt_generator.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import srt
  3 | import datetime
  4 | import ffmpeg
  5 | from google.cloud import speech
  6 | from google.cloud import storage
  7 | 
  8 | file_name = "beam"
  9 | 
 10 | bucket_name = "srt_file_generator"
 11 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "rapid-stage-289208-23915bd205ba.json"
 12 | gcs_uri = "gs://srt_file_generator/" + file_name + ".flac"
 13 | # gcs_uri = "gs://fabian_compression_test/rescue.mp4"
 14 | video_path = file_name + ".mp4"
 15 | flac_filepath = file_name + ".flac"
 16 | storage_object_name = file_name + ".flac"
 17 | speech_context = speech.SpeechContext(phrases=["EMK", "EMK products", "Beam Eye Gel"])
 18 | """as an improvement we need to generate a unique set of phrases for each store. Phrases can include:
 19 |     - brand name
 20 |     - products
 21 |     - main SEO keywords"""
 22 | 
 23 | 
 24 | def transcode_to_flac(video_path, flac_filepath):
 25 |     """Trancodes video files to audio flac files"""
 26 |     ffmpeg.input(video_path).output(flac_filepath, ac=1).run()
 27 | 
 28 | 
 29 | def upload_to_bucket(bucket_name, flac_filepath, storage_object_name):
 30 |     """Uploads a file to the cloud bucket."""
 31 | 
 32 |     storage_client = storage.Client()
 33 |     bucket = storage_client.bucket(bucket_name)
 34 |     blob = bucket.blob(storage_object_name)
 35 | 
 36 |     blob.upload_from_filename(flac_filepath)
 37 | 
 38 |     print("File {} uploaded to bucket: {}.".format(flac_filepath, bucket_name))
 39 | 
 40 | 
 41 | def delete_from_bucket(bucket_name, flac_filepath, storage_object_name):
 42 | 
 43 |     storage_client = storage.Client()
 44 |     bucket = storage_client.bucket(bucket_name)
 45 |     blob = bucket.blob(storage_object_name)
 46 | 
 47 |     blob.delete()
 48 | 
 49 | 
 50 | def long_running_recognize(gcs_uri):
 51 | 
 52 |     client = speech.SpeechClient()
 53 | 
 54 |     """ config = {
 55 |         "language_code": "en-US",
 56 |         "speech_contexts": speech_context,
 57 |         "enable_word_time_offsets": True,
 58 |         "model": "video",
 59 |         "enable_automatic_punctuation":True
 60 |     }"""
 61 | 
 62 |     config = speech.RecognitionConfig(
 63 |         language_code="en-US",
 64 |         speech_contexts=[speech_context],
 65 |         enable_word_time_offsets=True,
 66 |         model="video",
 67 |         enable_automatic_punctuation=True,
 68 |     )
 69 | 
 70 |     audio = {"uri": gcs_uri}
 71 |     operation = client.long_running_recognize(
 72 |         request={"config": config, "audio": audio}
 73 |     )
 74 |     print("Waiting for operation to complete...")
 75 |     response = operation.result()
 76 |     return response
 77 | 
 78 | 
 79 | def subtitle_generation(speech_to_text_response, bin_size=3):
 80 |     """We define a bin of time period to display the words in sync with audio.
 81 |     Here, bin_size = 3 means each bin is of 3 secs.
 82 |     All the words in the interval of 3 secs in result will be grouped togather."""
 83 |     transcriptions = []
 84 |     index = 0
 85 | 
 86 |     for result in response.results:
 87 |         try:
 88 |             if result.alternatives[0].words[0].start_time.seconds:
 89 |                 # bin start -> for first word of result
 90 |                 start_sec = result.alternatives[0].words[0].start_time.seconds
 91 |                 start_microsec = result.alternatives[0].words[0].start_time.microseconds
 92 |             else:
 93 |                 # bin start -> For First word of response
 94 |                 start_sec = 0
 95 |                 start_microsec = 0
 96 |             end_sec = start_sec + bin_size  # bin end sec
 97 | 
 98 |             # for last word of result
 99 |             last_word_end_sec = result.alternatives[0].words[-1].end_time.seconds
100 |             last_word_end_microsec = (
101 |                 result.alternatives[0].words[-1].end_time.microseconds
102 |             )
103 | 
104 |             # bin transcript
105 |             transcript = result.alternatives[0].words[0].word
106 | 
107 |             index += 1  # subtitle index
108 | 
109 |             for i in range(len(result.alternatives[0].words) - 1):
110 |                 try:
111 |                     word = result.alternatives[0].words[i + 1].word
112 |                     word_start_sec = (
113 |                         result.alternatives[0].words[i + 1].start_time.seconds
114 |                     )
115 |                     word_start_microsec = (
116 |                         result.alternatives[0].words[i + 1].start_time.microseconds
117 |                     )  # 0.001 to convert nana -> micro
118 |                     word_end_sec = result.alternatives[0].words[i + 1].end_time.seconds
119 | 
120 |                     if word_end_sec < end_sec:
121 |                         transcript = transcript + " " + word
122 |                     else:
123 |                         previous_word_end_sec = (
124 |                             result.alternatives[0].words[i].end_time.seconds
125 |                         )
126 |                         previous_word_end_microsec = (
127 |                             result.alternatives[0].words[i].end_time.microseconds
128 |                         )
129 | 
130 |                         # append bin transcript
131 |                         transcriptions.append(
132 |                             srt.Subtitle(
133 |                                 index,
134 |                                 datetime.timedelta(0, start_sec, start_microsec),
135 |                                 datetime.timedelta(
136 |                                     0, previous_word_end_sec, previous_word_end_microsec
137 |                                 ),
138 |                                 transcript,
139 |                             )
140 |                         )
141 | 
142 |                         # reset bin parameters
143 |                         start_sec = word_start_sec
144 |                         start_microsec = word_start_microsec
145 |                         end_sec = start_sec + bin_size
146 |                         transcript = result.alternatives[0].words[i + 1].word
147 | 
148 |                         index += 1
149 |                 except IndexError:
150 |                     pass
151 |             # append transcript of last transcript in bin
152 |             transcriptions.append(
153 |                 srt.Subtitle(
154 |                     index,
155 |                     datetime.timedelta(0, start_sec, start_microsec),
156 |                     datetime.timedelta(0, last_word_end_sec, last_word_end_microsec),
157 |                     transcript,
158 |                 )
159 |             )
160 |             index += 1
161 |         except IndexError:
162 |             pass
163 | 
164 |     # turn transcription list into subtitles
165 |     subtitles = srt.compose(transcriptions)
166 |     return subtitles
167 | 
168 | 
if __name__ == "__main__":
    # Pipeline: video -> FLAC -> Cloud Storage -> speech recognition -> .srt
    transcode_to_flac(video_path, flac_filepath)
    upload_to_bucket(bucket_name, flac_filepath, storage_object_name)
    try:
        response = long_running_recognize(gcs_uri)
    finally:
        # The uploaded audio is only needed for recognition; remove it even
        # if recognition fails so nothing is left behind in the bucket.
        delete_from_bucket(bucket_name, flac_filepath, storage_object_name)
    subtitles = subtitle_generation(response)
    # Write UTF-8 explicitly so non-ASCII transcripts do not depend on the
    # platform's default encoding.
    with open(file_name + ".srt", "w", encoding="utf-8") as f:
        f.write(subtitles)
    print("Subtitle file generated")
178 | 


--------------------------------------------------------------------------------