├── .gitignore
├── input_video.mp4
├── output_video.mp4
├── generated_image.jpg
├── sample_conversation_with_fillers_and_silence.mp3
├── requirements.txt
├── README.md
├── LICENSE
├── ai_video_editor.py
└── video_generator.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
venv/
--------------------------------------------------------------------------------
/input_video.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Anil-matcha/AutoShorts/main/input_video.mp4
--------------------------------------------------------------------------------
/output_video.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Anil-matcha/AutoShorts/main/output_video.mp4
--------------------------------------------------------------------------------
/generated_image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Anil-matcha/AutoShorts/main/generated_image.jpg
--------------------------------------------------------------------------------
/sample_conversation_with_fillers_and_silence.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Anil-matcha/AutoShorts/main/sample_conversation_with_fillers_and_silence.mp3
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
certifi==2024.7.4
charset-normalizer==3.3.2
click==8.1.7
decorator==4.4.2
filelock==3.15.4
fsspec==2024.6.1
gTTS==2.5.3
idna==3.8
imageio==2.35.1
imageio-ffmpeg==0.5.1
Jinja2==3.1.4
llvmlite==0.43.0
MarkupSafe==2.1.5
more-itertools==10.4.0
moviepy==1.0.3
mpmath==1.3.0
networkx==3.3
numba==0.60.0
numpy==2.0.2
openai-whisper @ git+https://github.com/openai/whisper.git@ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
pillow==10.4.0
proglog==0.1.10
pydub==0.25.1
regex==2024.7.24
requests==2.32.3
six==1.16.0
sympy==1.13.2
tiktoken==0.7.0
torch==2.4.0
tqdm==4.66.5
typing_extensions==4.12.2
urllib3==2.2.2
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# AutoShorts AI

Automatically remove silences from your videos.

### YouTube tutorial -> https://youtu.be/O8sgcHyXGLI

### Medium article -> https://medium.com/@anilmatcha/autoshorts-ai-ai-silence-remover-in-python-tutorial-2ff1062a0150

### Demo

Video with silence (35 seconds)

https://github.com/user-attachments/assets/32261790-537d-47d0-bd8c-7bc2c066c983

Trimmed video without silences (23 seconds)

https://github.com/user-attachments/assets/5edbaf3d-7602-49b5-9fa7-f9bd934ed91d

### Steps to use

1. Install the requirements:

```
pip install -r requirements.txt
```

2. Replace `input_video.mp4` with your own file.

3. Run the editor:

```
python ai_video_editor.py
```
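
### Use it from Python

`ai_video_editor.py` can also be imported, which is handy if you want to tune the silence threshold. A minimal sketch (it assumes the default file layout above and reuses the functions defined in `ai_video_editor.py`; the 0.5-second threshold is just a starting point):

```
from moviepy.editor import VideoFileClip
from ai_video_editor import transcribe_video, identify_silence_periods, cut_silences

video_path = "input_video.mp4"
duration = VideoFileClip(video_path).duration

transcription = transcribe_video(video_path)
silences = identify_silence_periods(transcription, duration, threshold=0.5)
cut_silences(video_path, "output_video.mp4", silences)
```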
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Anil Chandra Naidu Matcha

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/ai_video_editor.py:
--------------------------------------------------------------------------------
import whisper
from moviepy.editor import VideoFileClip, concatenate_videoclips

def transcribe_video(video_path):
    """
    Transcribes the video and returns the result with word-level timestamps.

    Args:
        video_path (str): Path to the video file to be transcribed.

    Returns:
        dict: Transcription result containing word-level timestamps.
    """
    # Load the Whisper model
    model = whisper.load_model("base")

    # Transcribe the video. The prompt primes Whisper with filler words
    # ("umm", "like", "hmm") so it transcribes them rather than dropping
    # them; otherwise filler-heavy stretches could be mistaken for silence.
    result = model.transcribe(
        video_path,
        prompt="Umm, let me think like, hmm... Okay, here's what I'm, like, thinking.",
        word_timestamps=True,
    )

    return result

def identify_silence_periods(transcription, video_duration, threshold=1.0, buffer=0.1):
    """
    Identifies silence periods in the transcription based on the threshold.

    Args:
        transcription (dict): The transcription result with word-level timestamps.
        video_duration (float): Total duration of the video in seconds.
        threshold (float): The minimum duration of silence to be considered.
        buffer (float): Padding kept on each side of a cut so speech is not clipped.

    Returns:
        list: A list of tuples where each tuple contains the start and end time of a silence period.
    """
    silence_periods = []
    segments = transcription['segments']
    previous_end = 0

    for segment in segments:
        start_time = segment['start']
        if start_time - previous_end > threshold:
            silence_periods.append((previous_end + buffer, start_time - buffer))
        previous_end = segment['end']

    # Catch trailing silence after the last spoken segment
    if video_duration - previous_end > threshold:
        silence_periods.append((previous_end + buffer, video_duration - buffer))

    return silence_periods
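
# Worked example with made-up timestamps: suppose Whisper reports speech
# segments at 0.0-2.0s and 5.0-6.0s in an 8-second clip. With threshold=1.0
# and buffer=0.1, the gaps 2.0-5.0 and 6.0-8.0 both exceed the threshold,
# so the function returns [(2.1, 4.9), (6.1, 7.9)]; each period is pulled
# in by the buffer so the cuts land just clear of the speech.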

def cut_silences(input_video, output_video, silence_periods):
    """
    Removes the silence periods from the video and saves the result.

    Args:
        input_video (str): Path to the input video file.
        output_video (str): Path to save the output video file.
        silence_periods (list): A list of tuples indicating silence periods (start, end).
    """
    # Load the video
    video = VideoFileClip(input_video)

    # Create a list of clips without the silence periods
    clips = []
    last_end = 0

    for (start, end) in silence_periods:
        if last_end < start:
            clips.append(video.subclip(last_end, start))
        last_end = end

    # Add the final clip if there's any remaining video after the last silence
    if last_end < video.duration:
        clips.append(video.subclip(last_end, video.duration))

    # Concatenate the remaining clips
    if clips:
        final_clip = concatenate_videoclips(clips)
        # Write the result to a file
        final_clip.write_videofile(output_video, codec="libx264", audio_codec="aac")
    else:
        # If no clips are left after cutting silences, save the original video
        video.write_videofile(output_video, codec="libx264", audio_codec="aac")

# Example usage:
if __name__ == "__main__":
    video_path = "input_video.mp4"    # Path to your video file
    output_path = "output_video.mp4"  # Path to save the edited video

    video = VideoFileClip(video_path)
    video_duration = video.duration

    # Step 1: Transcribe the video
    transcription_result = transcribe_video(video_path)
    print("Transcript", transcription_result)

    # Step 2: Identify silence periods
    silence_periods = identify_silence_periods(transcription_result, video_duration, threshold=0.5)

    # Step 3: Cut silences from the video
    cut_silences(video_path, output_path, silence_periods)
--------------------------------------------------------------------------------
/video_generator.py:
--------------------------------------------------------------------------------
from gtts import gTTS
from pydub import AudioSegment
import random
import os
from PIL import Image, ImageDraw, ImageFont
from moviepy.editor import AudioFileClip, ImageClip

def create_image_with_text(image_path):
    # 720x1280 is a portrait (9:16) canvas, the usual Shorts aspect ratio
    width, height = 720, 1280
    image = Image.new('RGB', (width, height), 'white')

    draw = ImageDraw.Draw(image)
    text = "Sample Text Overlay"
    font_size = 50

    try:
        # Load a font
        font = ImageFont.truetype("arial.ttf", font_size)
    except IOError:
        # Fall back to PIL's built-in default font
        font = ImageFont.load_default()

    # Calculate text size (textbbox returns left, top, right, bottom)
    text_width, text_height = draw.textbbox((0, 0), text, font=font)[2:]

    # Calculate position to centre the text
    text_x = (width - text_width) // 2
    text_y = (height - text_height) // 2

    # Draw text on the image
    draw.text((text_x, text_y), text, font=font, fill='black')
    image.save(image_path)

def create_video_from_image_and_audio(image_path, audio_path, output_path):
    # Load the generated image
    image = ImageClip(image_path)

    # Load the audio file
    try:
        audio = AudioFileClip(audio_path)
    except Exception as e:
        print(f"Error loading audio file: {e}")
        return

    # Check if the audio file has duration
    if audio.duration <= 0:
        print("Audio file has no duration.")
        return

    # Set the duration of the image to match the audio duration
    image = image.set_duration(audio.duration)

    # Set the image as the video clip
    video = image.set_audio(audio)

    # Output the final video
    try:
        video.write_videofile(output_path, fps=24, codec='libx264', audio_codec='aac')
        print(f"Video saved as {output_path}")
    except Exception as e:
        print(f"Error creating video: {e}")

# Sample conversation with filler words and intentional silences
conversation = [
    "So, I was thinking we could go to the park later.",
    "But, umm, I’m not really sure if the weather will be good.",
    "You know, I’ve been, uh, really busy with work lately.",
    "Like, maybe we should plan for the weekend instead.",
    "Hmm, but I don't know if I have time, like, to go out.",
    "Anyway, let’s just see how things go."
]

# Create a list to hold audio segments
audio_segments = []

# Generate audio for each sentence and optionally add silence
for sentence in conversation:
    # Save the sentence as audio using gTTS
    tts = gTTS(text=sentence, lang='en')
    tts.save("sentence.mp3")

    # Load the sentence audio
    sentence_audio = AudioSegment.from_file('sentence.mp3')
    audio_segments.append(sentence_audio)

    # Randomly add silence
    if random.choice([True, False]):
        silence_duration = random.randint(1500, 4500)  # 1.5 to 4.5 seconds of silence
        silence = AudioSegment.silent(duration=silence_duration)
        audio_segments.append(silence)
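
# Note: pydub's AudioSegment overloads "+": adding two segments
# concatenates them, while adding a number applies gain in dB. The class
# also supports the sum() builtin, so sum(audio_segments) below joins
# every sentence (and its optional silence) into one continuous track.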
# Combine all audio segments into one
final_audio = sum(audio_segments)

# Save the final audio
audio_file_path = "sample_conversation_with_fillers_and_silence.mp3"
final_audio.export(audio_file_path, format="mp3")

# Clean up the temporary file
os.remove("sentence.mp3")

print(f"Audio file saved as {audio_file_path}")

# Create the image and video
image_path = "generated_image.jpg"
create_image_with_text(image_path)

output_path = "input_video.mp4"
create_video_from_image_and_audio(image_path, audio_file_path, output_path)
--------------------------------------------------------------------------------
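
A quick way to sanity-check an end-to-end run (a sketch, assuming the default file names used above): compare the two durations with moviepy; the trimmed output should be noticeably shorter, e.g. the bundled demo drops from roughly 35 to 23 seconds.

```
from moviepy.editor import VideoFileClip

print(VideoFileClip("input_video.mp4").duration)   # before silence removal
print(VideoFileClip("output_video.mp4").duration)  # after silence removal
```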