├── .gitignore
├── input_video.mp4
├── output_video.mp4
├── generated_image.jpg
├── sample_conversation_with_fillers_and_silence.mp3
├── requirements.txt
├── README.md
├── LICENSE
├── ai_video_editor.py
└── video_generator.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
venv/
--------------------------------------------------------------------------------
/input_video.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Anil-matcha/AutoShorts/main/input_video.mp4
--------------------------------------------------------------------------------
/output_video.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Anil-matcha/AutoShorts/main/output_video.mp4
--------------------------------------------------------------------------------
/generated_image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Anil-matcha/AutoShorts/main/generated_image.jpg
--------------------------------------------------------------------------------
/sample_conversation_with_fillers_and_silence.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Anil-matcha/AutoShorts/main/sample_conversation_with_fillers_and_silence.mp3
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
certifi==2024.7.4
charset-normalizer==3.3.2
click==8.1.7
decorator==4.4.2
filelock==3.15.4
fsspec==2024.6.1
gTTS==2.5.3
idna==3.8
imageio==2.35.1
imageio-ffmpeg==0.5.1
Jinja2==3.1.4
llvmlite==0.43.0
MarkupSafe==2.1.5
more-itertools==10.4.0
moviepy==1.0.3
mpmath==1.3.0
networkx==3.3
numba==0.60.0
numpy==2.0.2
openai-whisper @ git+https://github.com/openai/whisper.git@ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
pillow==10.4.0
proglog==0.1.10
pydub==0.25.1
regex==2024.7.24
requests==2.32.3
six==1.16.0
sympy==1.13.2
tiktoken==0.7.0
torch==2.4.0
tqdm==4.66.5
typing_extensions==4.12.2
urllib3==2.2.2
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# AutoShorts AI

Automatically remove silences from your videos.

### YouTube tutorial -> https://youtu.be/O8sgcHyXGLI

### Medium article -> https://medium.com/@anilmatcha/autoshorts-ai-ai-silence-remover-in-python-tutorial-2ff1062a0150

### Demo

Video with silence (35 seconds)

https://github.com/user-attachments/assets/32261790-537d-47d0-bd8c-7bc2c066c983

Trimmed video without silences (23 seconds)

https://github.com/user-attachments/assets/5edbaf3d-7602-49b5-9fa7-f9bd934ed91d

### Steps to use

1. Install the requirements:

```
pip install -r requirements.txt
```

2. Replace `input_video.mp4` with your own file.

3. Run the editor:

```
python ai_video_editor.py
```
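
### Use it from Python

`ai_video_editor.py` can also be imported, which is handy if you want to tune the silence threshold. A minimal sketch (it assumes the default file layout above and reuses the functions defined in `ai_video_editor.py`; the 0.5-second threshold is just a starting point):

```
from moviepy.editor import VideoFileClip
from ai_video_editor import transcribe_video, identify_silence_periods, cut_silences

video_path = "input_video.mp4"
duration = VideoFileClip(video_path).duration

transcription = transcribe_video(video_path)
silences = identify_silence_periods(transcription, duration, threshold=0.5)
cut_silences(video_path, "output_video.mp4", silences)
```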
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Anil Chandra Naidu Matcha

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/ai_video_editor.py:
--------------------------------------------------------------------------------
import whisper
from moviepy.editor import VideoFileClip, concatenate_videoclips

def transcribe_video(video_path):
    """
    Transcribes the video and returns the result with word-level timestamps.

    Args:
        video_path (str): Path to the video file to be transcribed.

    Returns:
        dict: Transcription result containing word-level timestamps.
    """
    # Load the Whisper model
    model = whisper.load_model("base")

    # Transcribe the video. The prompt primes Whisper with filler words
    # ("umm", "like", "hmm") so it transcribes them rather than dropping
    # them; otherwise filler-heavy stretches could be mistaken for silence.
    result = model.transcribe(
        video_path,
        prompt="Umm, let me think like, hmm... Okay, here's what I'm, like, thinking.",
        word_timestamps=True,
    )

    return result

def identify_silence_periods(transcription, video_duration, threshold=1.0, buffer=0.1):
    """
    Identifies silence periods in the transcription based on the threshold.

    Args:
        transcription (dict): The transcription result with word-level timestamps.
        video_duration (float): Total duration of the video in seconds.
        threshold (float): The minimum duration of silence to be considered.
        buffer (float): Padding kept on each side of a cut so speech is not clipped.

    Returns:
        list: A list of tuples where each tuple contains the start and end time of a silence period.
    """
    silence_periods = []
    segments = transcription['segments']
    previous_end = 0

    for segment in segments:
        start_time = segment['start']
        if start_time - previous_end > threshold:
            silence_periods.append((previous_end + buffer, start_time - buffer))
        previous_end = segment['end']

    # Catch trailing silence after the last spoken segment
    if video_duration - previous_end > threshold:
        silence_periods.append((previous_end + buffer, video_duration - buffer))

    return silence_periods
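
# Worked example with made-up timestamps: suppose Whisper reports speech
# segments at 0.0-2.0s and 5.0-6.0s in an 8-second clip. With threshold=1.0
# and buffer=0.1, the gaps 2.0-5.0 and 6.0-8.0 both exceed the threshold,
# so the function returns [(2.1, 4.9), (6.1, 7.9)]; each period is pulled
# in by the buffer so the cuts land just clear of the speech.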

def cut_silences(input_video, output_video, silence_periods):
    """
    Removes the silence periods from the video and saves the result.

    Args:
        input_video (str): Path to the input video file.
        output_video (str): Path to save the output video file.
        silence_periods (list): A list of tuples indicating silence periods (start, end).
    """
    # Load the video
    video = VideoFileClip(input_video)

    # Create a list of clips without the silence periods
    clips = []
    last_end = 0

    for (start, end) in silence_periods:
        if last_end < start:
            clips.append(video.subclip(last_end, start))
        last_end = end

    # Add the final clip if there's any remaining video after the last silence
    if last_end < video.duration:
        clips.append(video.subclip(last_end, video.duration))

    # Concatenate the remaining clips
    if clips:
        final_clip = concatenate_videoclips(clips)
        # Write the result to a file
        final_clip.write_videofile(output_video, codec="libx264", audio_codec="aac")
    else:
        # If no clips are left after cutting silences, save the original video
        video.write_videofile(output_video, codec="libx264", audio_codec="aac")

# Example usage:
if __name__ == "__main__":
    video_path = "input_video.mp4"    # Path to your video file
    output_path = "output_video.mp4"  # Path to save the edited video

    video = VideoFileClip(video_path)
    video_duration = video.duration

    # Step 1: Transcribe the video
    transcription_result = transcribe_video(video_path)
    print("Transcript", transcription_result)

    # Step 2: Identify silence periods
    silence_periods = identify_silence_periods(transcription_result, video_duration, threshold=0.5)

    # Step 3: Cut silences from the video
    cut_silences(video_path, output_path, silence_periods)
--------------------------------------------------------------------------------
/video_generator.py:
--------------------------------------------------------------------------------
from gtts import gTTS
from pydub import AudioSegment
import random
import os
from PIL import Image, ImageDraw, ImageFont
from moviepy.editor import AudioFileClip, ImageClip

def create_image_with_text(image_path):
    # 720x1280 is a portrait (9:16) canvas, the usual Shorts aspect ratio
    width, height = 720, 1280
    image = Image.new('RGB', (width, height), 'white')

    draw = ImageDraw.Draw(image)
    text = "Sample Text Overlay"
    font_size = 50

    try:
        # Load a font
        font = ImageFont.truetype("arial.ttf", font_size)
    except IOError:
        # Fall back to PIL's built-in default font
        font = ImageFont.load_default()

    # Calculate text size (textbbox returns left, top, right, bottom)
    text_width, text_height = draw.textbbox((0, 0), text, font=font)[2:]

    # Calculate position to centre the text
    text_x = (width - text_width) // 2
    text_y = (height - text_height) // 2

    # Draw text on the image
    draw.text((text_x, text_y), text, font=font, fill='black')
    image.save(image_path)

def create_video_from_image_and_audio(image_path, audio_path, output_path):
    # Load the generated image
    image = ImageClip(image_path)

    # Load the audio file
    try:
        audio = AudioFileClip(audio_path)
    except Exception as e:
        print(f"Error loading audio file: {e}")
        return

    # Check if the audio file has duration
    if audio.duration <= 0:
        print("Audio file has no duration.")
        return

    # Set the duration of the image to match the audio duration
    image = image.set_duration(audio.duration)

    # Set the image as the video clip
    video = image.set_audio(audio)

    # Output the final video
    try:
        video.write_videofile(output_path, fps=24, codec='libx264', audio_codec='aac')
        print(f"Video saved as {output_path}")
    except Exception as e:
        print(f"Error creating video: {e}")

# Sample conversation with filler words and intentional silences
conversation = [
    "So, I was thinking we could go to the park later.",
    "But, umm, I’m not really sure if the weather will be good.",
    "You know, I’ve been, uh, really busy with work lately.",
    "Like, maybe we should plan for the weekend instead.",
    "Hmm, but I don't know if I have time, like, to go out.",
    "Anyway, let’s just see how things go."
]

# Create a list to hold audio segments
audio_segments = []

# Generate audio for each sentence and optionally add silence
for sentence in conversation:
    # Save the sentence as audio using gTTS
    tts = gTTS(text=sentence, lang='en')
    tts.save("sentence.mp3")

    # Load the sentence audio
    sentence_audio = AudioSegment.from_file('sentence.mp3')
    audio_segments.append(sentence_audio)

    # Randomly add silence
    if random.choice([True, False]):
        silence_duration = random.randint(1500, 4500)  # 1.5 to 4.5 seconds of silence
        silence = AudioSegment.silent(duration=silence_duration)
        audio_segments.append(silence)
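
# Note: pydub's AudioSegment overloads "+": adding two segments
# concatenates them, while adding a number applies gain in dB. The class
# also supports the sum() builtin, so sum(audio_segments) below joins
# every sentence (and its optional silence) into one continuous track.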
# Combine all audio segments into one
final_audio = sum(audio_segments)

# Save the final audio
audio_file_path = "sample_conversation_with_fillers_and_silence.mp3"
final_audio.export(audio_file_path, format="mp3")

# Clean up the temporary file
os.remove("sentence.mp3")

print(f"Audio file saved as {audio_file_path}")

# Create the image and video
image_path = "generated_image.jpg"
create_image_with_text(image_path)

output_path = "input_video.mp4"
create_video_from_image_and_audio(image_path, audio_file_path, output_path)
--------------------------------------------------------------------------------
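
A quick way to sanity-check an end-to-end run (a sketch, assuming the default file names used above): compare the two durations with moviepy; the trimmed output should be noticeably shorter, e.g. the bundled demo drops from roughly 35 to 23 seconds.

```
from moviepy.editor import VideoFileClip

print(VideoFileClip("input_video.mp4").duration)   # before silence removal
print(VideoFileClip("output_video.mp4").duration)  # after silence removal
```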