├── offlineyoutube
│   ├── lib
│   │   ├── __init__.py
│   │   └── functions.py
│   ├── __init__.py
│   ├── config.py
│   └── app.py
├── pyinstaller scripts
│   └── Apple silicon
│       ├── README.md
│       └── app.spec
├── requirements.txt
├── .gitignore
├── LICENSE
├── setup.py
├── legacy
│   └── VectorDatabaseYoutube.py
└── README.md

--------------------------------------------------------------------------------
/offlineyoutube/lib/__init__.py:
--------------------------------------------------------------------------------
1 | from .functions import *
2 | 

--------------------------------------------------------------------------------
/offlineyoutube/__init__.py:
--------------------------------------------------------------------------------
1 | # offlineyoutube/__init__.py
2 | from .app import *
3 | 
4 | 

--------------------------------------------------------------------------------
/offlineyoutube/config.py:
--------------------------------------------------------------------------------
1 | # config.py
2 | import os
3 | OFFLINE_YOUTUBE_DIR = os.path.join(os.path.expanduser('~'), 'offlineyoutube_files')
4 | 
5 | 

--------------------------------------------------------------------------------
/pyinstaller scripts/Apple silicon/README.md:
--------------------------------------------------------------------------------
1 | ## To build a binary on Apple Silicon, move `app.spec` to the repo root (`/vectorDatabaseYoutube/`) and run:
2 | 
3 | ```bash
4 | pyinstaller --clean app.spec -y
5 | ```
6 | 

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | yt-dlp
2 | pandas
3 | numpy
4 | requests
5 | faiss-cpu
6 | faster-whisper
7 | sentence-transformers
8 | gradio==3.36.1
9 | beautifulsoup4
10 | pysrt
11 | webvtt-py
12 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore build artifacts
2 | build/
3 | dist/
4 | *.egg-info/
5 | __pycache__/
6 | 
7 | # macOS files
8 | .DS_Store
9 | 
10 | # Python cache
11 | *.pyc
12 | *.pyo
13 | 
14 | # Virtual environments
15 | venv/
16 | 
17 | # Project-specific files
18 | offlineyoutube/offlineYoutubeFiles/
19 | 
20 | 

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2024 Drew Thomasson
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 

--------------------------------------------------------------------------------
/pyinstaller scripts/Apple silicon/app.spec:
--------------------------------------------------------------------------------
1 | # -*- mode: python ; coding: utf-8 -*-
2 | from PyInstaller.utils.hooks import collect_data_files
3 | 
4 | # Collect data files for Gradio and Gradio Client
5 | datas = []
6 | datas += collect_data_files('gradio')
7 | datas += collect_data_files('gradio_client')
8 | 
9 | a = Analysis(
10 |     ['app.py'],  # Your main application entry point
11 |     pathex=[],  # Add paths if necessary
12 |     binaries=[],  # Include any additional binaries if needed
13 |     datas=datas,
14 |     hiddenimports=[],  # Specify hidden imports if any
15 |     hookspath=[],  # Add hook paths if required
16 |     hooksconfig={},
17 |     runtime_hooks=[],
18 |     excludes=[],
19 |     noarchive=False,
20 |     optimize=0,  # Optimization level (0 for no optimization)
21 |     module_collection_mode={
22 |         'gradio': 'py',  # Collect Gradio as source .py files
23 |     },
24 | )
25 | 
26 | # Bundle the pure-Python modules into a PYZ archive
27 | pyz = PYZ(a.pure)
28 | 
29 | exe = EXE(
30 |     pyz,
31 |     a.scripts,
32 |     [],
33 |     exclude_binaries=True,
34 |     name='app',
35 |     debug=False,
36 |     bootloader_ignore_signals=False,
37 |     strip=False,
38 |     upx=True,
39 |     console=True,
40 |     disable_windowed_traceback=False,
41 |     argv_emulation=False,
42 |     target_arch=None,
43 |     codesign_identity=None,
44 |     entitlements_file=None,
45 | )
46 | 
47 | # Final collection step: gather the executable, binaries, and data files into a one-folder bundle
48 | coll = COLLECT(
49 |     exe,
50 |     a.binaries,
51 |     a.datas,
52 |     strip=False,
53 |     upx=True,
54 |     upx_exclude=[],
55 |     name='app',
56 | )

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import platform
3 | from setuptools import setup, find_packages
4 | 
5 | # Check for M1 Mac and Python version
6 | if platform.system() == "Darwin" and platform.processor() == "arm":
7 |     if not (sys.version_info.major == 3 and sys.version_info.minor == 10):
8 |         raise RuntimeError(
9 |             "This package requires Python 3.10 on M1 Macs. "
10 |             "Please create a Python 3.10 virtual environment and try again."
11 |         )
12 | 
13 | setup(
14 |     name="offlineyoutube",
15 |     version="2.1.9",
16 |     packages=find_packages(),
17 |     include_package_data=True,
18 |     install_requires=[
19 |         "yt-dlp",
20 |         "pandas",
21 |         "numpy",
22 |         "requests",
23 |         "faiss-cpu",
24 |         "faster-whisper",
25 |         "sentence-transformers",
26 |         "gradio==3.36.1",
27 |         "beautifulsoup4",
28 |         "pysrt",
29 |         "webvtt-py"
30 |     ],
31 |     entry_points={
32 |         "console_scripts": [
33 |             "offlineyoutube=offlineyoutube.app:main"
34 |         ]
35 |     },
36 |     python_requires=">=3.8",
37 |     author="Andrew Phillip Thomasson",
38 |     author_email="drew.thomasson100@gmail.com",
39 |     description="A YouTube video search and management tool with a Gradio interface",
40 |     long_description=open("README.md", encoding="utf-8").read(),
41 |     long_description_content_type="text/markdown",
42 |     url="https://github.com/DrewThomasson/offlineYoutube",
43 |     classifiers=[
44 |         "Programming Language :: Python :: 3",
45 |         "License :: OSI Approved :: MIT License",
46 |         "Operating System :: OS Independent",
47 |     ],
48 | )
49 | 

--------------------------------------------------------------------------------
/legacy/VectorDatabaseYoutube.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import yt_dlp
4 | import pandas as pd
5 | import numpy as np
6 | import requests
7 | import faiss
8 | from faster_whisper import WhisperModel
9 | from sentence_transformers import SentenceTransformer
10 | 
11 | # Setup directories
12 | os.makedirs('thumbnails', exist_ok=True)
13 | os.makedirs('datasets', exist_ok=True)
14 | 
15 | # Initialize models
16 | whisper_model = WhisperModel("small", device="cpu", compute_type="int8")
17 | embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
18 | 
19 | def extract_video_id_from_link(link):
20 |     video_id = re.search(r"v=([0-9A-Za-z_-]{11})", link)
21 |     return f"https://www.youtube.com/watch?v={video_id.group(1)}" if video_id else link
22 | 
23 | 
24 | # Helper function to extract YouTube video ID
25 | def get_video_id(youtube_link):
26 |     pattern = r"(?:v=|\/)([0-9A-Za-z_-]{11}).*"
27 |     match = re.search(pattern, youtube_link)
28 |     return match.group(1) if match else None
29 | 
30 | # Download thumbnail for offline use
31 | def download_thumbnail(video_id):
32 |     thumbnail_url = f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg"
33 |     thumbnail_path = f"thumbnails/{video_id}.jpg"
34 | 
35 |     if not os.path.exists(thumbnail_path):
36 |         response = requests.get(thumbnail_url, stream=True)
37 |         if response.status_code == 200:
38 |             with open(thumbnail_path, 'wb') as f:
39 |                 f.write(response.content)
40 |     return thumbnail_path
41 | 
42 | # Transcribe audio with faster-whisper
43 | def extract_transcript(video_url):
44 |     video_id = get_video_id(video_url)
45 |     print(f"Transcribing {video_id}...")
46 | 
47 |     with yt_dlp.YoutubeDL({'format': 'bestaudio'}) as ydl:
48 |         info = ydl.extract_info(video_url, download=False)
49 |         audio_url = info['url']
50 | 
51 |     segments, _ = whisper_model.transcribe(audio_url, vad_filter=True)
52 | 
53 |     sentences = []
54 |     for segment in segments:
55 |         for sentence in segment.text.split('.'):
56 |             sentence = sentence.strip()
57 |             if sentence:
58 |                 sentences.append((sentence, segment.start))
59 |     return sentences
60 | 
61 | # Process videos into a dataset
62 | def process_videos(video_links):
63 |     data = []
64 | 
65 |     for link in video_links:
66 |         video_id = get_video_id(link)
67 |         sentences = extract_transcript(link)
68 |         thumbnail_path = download_thumbnail(video_id)
69 | 
70 |         for sentence,
timestamp in sentences: 71 | data.append({ 72 | 'text': sentence, 73 | 'timestamp': timestamp, 74 | 'YouTube_link': link, 75 | 'thumbnail_path': thumbnail_path 76 | }) 77 | 78 | return pd.DataFrame(data) 79 | 80 | # Save dataset to CSV 81 | def save_dataset(data): 82 | dataset_path = 'datasets/transcript_dataset.csv' 83 | if os.path.exists(dataset_path): 84 | existing_data = pd.read_csv(dataset_path) 85 | data = pd.concat([existing_data, data], ignore_index=True) 86 | data.to_csv(dataset_path, index=False) 87 | print(f"Dataset saved to {dataset_path}") 88 | 89 | # Create a vector database using FAISS 90 | def create_vector_database(data): 91 | data['embedding'] = data['text'].apply(lambda x: embedding_model.encode(x)) 92 | 93 | dimension = len(data['embedding'].iloc[0]) 94 | index = faiss.IndexFlatL2(dimension) 95 | 96 | embeddings = np.vstack(data['embedding'].values) 97 | index.add(embeddings) 98 | 99 | # Save the FAISS index 100 | faiss.write_index(index, 'datasets/vector_index.faiss') 101 | print("Vector database created and saved.") 102 | return index 103 | 104 | # Query the vector database 105 | def query_vector_database(query, top_k=5): 106 | index = faiss.read_index('datasets/vector_index.faiss') 107 | data = pd.read_csv('datasets/transcript_dataset.csv') 108 | 109 | query_vector = embedding_model.encode(query).reshape(1, -1) 110 | distances, indices = index.search(query_vector, top_k) 111 | 112 | results = data.loc[indices[0]].copy() # Avoid SettingWithCopyWarning 113 | results['score'] = distances[0] 114 | 115 | # Extract base video link for grouping 116 | results['video_id'] = results['YouTube_link'].apply(extract_video_id_from_link) 117 | 118 | # Aggregate most relevant videos by video ID 119 | video_relevance = ( 120 | results.groupby('video_id') 121 | .agg( 122 | relevance=('score', 'mean'), # Average relevance for each video 123 | thumbnail=('thumbnail_path', 'first'), # Use the first thumbnail 124 | text=('text', 'first'), # Use the first text snippet 125 | original_link=('YouTube_link', 'first') # Use the first timestamped link 126 | ) 127 | .reset_index() 128 | .sort_values(by='relevance', ascending=True) # Sort by relevance (lower is better) 129 | .head(5) # Limit to top 5 videos 130 | ) 131 | 132 | return results[['text', 'YouTube_link', 'thumbnail_path', 'score']], video_relevance 133 | 134 | 135 | # Main function to handle video input and queries 136 | def main(): 137 | if not os.path.exists('datasets/transcript_dataset.csv'): 138 | print("No database found. 
Please add videos to create the initial database.") 139 | video_links = get_video_links() 140 | data = process_videos(video_links) 141 | save_dataset(data) 142 | create_vector_database(data) 143 | else: 144 | print("1: Add more videos\n2: Query the existing database") 145 | option = input("Select an option: ").strip() 146 | 147 | if option == '1': 148 | video_links = get_video_links() 149 | data = process_videos(video_links) 150 | save_dataset(data) 151 | create_vector_database(data) 152 | elif option == '2': 153 | query_loop() 154 | else: 155 | print("Invalid option.") 156 | 157 | def get_video_links(): 158 | print("1: Provide a playlist link\n2: Provide a list of video links") 159 | option = input("Select an option: ").strip() 160 | 161 | if option == '1': 162 | playlist_url = input("Enter YouTube playlist URL: ").strip() 163 | with yt_dlp.YoutubeDL({'extract_flat': 'in_playlist'}) as ydl: 164 | playlist_info = ydl.extract_info(playlist_url, download=False) 165 | video_links = [entry['url'] for entry in playlist_info['entries']] 166 | elif option == '2': 167 | video_links = input("Enter YouTube video links (comma-separated): ").strip().split(',') 168 | else: 169 | print("Invalid option.") 170 | return [] 171 | 172 | return video_links 173 | 174 | def query_loop(): 175 | while True: 176 | query = input("Enter your search query (or 'exit' to quit): ").strip() 177 | if query.lower() == 'exit': 178 | break 179 | 180 | results, top_videos = query_vector_database(query) 181 | 182 | # Print detailed results for each text entry 183 | print("\nDetailed Results:\n") 184 | for _, row in results.iterrows(): 185 | print(f"Text: {row['text']}") 186 | print(f"Link: {row['YouTube_link']}") 187 | print(f"Thumbnail: {row['thumbnail_path']}") 188 | print(f"Score: {row['score']:.4f}\n") 189 | 190 | # Print top-ranked videos based on relevance 191 | print("\nTop Relevant Videos:\n") 192 | for idx, row in top_videos.iterrows(): 193 | print(f"Rank {idx + 1}:") 194 | print(f"Relevance Score: {row['relevance']:.4f}") 195 | print(f"Video Link: {row['original_link']}") 196 | print(f"Thumbnail: {row['thumbnail']}") 197 | print(f"Example Text: {row['text']}\n") 198 | 199 | # Run the application 200 | if __name__ == "__main__": 201 | main() 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # **Offline YouTube Video Search Application** 2 | 3 | This application allows users to **extract transcripts from YouTube videos**, **upload their own video/audio files**, **create searchable vector databases**, and **perform semantic searches** using a **Gradio web interface** or **command-line interface (CLI)**. It's powered by `faster-whisper` for transcription, `FAISS` for vector search, and `sentence-transformers` for text embeddings. 4 | 5 | --- 6 | 7 | ## **Features** 8 | 9 | - Extract transcripts from individual videos, playlists, and entire channels. 10 | - **Upload your own video or audio files for processing.** 11 | - Automatically detect playlists, channels, and individual video links. 12 | - Automatically download video thumbnails. 13 | - Store transcripts and create a searchable vector database. 14 | - Perform semantic searches on video content. 15 | - Supports **Gradio web interface** and **CLI** for flexible usage. 16 | - Easily add more videos or your own files to the dataset. 
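
Under the hood, the searchable database behind these features is simply sentence embeddings stored in a FAISS index. The snippet below is a simplified, self-contained sketch of that idea, not the app's exact code; the sample sentences are invented stand-ins for real video transcripts:

```python
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

# Invented stand-ins for transcribed video sentences
sentences = [
    "Today we cover the basics of neural networks.",
    "Next, knead the dough for about ten minutes.",
]
embeddings = np.vstack([model.encode(s) for s in sentences]).astype('float32')

# Exact L2 index, the same index type the app builds over all transcripts
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Encode a search query and retrieve the closest transcript sentence
query = model.encode("machine learning tutorial").astype('float32').reshape(1, -1)
distances, ids = index.search(query, 1)
print(sentences[ids[0][0]])  # prints the neural-network sentence
```

Lower L2 distance means a closer semantic match, which is why the app sorts results ascending by score.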
17 | 
18 | ---
19 | 
20 | ## **Web Interface**
21 | 
22 | ### **Add Videos Tab**
23 | 
24 | - **Enter playlist, channel, and/or video URLs (comma-separated).**
25 | - **Upload your own video/audio files.**
26 | - **Option to process entire channels when a channel URL is provided.**
27 | - **Option to keep videos stored locally or not.**
28 | 
29 | *(Screenshot: Add Videos tab, 2024-11-01 at 11:14 AM)*
30 | 
31 | ### **Search Tab**
32 | 
33 | - **Enter your search query to find relevant snippets.**
34 | - **View top relevant videos with thumbnails and play local videos if available.**
35 | - **View detailed results with timestamps and direct links.**
36 | 
37 | *(Screenshot: Search tab, 2024-11-01 at 11:18 AM)*
38 | *(Screenshot: Search tab, 2024-11-01 at 12:05 PM)*
39 | 
40 | ---
41 | 
42 | ## **Installation**
43 | ![PyPI Downloads](https://static.pepy.tech/badge/offlineyoutube)
44 | 
45 | Ensure you have Python >= 3.8 installed (Apple Silicon Macs require Python 3.10),
46 | then install with pip:
47 | 
48 | ```bash
49 | pip install offlineyoutube
50 | ```
51 | 
52 | ---
53 | 
54 | ## **Usage**
55 | 
56 | The app provides **two ways to interact**:
57 | 1. **Gradio Web Interface**
58 | 2. **Command-Line Interface (CLI)**
59 | 
60 | ### **1. Running the Gradio Web Interface**
61 | 
62 | Launch the web interface:
63 | 
64 | ```bash
65 | offlineyoutube ui
66 | ```
67 | 
68 | or simply:
69 | 
70 | ```bash
71 | offlineyoutube
72 | ```
73 | 
74 | Then, open the URL (usually `http://127.0.0.1:7860`) in your browser.
75 | 
76 | #### **Gradio Interface Tabs:**
77 | 
78 | - **Add Videos:**
79 |   - Enter playlist URLs, channel URLs, and/or individual video URLs (comma-separated).
80 |   - **Upload your own video or audio files for processing.**
81 |   - **Option to process entire YouTube channels when a channel URL is provided.**
82 |   - **Option to keep videos stored locally or not.**
83 |   - The app will automatically detect whether each link is a playlist, channel, or a video.
84 |   - Videos and uploaded files will be transcribed, and the database will be updated with the content.
85 | 
86 | - **Search:**
87 |   - Enter search queries to find relevant snippets from the video transcripts.
88 |   - Results are ranked based on semantic similarity and include video thumbnails.
89 |   - **If local videos are available, you can play them directly in the interface.**
90 | 
91 | ---
92 | 
93 | ### **2. Command-Line Interface (CLI)**
94 | 
95 | The CLI provides more flexibility for programmatic use.
96 | 
97 | #### **Commands Overview**
98 | 
99 | Use the `--help` flag to view available commands and examples:
100 | 
101 | ```bash
102 | offlineyoutube --help
103 | ```
104 | 
105 | **Output:**
106 | 
107 | ```
108 | usage: offlineyoutube [-h] {add,search,ui} ...
109 | 110 | YouTube Video Search Application 111 | 112 | positional arguments: 113 | {add,search,ui} Available commands 114 | add Add videos to the database 115 | search Search the video database 116 | ui Run the Gradio web interface 117 | 118 | optional arguments: 119 | -h, --help Show this help message and exit 120 | 121 | Examples: 122 | # Add videos from a playlist and keep videos locally 123 | offlineyoutube add --input "https://www.youtube.com/playlist?list=YOUR_PLAYLIST_ID" --keep_videos 124 | 125 | # Add specific videos without keeping videos locally 126 | offlineyoutube add --input "https://www.youtube.com/watch?v=VIDEO_ID1,https://www.youtube.com/watch?v=VIDEO_ID2" 127 | 128 | # Add videos from a channel (process entire channel) 129 | offlineyoutube add --input "https://www.youtube.com/channel/CHANNEL_ID" --process_channel 130 | 131 | # Search the database with a query 132 | offlineyoutube search --query "Your search query" --top_k 5 133 | 134 | # Run the Gradio web interface 135 | offlineyoutube ui 136 | ``` 137 | 138 | --- 139 | 140 | ### **Examples of CLI Usage** 141 | 142 | #### **1. Adding Videos** 143 | 144 | - **Add Playlists and Videos:** 145 | 146 | ```bash 147 | offlineyoutube add --input "https://www.youtube.com/playlist?list=YOUR_PLAYLIST_ID,https://www.youtube.com/watch?v=VIDEO_ID" 148 | ``` 149 | 150 | - **Add Specific Videos Without Keeping Them Locally:** 151 | 152 | ```bash 153 | offlineyoutube add --input "https://www.youtube.com/watch?v=dQw4w9WgXcQ,https://www.youtube.com/watch?v=9bZkp7q19f0" 154 | ``` 155 | 156 | - **Add Videos from a Channel (Process Entire Channel):** 157 | 158 | ```bash 159 | offlineyoutube add --input "https://www.youtube.com/channel/CHANNEL_ID" --process_channel 160 | ``` 161 | 162 | - **Add Videos and Keep Videos Stored Locally:** 163 | 164 | ```bash 165 | offlineyoutube add --input "https://www.youtube.com/watch?v=VIDEO_ID" --keep_videos 166 | ``` 167 | 168 | #### **2. Searching the Database** 169 | 170 | - **Perform a Search:** 171 | 172 | ```bash 173 | offlineyoutube search --query "machine learning tutorials" --top_k 5 174 | ``` 175 | 176 | --- 177 | 178 | ### **How It Works** 179 | 180 | 1. **Adding Videos and Uploaded Files:** 181 | - The app accepts a list of links and automatically detects whether each link is a playlist, channel, or an individual video. 182 | - **You can upload your own video or audio files for processing.** 183 | - It downloads video audio (or uses uploaded files) and transcribes it using `faster-whisper`. 184 | - Thumbnails are downloaded and saved locally. 185 | - The transcript data is saved in `datasets/transcript_dataset.csv`. 186 | - A vector database is updated using FAISS with embeddings generated by `sentence-transformers`. 187 | 188 | 2. **Incremental Updating:** 189 | - Videos and uploaded files are processed one by one, and the dataset and vector database are updated incrementally. 190 | - This ensures efficient processing, especially when dealing with large datasets. 191 | 192 | 3. **Searching the Database:** 193 | - When a query is entered, the app computes its embedding and searches the FAISS index for relevant video snippets. 194 | - The top results are displayed with thumbnails, titles, and links to the videos. 195 | - **If local videos are available, you can play them directly in the interface.** 196 | 197 | --- 198 | 199 | ### **FAQ** 200 | 201 | #### **1. 
How do I add multiple playlists, channels, and videos at once?**
202 | 
203 | Simply provide a comma-separated list of URLs, and the app will automatically detect and process each link:
204 | 
205 | ```bash
206 | offlineyoutube add --input "https://www.youtube.com/playlist?list=PLAYLIST_ID1,https://www.youtube.com/watch?v=VIDEO_ID,https://www.youtube.com/channel/CHANNEL_ID"
207 | ```
208 | 
209 | If you want to process entire channels, make sure to include the `--process_channel` flag:
210 | 
211 | ```bash
212 | offlineyoutube add --input "https://www.youtube.com/channel/CHANNEL_ID" --process_channel
213 | ```
214 | 
215 | #### **2. How can I upload my own video or audio files for processing?**
216 | 
217 | In the Gradio web interface, navigate to the **Add Videos** tab. Use the **"Upload your own video/audio files"** option to upload one or multiple files. The app will process these files and add them to the database.
218 | 
219 | #### **3. Why aren’t new videos or uploaded files showing up in search results?**
220 | 
221 | Ensure that the videos or files have been fully processed and that the vector database has been updated. The app handles this automatically, but processing may take time for large videos, playlists, or channels.
222 | 
223 | #### **4. How do I prevent videos from being stored locally?**
224 | 
225 | In the CLI, videos are not kept unless you pass the `--keep_videos` flag. It is an on/off switch that takes no value, so to avoid storing videos, simply omit it:
226 | 
227 | ```bash
228 | offlineyoutube add --input "VIDEO_OR_PLAYLIST_URL"
229 | ```
230 | 
231 | In the Gradio interface, where videos are kept by default, uncheck the **"Keep videos stored locally"** option in the **Add Videos** tab.
232 | 
233 | #### **5. Can I process entire YouTube channels?**
234 | 
235 | Yes! Use the `--process_channel` flag when adding videos via the CLI:
236 | 
237 | ```bash
238 | offlineyoutube add --input "https://www.youtube.com/channel/CHANNEL_ID" --process_channel
239 | ```
240 | 
241 | In the Gradio interface, check the **"Process entire channel when a channel URL is provided"** option in the **Add Videos** tab.
242 | 
243 | #### **6. Can I search the database without launching the Gradio interface?**
244 | 
245 | Yes! Use the `search` command via the CLI:
246 | 
247 | ```bash
248 | offlineyoutube search --query "Your query" --top_k 5
249 | ```
250 | 
251 | ---
252 | 
253 | ### **Project Structure**
254 | (The `datasets/`, `thumbnails/`, `videos/`, `tmp/`, and `uploaded_files/` folders below are created at runtime under `~/offlineyoutube_files`.)
255 | ```
256 | .
257 | ├── app.py                      # Main application script (Gradio + CLI)
258 | ├── functions.py                # Helper functions for transcription, FAISS, etc.
259 | ├── datasets/
260 | │   ├── transcript_dataset.csv  # CSV file storing transcripts
261 | │   └── vector_index.faiss      # FAISS vector index
262 | ├── thumbnails/                 # Folder for storing video thumbnails
263 | ├── videos/                     # Folder for storing downloaded videos (if keep_videos is True)
264 | ├── tmp/                        # Temporary folder for videos (if keep_videos is False)
265 | ├── uploaded_files/             # Folder for storing uploaded files
266 | ```
267 | 
268 | ---
269 | 
270 | ### **Known Limitations**
271 | 
272 | - **Processing Time:** Transcribing videos and generating embeddings can be time-consuming, especially for long videos, large playlists, or channels.
273 | - **Storage Requirements:** Keeping videos stored locally will require additional disk space. Omit the `--keep_videos` flag (or uncheck the option in the web interface) if storage is a concern.
274 | - **Large Datasets:** As the dataset grows, querying may take longer. Consider optimizing the FAISS index for very large datasets; a sketch follows below.
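
To make that last point concrete: the app builds an exact `faiss.IndexFlatL2`, and for very large datasets an approximate inverted-file index (`IndexIVFFlat`) answers queries much faster at a small cost in recall. The sketch below is illustrative only and is not wired into the app; the vector count, `nlist`, and `nprobe` values are made-up assumptions you would tune for your own data:

```python
import numpy as np
import faiss

d = 384        # embedding size of all-MiniLM-L6-v2
nlist = 1024   # number of IVF clusters; a rough rule of thumb is sqrt(num_vectors)

# Stand-in for real transcript embeddings (substitute your own float32 matrix)
embeddings = np.random.rand(100_000, d).astype('float32')

quantizer = faiss.IndexFlatL2(d)                 # coarse quantizer over cluster centroids
index = faiss.IndexIVFFlat(quantizer, d, nlist)  # inverted-file index

index.train(embeddings)  # IVF indexes must be trained before vectors are added
index.add(embeddings)

index.nprobe = 16        # clusters searched per query: higher = better recall, slower
query = np.random.rand(1, d).astype('float32')
distances, ids = index.search(query, 5)          # top-5 nearest transcript snippets
```

A trained IVF index can be persisted with `faiss.write_index`, exactly like the flat index the app already writes.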
275 | 
276 | ---
277 | 
278 | ### **Contributing**
279 | 
280 | Feel free to fork the repository, open issues, or submit pull requests if you'd like to contribute to this project.
281 | 
282 | ---
283 | 
284 | ### **License**
285 | 
286 | This project is licensed under the MIT License. See the LICENSE file for details.
287 | 
288 | ---
289 | 
290 | ### **Acknowledgments**
291 | 
292 | - **faster-whisper** for fast transcription.
293 | - **FAISS** for efficient vector search.
294 | - **Gradio** for the interactive web interface.
295 | - **yt-dlp** for downloading video content.
296 | 
297 | ---
298 | 

--------------------------------------------------------------------------------
/offlineyoutube/app.py:
--------------------------------------------------------------------------------
1 | # app.py
2 | 
3 | import os
4 | import sys
5 | sys.path.append(os.path.dirname(__file__))  # Make the package's lib/ and config modules importable
6 | import multiprocessing
7 | import shutil
8 | import gradio as gr
9 | import argparse
10 | import pandas as pd
11 | from lib.functions import (
12 |     initialize_models, setup_directories, process_videos,
13 |     query_vector_database, get_video_links
14 | )
15 | from config import OFFLINE_YOUTUBE_DIR  # Base directory for all app data (~/offlineyoutube_files)
16 | 
17 | def add_videos_interface(input_text, uploaded_files, process_channel, keep_videos, video_quality):
18 |     """
19 |     Interface function for adding videos to the database.
20 |     """
21 |     # Initialize models within the function to avoid multi-processing issues
22 |     whisper_model, embedding_model = initialize_models()
23 | 
24 |     video_links = get_video_links(input_text, process_channel)
25 |     uploaded_files_paths = []
26 |     if uploaded_files:
27 |         uploaded_files_dir = os.path.join(OFFLINE_YOUTUBE_DIR, 'uploaded_files')
28 |         os.makedirs(uploaded_files_dir, exist_ok=True)
29 |         for uploaded_file in uploaded_files:
30 |             try:
31 |                 original_filename = os.path.basename(uploaded_file.name)
32 |                 file_path = os.path.join(uploaded_files_dir, original_filename)
33 | 
34 |                 shutil.copy(uploaded_file.name, file_path)
35 | 
36 |                 if os.path.getsize(file_path) == 0:
37 |                     print(f"Uploaded file {original_filename} is empty. Skipping.")
38 |                     continue
39 |                 uploaded_files_paths.append(file_path)
40 |                 print(f"Saved uploaded file {original_filename} to {file_path} ({os.path.getsize(file_path)} bytes)")
41 |             except Exception as e:
42 |                 print(f"Error saving uploaded file {original_filename}: {e}")
43 |     if not video_links and not uploaded_files_paths:
44 |         return "No valid video links or files provided."
45 |     # Process videos and uploaded files with selected video quality
46 |     data, video_titles = process_videos(
47 |         video_links, uploaded_files_paths, keep_videos=keep_videos, video_quality=video_quality
48 |     )
49 | 
50 |     # Prepare a message with the video titles
51 |     if video_titles:
52 |         titles_message = "\n".join(f"- {title}" for title in video_titles)
53 |         return f"Videos processed and database updated.\nAdded Videos:\n{titles_message}"
54 |     else:
55 |         return "No new videos were added to the database."
56 | 
57 | def search_interface(query_text, top_k):
58 |     """
59 |     Interface function for searching the database.
60 |     """
61 |     # Initialize only the embedding model within the function
62 |     _, embedding_model = initialize_models()
63 | 
64 |     index_path = os.path.join(OFFLINE_YOUTUBE_DIR, 'datasets', 'vector_index.faiss')
65 |     dataset_path = os.path.join(OFFLINE_YOUTUBE_DIR, 'datasets', 'transcript_dataset.csv')
66 | 
67 |     if not os.path.exists(index_path):
68 |         return "No database found. Please add videos first.", None
69 |     try:
70 |         results, top_videos = query_vector_database(query_text, embedding_model, top_k=top_k)
71 |     except Exception as e:
72 |         return f"Error: {e}", None
73 | 
74 |     # Prepare top videos
75 |     top_videos_html = "<h3>Top Relevant Videos:</h3>"
76 |     for idx, row in top_videos.iterrows():
77 |         rank = idx + 1  # Since idx is now sequential
78 |         # Check if local video exists
79 |         local_video_path = row['local_video_path']
80 |         if isinstance(local_video_path, str) and local_video_path and not pd.isnull(local_video_path):
81 |             local_video_exists = os.path.exists(local_video_path)
82 |         else:
83 |             local_video_exists = False
84 |         local_video_player = ''
85 |         if local_video_exists:
86 |             # Replace backslashes with forward slashes for compatibility
87 |             local_video_url = 'file/' + local_video_path.replace("\\", "/")
88 |             local_video_player = f"""
89 |             <details>
90 |                 <summary>Show Local Video</summary>
91 |                 <video width="320" controls>
92 |                     <source src="{local_video_url}" type="video/mp4">
93 |                     Your browser does not support the video tag.
94 |                 </video>
95 |             </details>
96 |             """
97 |         top_videos_html += f"""
98 |         <div>
99 |             <h4>Rank {rank}</h4>
100 |             <img src="file/{row['thumbnail']}" alt="Thumbnail" width="240">
101 |             <p>Title: {row['video_title']}</p>
102 |             <p>Relevance Score: {row['relevance']:.4f}</p>
103 |             <p>Example Text: {row['text']}</p>
104 |             <a href="{row['original_link']}" target="_blank">Watch on YouTube</a>
105 |             {local_video_player}
106 |         </div>
107 |         <hr>
108 |         """
109 | 
110 |     # Prepare detailed results
111 |     detailed_html = "<h3>Detailed Results:</h3>"
112 |     for _, row in results.iterrows():
113 |         # Check if local video exists
114 |         local_video_path = row['local_video_path']
115 |         if isinstance(local_video_path, str) and local_video_path and not pd.isnull(local_video_path):
116 |             local_video_exists = os.path.exists(local_video_path)
117 |         else:
118 |             local_video_exists = False
119 |         local_video_player = ''
120 |         if local_video_exists:
121 |             # Replace backslashes with forward slashes for compatibility
122 |             local_video_url = 'file/' + local_video_path.replace("\\", "/")
123 |             timestamp = int(row['timestamp'])
124 |             local_video_player = f"""
125 |             <details>
126 |                 <summary>Show Local Video at Timestamp</summary>
127 |                 <video width="320" controls>
128 |                     <source src="{local_video_url}#t={timestamp}" type="video/mp4">
129 |                     Your browser does not support the video tag.
130 |                 </video>
131 |             </details>
132 |             """
133 |         detailed_html += f"""
134 |         <div>
135 |             <img src="file/{row['thumbnail_path']}" alt="Thumbnail" width="240">
136 |             <p>Title: {row['video_title']}</p>
137 |             <p>Text: {row['text']}</p>
138 |             <p>Score: {row['score']:.4f}</p>
139 |             <a href="{row['YouTube_timestamped_link']}" target="_blank">Watch on YouTube at Timestamp</a>
140 |             {local_video_player}
141 |         </div>
142 |         <hr>
143 | """ 144 | return top_videos_html, detailed_html 145 | 146 | def main(): 147 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 148 | setup_directories() 149 | 150 | parser = argparse.ArgumentParser( 151 | description="YouTube Video Search Application", 152 | epilog=""" 153 | Examples: 154 | # Add videos from a playlist and keep videos locally 155 | python app.py add --input "https://www.youtube.com/playlist?list=YOUR_PLAYLIST_ID" --keep_videos 156 | 157 | # Add specific videos without keeping videos locally 158 | python app.py add --input "https://www.youtube.com/watch?v=VIDEO_ID1,https://www.youtube.com/watch?v=VIDEO_ID2" 159 | 160 | # Search the database with a query 161 | python app.py search --query "Your search query" --top_k 5 162 | 163 | # Run the Gradio web interface 164 | python app.py ui 165 | """, 166 | formatter_class=argparse.RawDescriptionHelpFormatter 167 | ) 168 | 169 | subparsers = parser.add_subparsers(dest='command') 170 | 171 | # Add videos command 172 | parser_add = subparsers.add_parser('add', help='Add videos to the database') 173 | parser_add.add_argument('--input', required=True, help='Playlist URL or comma-separated video URLs') 174 | parser_add.add_argument('--process_channel', action='store_true', help='Process entire channel when a channel URL is provided') 175 | parser_add.add_argument('--keep_videos', action='store_true', help='Keep videos stored locally') 176 | 177 | # Search command 178 | parser_search = subparsers.add_parser('search', help='Search the video database') 179 | parser_search.add_argument('--query', required=True, help='Search query') 180 | parser_search.add_argument('--top_k', type=int, default=5, help='Number of results to return') 181 | 182 | # Run Gradio interface 183 | parser_ui = subparsers.add_parser('ui', help='Run the Gradio web interface') 184 | 185 | args = parser.parse_args() 186 | 187 | if args.command == 'add': 188 | # For CLI, use the default video quality of 720p 189 | default_video_quality = "720p" 190 | status = add_videos_interface(args.input, [], args.process_channel, args.keep_videos, default_video_quality) 191 | print(status) 192 | 193 | elif args.command == 'search': 194 | top_videos_html, detailed_results = search_interface(args.query, args.top_k) 195 | if isinstance(top_videos_html, str): 196 | print(top_videos_html) 197 | else: 198 | # Extract data from HTML for console output 199 | from bs4 import BeautifulSoup 200 | 201 | # Extract top videos 202 | soup = BeautifulSoup(top_videos_html, 'html.parser') 203 | print("Top Relevant Videos:\n") 204 | for idx, div in enumerate(soup.find_all('div')): 205 | rank = div.find('h4').text 206 | title = div.find('p', text=lambda t: t and 'Title:' in t).text 207 | relevance = div.find('p', text=lambda t: t and 'Relevance Score:' in t).text 208 | example_text = div.find('p', text=lambda t: t and 'Example Text:' in t).text 209 | link = div.find('a')['href'] 210 | print(f"{rank}\n{title}\n{relevance}\n{example_text}\nLink: {link}\n") 211 | 212 | # Extract detailed results 213 | soup = BeautifulSoup(detailed_results, 'html.parser') 214 | print("Detailed Results:\n") 215 | for div in soup.find_all('div'): 216 | title = div.find('p', text=lambda t: t and 'Title:' in t).text 217 | text = div.find('p', text=lambda t: t and 'Text:' in t).text 218 | score = div.find('p', text=lambda t: t and 'Score:' in t).text 219 | link = div.find('a')['href'] 220 | print(f"{title}\n{score}\n{text}\nLink: {link}\n") 221 | 222 | else: 223 | # Run Gradio interface if no command is provided or 'ui' command 
is used 224 | with gr.Blocks(theme=gr.themes.Soft()) as demo: 225 | gr.Markdown("# 🎥 YouTube Video Search Application") 226 | 227 | with gr.Tab("Add Videos"): 228 | gr.Markdown("### Add videos to the database") 229 | input_text = gr.Textbox(lines=2, placeholder="Enter playlist, channel, and/or video URLs (comma-separated)") 230 | process_channel = gr.Checkbox(label="Process entire channel when a channel URL is provided", value=False) 231 | keep_videos = gr.Checkbox(label="Keep videos stored locally", value=True) 232 | video_quality = gr.Dropdown( 233 | label="Select Video Quality", 234 | choices=["144p", "240p", "360p", "480p", "720p", "1080p"], 235 | value="720p", 236 | info="Choose the desired video quality for downloads." 237 | ) 238 | file_upload = gr.File(label="Upload your own video/audio files", file_count="multiple", type="file") 239 | add_button = gr.Button("Add Videos") 240 | add_output = gr.Textbox(label="Status") 241 | add_button.click( 242 | add_videos_interface, 243 | inputs=[input_text, file_upload, process_channel, keep_videos, video_quality], 244 | outputs=add_output 245 | ) 246 | 247 | with gr.Tab("Search"): 248 | gr.Markdown("### Search the video database") 249 | query_text = gr.Textbox(lines=1, placeholder="Enter your search query") 250 | top_k = gr.Slider(1, 20, value=5, step=1, label="Number of Results") 251 | search_button = gr.Button("Search") 252 | top_video_results = gr.HTML() 253 | detailed_results = gr.HTML() 254 | search_button.click( 255 | search_interface, 256 | inputs=[query_text, top_k], 257 | outputs=[top_video_results, detailed_results] 258 | ) 259 | 260 | demo.launch() 261 | 262 | if __name__ == "__main__": 263 | # Fix for multiprocessing in PyInstaller 264 | multiprocessing.freeze_support() 265 | 266 | # Ensure set_start_method is only set once 267 | try: 268 | multiprocessing.set_start_method('spawn', force=True) 269 | except RuntimeError: 270 | pass 271 | 272 | main() 273 | -------------------------------------------------------------------------------- /offlineyoutube/lib/functions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import yt_dlp 4 | import pandas as pd 5 | import numpy as np 6 | import requests 7 | import faiss 8 | import shutil 9 | from faster_whisper import WhisperModel 10 | from sentence_transformers import SentenceTransformer 11 | from tqdm import tqdm 12 | import pysrt 13 | import subprocess 14 | import webvtt 15 | import tempfile 16 | from pathlib import Path 17 | from config import OFFLINE_YOUTUBE_DIR # Ensure this path is correct 18 | 19 | def initialize_models(whisper_model_size='tiny', device='cpu', compute_type='int8', embedding_model_name='all-MiniLM-L6-v2'): 20 | """ 21 | Initialize the Whisper and embedding models. 
22 | """ 23 | try: 24 | whisper_model = WhisperModel(whisper_model_size, device=device, compute_type=compute_type) 25 | print(f"Initialized WhisperModel with size='{whisper_model_size}', device='{device}', compute_type='{compute_type}'.") 26 | except Exception as e: 27 | print(f"Error initializing WhisperModel: {e}") 28 | raise e 29 | 30 | try: 31 | embedding_model = SentenceTransformer(embedding_model_name) 32 | print(f"Initialized SentenceTransformer with model='{embedding_model_name}'.") 33 | except Exception as e: 34 | print(f"Error initializing SentenceTransformer: {e}") 35 | raise e 36 | 37 | return whisper_model, embedding_model 38 | 39 | def setup_directories(): 40 | """ 41 | Create necessary directories for storing thumbnails and datasets within the base directory. 42 | """ 43 | directories = [ 44 | 'thumbnails', 45 | 'datasets', 46 | 'tmp', 47 | 'videos', 48 | 'uploaded_files' 49 | ] 50 | for directory in directories: 51 | path = os.path.join(OFFLINE_YOUTUBE_DIR, directory) 52 | os.makedirs(path, exist_ok=True) 53 | print(f"Ensured directory exists: {path}") 54 | 55 | def extract_video_id_from_link(link): 56 | """ 57 | Extract YouTube video ID from a link. 58 | """ 59 | video_id = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", link) 60 | return video_id.group(1) if video_id else None 61 | 62 | def get_video_id(youtube_link): 63 | """ 64 | Get the video ID from a YouTube link. 65 | """ 66 | pattern = r"(?:v=|\/)([0-9A-Za-z_-]{11}).*" 67 | match = re.search(pattern, youtube_link) 68 | return match.group(1) if match else None 69 | 70 | def download_thumbnail(video_id): 71 | """ 72 | Download the thumbnail image for a YouTube video. 73 | """ 74 | thumbnail_url = f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg" 75 | thumbnail_path = os.path.join(OFFLINE_YOUTUBE_DIR, 'thumbnails', f"{video_id}.jpg") 76 | 77 | if not os.path.exists(thumbnail_path): 78 | try: 79 | response = requests.get(thumbnail_url, stream=True) 80 | if response.status_code == 200: 81 | with open(thumbnail_path, 'wb') as f: 82 | shutil.copyfileobj(response.raw, f) 83 | print(f"Downloaded thumbnail for video ID {video_id} to {thumbnail_path}.") 84 | else: 85 | print(f"Failed to download thumbnail for video ID {video_id}. Status code: {response.status_code}") 86 | except Exception as e: 87 | print(f"Error downloading thumbnail for video ID {video_id}: {e}") 88 | else: 89 | print(f"Thumbnail already exists for video ID {video_id} at {thumbnail_path}.") 90 | return thumbnail_path 91 | 92 | def download_video(video_url, output_dir, keep_video=True, download_audio_only=False, video_quality="720p"): 93 | """ 94 | Download video or audio to a specified directory, attempt to download subtitles. 
95 | """ 96 | # First, attempt to download subtitles only 97 | subtitles_available, subtitle_file, video_id, video_title = download_subtitles(video_url, output_dir) 98 | 99 | # Define video quality mapping 100 | quality_mapping = { 101 | "144p": "bestvideo[height<=144][ext=mp4]+bestaudio[ext=m4a]/mp4", 102 | "240p": "bestvideo[height<=240][ext=mp4]+bestaudio[ext=m4a]/mp4", 103 | "360p": "bestvideo[height<=360][ext=mp4]+bestaudio[ext=m4a]/mp4", 104 | "480p": "bestvideo[height<=480][ext=mp4]+bestaudio[ext=m4a]/mp4", 105 | "720p": "bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]/mp4", 106 | "1080p": "bestvideo[height<=1080][ext=mp4]+bestaudio[ext=m4a]/mp4", 107 | } 108 | 109 | selected_format = quality_mapping.get(video_quality, "bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]/mp4") 110 | print(f"Selected format for download: {selected_format}") 111 | 112 | # Decide whether to download video or audio based on subtitles availability and user preference 113 | if keep_video: 114 | # Need to download the video with selected quality 115 | ydl_opts = { 116 | 'format': selected_format, 117 | 'outtmpl': os.path.join(output_dir, '%(id)s.%(ext)s'), 118 | 'quiet': True, 119 | 'no_warnings': True, 120 | 'merge_output_format': 'mp4', 121 | 'skip_download': False, 122 | } 123 | try: 124 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 125 | info_dict = ydl.extract_info(video_url, download=True) 126 | video_id = info_dict.get('id', '') 127 | video_title = info_dict.get('title', '') 128 | # Get the actual filename 129 | filename = ydl.prepare_filename(info_dict) 130 | video_file = filename 131 | print(f"Downloaded video: {video_file}") 132 | except Exception as e: 133 | print(f"Error downloading media for video {video_url}: {e}") 134 | video_file = None 135 | else: 136 | # If subtitles are available and not keeping video, we don't need to download anything 137 | if subtitles_available: 138 | print("Subtitles found. Proceeding without downloading media.") 139 | video_file = None 140 | else: 141 | # Need to download audio for transcription 142 | ydl_opts = { 143 | 'format': 'bestaudio/best', 144 | 'outtmpl': os.path.join(output_dir, '%(id)s.%(ext)s'), 145 | 'quiet': True, 146 | 'no_warnings': True, 147 | 'skip_download': False, 148 | } 149 | try: 150 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 151 | info_dict = ydl.extract_info(video_url, download=True) 152 | video_id = info_dict.get('id', '') 153 | video_title = info_dict.get('title', '') 154 | # Get the actual filename 155 | filename = ydl.prepare_filename(info_dict) 156 | video_file = filename 157 | print(f"Downloaded audio: {video_file}") 158 | except Exception as e: 159 | print(f"Error downloading audio for video {video_url}: {e}") 160 | video_file = None 161 | 162 | return video_file, video_id, video_title, subtitles_available, subtitle_file 163 | 164 | def download_subtitles(video_url, output_dir): 165 | """ 166 | Attempt to download subtitles for a video without downloading the video. 
167 | """ 168 | ydl_opts = { 169 | 'skip_download': True, 170 | 'writesubtitles': True, 171 | 'writeautomaticsub': True, 172 | 'subtitleslangs': ['en'], 173 | 'quiet': True, 174 | 'outtmpl': os.path.join(output_dir, '%(id)s'), 175 | } 176 | try: 177 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 178 | info_dict = ydl.extract_info(video_url, download=False) 179 | video_id = info_dict.get('id', '') 180 | video_title = info_dict.get('title', '') 181 | 182 | # Check for subtitle files 183 | subtitle_file = None 184 | subtitles_available = False 185 | possible_extensions = ['en.srt', 'en.vtt'] 186 | for ext_sub in possible_extensions: 187 | possible_subtitle_file = os.path.join(output_dir, f"{video_id}.{ext_sub}") 188 | if os.path.exists(possible_subtitle_file): 189 | subtitle_file = possible_subtitle_file 190 | subtitles_available = True 191 | print(f"Found subtitle file: {subtitle_file}") 192 | break 193 | 194 | # If subtitles are not available, attempt with subprocess 195 | if not subtitles_available: 196 | print("Subtitles not found. Attempting to download subtitles using alternative method.") 197 | cmd = [ 198 | 'yt-dlp', '--skip-download', '--write-sub', '--write-auto-sub', 199 | '--sub-lang', 'en', '--output', 200 | os.path.join(output_dir, '%(id)s'), 201 | video_url 202 | ] 203 | subprocess.run(cmd, check=False) 204 | # Attempt to find the subtitle file 205 | for ext_sub in possible_extensions: 206 | possible_subtitle_file = os.path.join(output_dir, f"{video_id}.{ext_sub}") 207 | if os.path.exists(possible_subtitle_file): 208 | subtitle_file = possible_subtitle_file 209 | subtitles_available = True 210 | print(f"Downloaded subtitle file: {subtitle_file}") 211 | break 212 | 213 | return subtitles_available, subtitle_file, video_id, video_title 214 | 215 | except Exception as e: 216 | print(f"Error downloading subtitles for video {video_url}: {e}") 217 | return False, None, None, None 218 | 219 | def extract_audio_from_video(video_file_path): 220 | """ 221 | Extract audio from a video file using ffmpeg and save it to a temporary file. 222 | Returns the path to the extracted audio file. 223 | """ 224 | try: 225 | temp_dir = tempfile.mkdtemp() 226 | audio_file_path = os.path.join(temp_dir, "extracted_audio.wav") 227 | cmd = [ 228 | 'ffmpeg', 229 | '-i', video_file_path, 230 | '-vn', # No video 231 | '-acodec', 'pcm_s16le', # PCM 16-bit little endian 232 | '-ar', '16000', # 16kHz 233 | '-ac', '1', # Mono 234 | audio_file_path, 235 | '-y' # Overwrite without asking 236 | ] 237 | subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True) 238 | print(f"Extracted audio to {audio_file_path}") 239 | return audio_file_path 240 | except Exception as e: 241 | print(f"Error extracting audio from {video_file_path}: {e}") 242 | return None 243 | 244 | def convert_to_mp4(input_file, output_dir): 245 | """ 246 | Convert any video or audio file to MP4 format using ffmpeg. 247 | Returns the path to the converted MP4 file. 
248 | """ 249 | try: 250 | input_path = Path(input_file) 251 | output_path = Path(output_dir) / (input_path.stem + ".mp4") 252 | if input_path.suffix.lower() != '.mp4': 253 | cmd = [ 254 | 'ffmpeg', 255 | '-i', str(input_path), 256 | '-c:v', 'libx264', 257 | '-c:a', 'aac', 258 | '-strict', 'experimental', 259 | '-b:a', '192k', 260 | '-y', # Overwrite without asking 261 | str(output_path) 262 | ] 263 | subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True) 264 | print(f"Converted {input_file} to {output_path}") 265 | return str(output_path) 266 | else: 267 | # If already mp4, just return the original path 268 | print(f"File {input_file} is already in MP4 format.") 269 | return str(input_file) 270 | except Exception as e: 271 | print(f"Error converting {input_file} to MP4: {e}") 272 | return None 273 | 274 | def extract_transcript(audio_file, whisper_model, subtitles_available=False, subtitle_file=None): 275 | """ 276 | Transcribe the audio file using faster-whisper or read subtitles. 277 | """ 278 | if subtitles_available and subtitle_file: 279 | # Read subtitles file 280 | sentences = extract_transcript_from_subtitles(subtitle_file) 281 | elif audio_file: 282 | # Transcribe using Whisper 283 | print("Using Whisper to transcribe audio.") 284 | sentences = [] 285 | try: 286 | # Reduced beam size and no VAD filter to stabilize 287 | segments, _ = whisper_model.transcribe(audio_file, vad_filter=False, beam_size=5) 288 | for segment in segments: 289 | for sentence in segment.text.split('.'): 290 | sentence = sentence.strip() 291 | if sentence: 292 | sentences.append((sentence, segment.start)) 293 | print(f"Transcription completed for {audio_file}.") 294 | except Exception as e: 295 | print(f"Error during transcription: {e}") 296 | sentences = [] 297 | else: 298 | print("No subtitles or audio file available for transcription.") 299 | sentences = [] 300 | return sentences 301 | 302 | def extract_transcript_from_subtitles(subtitle_file): 303 | """ 304 | Extract transcript from subtitles file (.srt or .vtt format). 305 | """ 306 | sentences = [] 307 | try: 308 | if subtitle_file.endswith('.srt'): 309 | subs = pysrt.open(subtitle_file) 310 | for sub in subs: 311 | text = sub.text.strip().replace('\n', ' ') 312 | start = sub.start.ordinal / 1000.0 # Convert milliseconds to seconds 313 | if text: 314 | sentences.append((text, start)) 315 | elif subtitle_file.endswith('.vtt'): 316 | subs = webvtt.read(subtitle_file) 317 | for caption in subs: 318 | text = caption.text.strip().replace('\n', ' ') 319 | start = caption.start_in_seconds 320 | if text: 321 | sentences.append((text, start)) 322 | else: 323 | print(f"Unsupported subtitle format for file: {subtitle_file}") 324 | except Exception as e: 325 | print(f"Error reading subtitles file {subtitle_file}: {e}") 326 | return sentences 327 | 328 | def query_vector_database(query, embedding_model, top_k=5): 329 | """ 330 | Query the FAISS vector database with a search query. 331 | """ 332 | index_path = os.path.join(OFFLINE_YOUTUBE_DIR, 'datasets', 'vector_index.faiss') 333 | dataset_path = os.path.join(OFFLINE_YOUTUBE_DIR, 'datasets', 'transcript_dataset.csv') 334 | 335 | if not os.path.exists(index_path): 336 | raise FileNotFoundError("Vector index not found. 
Please add videos first.") 337 | 338 | index = faiss.read_index(index_path) 339 | data = pd.read_csv(dataset_path) 340 | if 'video_id' not in data.columns: 341 | data['video_id'] = data['YouTube_link'].apply(get_video_id) 342 | data.to_csv(dataset_path, index=False) 343 | 344 | query_vector = embedding_model.encode(query).astype('float32').reshape(1, -1) 345 | distances, indices = index.search(query_vector, top_k) 346 | 347 | results = data.iloc[indices[0]].copy() 348 | results['score'] = distances[0] 349 | 350 | # Aggregate most relevant videos by video ID 351 | video_relevance = ( 352 | results.groupby('video_id') 353 | .agg( 354 | relevance=('score', 'mean'), 355 | thumbnail=('thumbnail_path', 'first'), 356 | text=('text', 'first'), 357 | original_link=('YouTube_link', 'first'), 358 | video_title=('video_title', 'first'), 359 | local_video_path=('local_video_path', 'first') 360 | ) 361 | .sort_values(by='relevance', ascending=True) 362 | .head(5) 363 | .reset_index(drop=True) 364 | ) 365 | 366 | return results, video_relevance 367 | 368 | def process_videos(video_links, uploaded_files_paths, keep_videos=False, video_quality="720p"): 369 | """ 370 | Process each YouTube video and uploaded files one by one, updating the dataset and vector database after each. 371 | """ 372 | # Initialize models within the function to avoid multi-processing issues 373 | whisper_model, embedding_model = initialize_models() 374 | 375 | # Paths for dataset and index 376 | video_titles = set() # Use a set to store unique video titles 377 | dataset_path = os.path.join(OFFLINE_YOUTUBE_DIR, 'datasets', 'transcript_dataset.csv') 378 | index_path = os.path.join(OFFLINE_YOUTUBE_DIR, 'datasets', 'vector_index.faiss') 379 | 380 | # Decide on video directory 381 | if keep_videos: 382 | video_dir = os.path.join(OFFLINE_YOUTUBE_DIR, 'videos') 383 | else: 384 | video_dir = os.path.join(OFFLINE_YOUTUBE_DIR, 'tmp') 385 | 386 | os.makedirs(video_dir, exist_ok=True) 387 | print(f"Using video directory: {video_dir}") 388 | 389 | # Load existing dataset if it exists 390 | if os.path.exists(dataset_path): 391 | data = pd.read_csv(dataset_path) 392 | if 'video_id' not in data.columns: 393 | data['video_id'] = data['YouTube_link'].apply(get_video_id) 394 | data.to_csv(dataset_path, index=False) 395 | existing_video_ids = set(data['video_id'].unique()) 396 | print(f"Loaded existing dataset with {len(existing_video_ids)} videos.") 397 | else: 398 | data = pd.DataFrame() 399 | existing_video_ids = set() 400 | print("No existing dataset found. Starting fresh.") 401 | 402 | # Load existing index if it exists 403 | if os.path.exists(index_path): 404 | try: 405 | index = faiss.read_index(index_path) 406 | print(f"Loaded existing FAISS index from {index_path}.") 407 | except Exception as e: 408 | print(f"Error loading FAISS index: {e}") 409 | index = None 410 | else: 411 | index = None 412 | print("No existing FAISS index found. A new index will be created.") 413 | 414 | # Process video links 415 | if video_links: 416 | for idx, link in enumerate(tqdm(video_links, desc="Processing Videos", unit="video")): 417 | video_id = get_video_id(link) 418 | if video_id in existing_video_ids: 419 | print(f"Video {video_id} already processed. 
Skipping.") 420 | continue # Skip already processed videos 421 | 422 | print(f"\nProcessing video {idx + 1}/{len(video_links)}: {link}") 423 | # Determine if we need to download audio-only 424 | download_audio_only = not keep_videos 425 | 426 | # Download video or audio and subtitles with selected video quality 427 | video_file, video_id, video_title, subtitles_available, subtitle_file = download_video( 428 | link, video_dir, keep_video=keep_videos, download_audio_only=download_audio_only, video_quality=video_quality 429 | ) 430 | 431 | if not subtitles_available and not video_file: 432 | print(f"Cannot process video {video_id} because neither subtitles nor audio/video are available.") 433 | continue 434 | 435 | # Transcribe audio or read subtitles 436 | print(f"Extracting transcript for video ID {video_id}...") 437 | if subtitles_available: 438 | print("Subtitles found. Using subtitles for transcript.") 439 | else: 440 | print("Subtitles not found. Using Whisper to transcribe audio.") 441 | 442 | sentences = extract_transcript(video_file, whisper_model, subtitles_available, subtitle_file) 443 | if not sentences: 444 | print(f"No transcript available for video {video_id}. Skipping.") 445 | continue 446 | thumbnail_path = download_thumbnail(video_id) 447 | 448 | new_data = [] 449 | embeddings = [] 450 | for sentence, timestamp in sentences: 451 | timestamped_link = f"https://www.youtube.com/watch?v={video_id}&t={int(timestamp)}s" 452 | local_video_path = os.path.abspath(video_file) if keep_videos and video_file else '' 453 | new_data.append({ 454 | 'video_id': video_id, 455 | 'text': sentence, 456 | 'timestamp': timestamp, 457 | 'YouTube_link': link, 458 | 'YouTube_timestamped_link': timestamped_link, 459 | 'thumbnail_path': thumbnail_path, 460 | 'video_title': video_title, 461 | 'local_video_path': local_video_path 462 | }) 463 | video_titles.add(video_title) 464 | # Encode the sentence to get embedding 465 | embedding = embedding_model.encode(sentence).astype('float32') 466 | embeddings.append(embedding) 467 | 468 | # Convert new_data to DataFrame 469 | new_data_df = pd.DataFrame(new_data) 470 | 471 | # Append new data to dataset 472 | data = pd.concat([data, new_data_df], ignore_index=True) 473 | # Save updated dataset 474 | data.to_csv(dataset_path, index=False) 475 | print(f"Updated dataset with {len(new_data_df)} new entries.") 476 | 477 | # Update the FAISS index 478 | if embeddings: 479 | embeddings = np.vstack(embeddings) 480 | dimension = embeddings.shape[1] 481 | if index is None: 482 | # Create new index 483 | index = faiss.IndexFlatL2(dimension) 484 | print(f"Created new FAISS index with dimension {dimension}.") 485 | index.add(embeddings) 486 | # Save the updated index 487 | faiss.write_index(index, index_path) 488 | print(f"Updated FAISS index with {len(embeddings)} new embeddings.") 489 | 490 | # Delete the audio/video file after processing if not keeping videos 491 | if not keep_videos and video_file and os.path.exists(video_file): 492 | os.remove(video_file) 493 | print(f"Deleted temporary video file: {video_file}") 494 | if subtitles_available and subtitle_file and os.path.exists(subtitle_file): 495 | os.remove(subtitle_file) 496 | print(f"Deleted temporary subtitle file: {subtitle_file}") 497 | 498 | # Process uploaded files 499 | if uploaded_files_paths: 500 | for idx, file_path in enumerate(tqdm(uploaded_files_paths, desc="Processing Uploaded Files", unit="file")): 501 | file_extension = os.path.splitext(file_path)[1].lower() 502 | is_video = file_extension in ['.mp4', 
'.mkv', '.avi', '.mov', '.flv', '.wmv'] 503 | is_audio = file_extension in ['.mp3', '.wav', '.aac', '.flac', '.ogg', '.m4a'] 504 | 505 | if not (is_video or is_audio): 506 | print(f"Unsupported file type for file {file_path}. Skipping.") 507 | continue 508 | 509 | video_id = os.path.splitext(os.path.basename(file_path))[0] 510 | video_title = video_id 511 | link = '' 512 | thumbnail_path = '' 513 | print(f"\nProcessing uploaded file {idx + 1}/{len(uploaded_files_paths)}: {file_path}") 514 | 515 | # Convert to MP4 if not already 516 | converted_mp4 = convert_to_mp4(file_path, os.path.join(OFFLINE_YOUTUBE_DIR, 'uploaded_files')) 517 | if not converted_mp4: 518 | print(f"Failed to convert {file_path} to MP4. Skipping.") 519 | continue 520 | 521 | # Extract audio from the converted MP4 522 | audio_file_path = extract_audio_from_video(converted_mp4) 523 | if not audio_file_path: 524 | print(f"Failed to extract audio from {converted_mp4}. Skipping.") 525 | continue 526 | 527 | # Transcribe using Whisper 528 | print(f"Transcribing uploaded file {video_id}...") 529 | sentences = extract_transcript(audio_file_path, whisper_model, subtitles_available=False, subtitle_file=None) 530 | if not sentences: 531 | print(f"No transcript available for file {video_id}. Skipping.") 532 | if os.path.exists(audio_file_path): 533 | shutil.rmtree(os.path.dirname(audio_file_path)) 534 | continue 535 | 536 | new_data = [] 537 | embeddings = [] 538 | for sentence, timestamp in sentences: 539 | timestamped_link = '' # No YouTube link for uploaded files 540 | local_video_path = os.path.abspath(converted_mp4) # Always keep uploaded files locally 541 | new_data.append({ 542 | 'video_id': video_id, 543 | 'text': sentence, 544 | 'timestamp': timestamp, 545 | 'YouTube_link': link, 546 | 'YouTube_timestamped_link': timestamped_link, 547 | 'thumbnail_path': thumbnail_path, # No thumbnail for uploaded files 548 | 'video_title': video_title, 549 | 'local_video_path': local_video_path 550 | }) 551 | video_titles.add(video_title) 552 | # Encode the sentence to get embedding 553 | embedding = embedding_model.encode(sentence).astype('float32') 554 | embeddings.append(embedding) 555 | 556 | # Convert new_data to DataFrame 557 | new_data_df = pd.DataFrame(new_data) 558 | 559 | # Append new data to dataset 560 | data = pd.concat([data, new_data_df], ignore_index=True) 561 | # Save updated dataset 562 | data.to_csv(dataset_path, index=False) 563 | print(f"Updated dataset with {len(new_data_df)} new entries from uploaded files.") 564 | 565 | # Update the FAISS index 566 | if embeddings: 567 | embeddings = np.vstack(embeddings) 568 | dimension = embeddings.shape[1] 569 | if index is None: 570 | # Create new index 571 | index = faiss.IndexFlatL2(dimension) 572 | print(f"Created new FAISS index with dimension {dimension}.") 573 | index.add(embeddings) 574 | # Save the updated index 575 | faiss.write_index(index, index_path) 576 | print(f"Updated FAISS index with {len(embeddings)} new embeddings.") 577 | 578 | # Delete the extracted audio file after processing 579 | if os.path.exists(audio_file_path): 580 | shutil.rmtree(os.path.dirname(audio_file_path)) 581 | print(f"Deleted temporary audio file directory: {os.path.dirname(audio_file_path)}") 582 | 583 | return data, video_titles 584 | 585 | def is_channel_url(url): 586 | """ 587 | Check if a URL is a YouTube channel URL. 
588 | """ 589 | return any(x in url for x in ['/channel/', '/c/', '/user/']) 590 | 591 | def get_video_links(input_text, process_channel=False): 592 | """ 593 | Get video links from a list of input links, automatically detecting playlists, channels, and individual videos. 594 | """ 595 | video_links = [] 596 | if not input_text.strip(): 597 | return video_links 598 | links = [link.strip() for link in input_text.strip().split(',') if link.strip()] 599 | for link in links: 600 | try: 601 | ydl_opts = { 602 | 'quiet': True, 603 | 'no_warnings': True, 604 | 'extract_flat': 'in_playlist', 605 | } 606 | if is_channel_url(link): 607 | if not process_channel: 608 | print(f"Channel URL detected: {link}") 609 | print("Process Channel option is not enabled. Skipping channel.") 610 | continue 611 | else: 612 | # For channels, get all videos 613 | ydl_opts['playlistend'] = None 614 | else: 615 | # For non-channels, get all videos in playlists 616 | ydl_opts['playlistend'] = None 617 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 618 | info = ydl.extract_info(link, download=False) 619 | if '_type' in info and info['_type'] == 'playlist': 620 | # It's a playlist or a channel 621 | entries = info.get('entries', []) 622 | for entry in entries: 623 | video_id = entry.get('id') 624 | if video_id: 625 | video_link = f"https://www.youtube.com/watch?v={video_id}" 626 | video_links.append(video_link) 627 | elif 'id' in info: 628 | # It's a single video 629 | video_id = info['id'] 630 | video_link = f"https://www.youtube.com/watch?v={video_id}" 631 | video_links.append(video_link) 632 | else: 633 | print(f"Unknown link type, skipped: {link}") 634 | except Exception as e: 635 | print(f"Error processing link {link}: {e}") 636 | return video_links 637 | --------------------------------------------------------------------------------