├── offlineyoutube
│   ├── lib
│   │   ├── __init__.py
│   │   └── functions.py
│   ├── __init__.py
│   ├── config.py
│   └── app.py
├── pyinstaller scripts
│   └── Apple silicon
│       ├── README.md
│       └── app.spec
├── requirements.txt
├── .gitignore
├── LICENSE
├── setup.py
├── legacy
│   └── VectorDatabaseYoutube.py
└── README.md

--------------------------------------------------------------------------------
/offlineyoutube/lib/__init__.py:
--------------------------------------------------------------------------------
1 | from .functions import *
2 | 

--------------------------------------------------------------------------------
/offlineyoutube/__init__.py:
--------------------------------------------------------------------------------
1 | # offlineyoutube/__init__.py
2 | from .app import *
3 | 
4 | 

--------------------------------------------------------------------------------
/offlineyoutube/config.py:
--------------------------------------------------------------------------------
1 | # config.py
2 | import os
3 | OFFLINE_YOUTUBE_DIR = os.path.join(os.path.expanduser('~'), 'offlineyoutube_files')
4 | 
5 | 

--------------------------------------------------------------------------------
/pyinstaller scripts/Apple silicon/README.md:
--------------------------------------------------------------------------------
1 | ## To build a binary on Apple Silicon, move `app.spec` to the repo root (`/vectorDatabaseYoutube/`) and run:
2 | 
3 | ```bash
4 | pyinstaller --clean app.spec -y
5 | ```
6 | 

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | yt-dlp
2 | pandas
3 | numpy
4 | requests
5 | faiss-cpu
6 | faster-whisper
7 | sentence-transformers
8 | gradio==3.36.1
9 | beautifulsoup4
10 | pysrt
11 | webvtt-py
12 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore build artifacts
2 | build/
3 | dist/
4 | *.egg-info/
5 | __pycache__/
6 | 
7 | # macOS files
8 | .DS_Store
9 | 
10 | # Python cache
11 | *.pyc
12 | *.pyo
13 | 
14 | # Virtual environments
15 | venv/
16 | 
17 | # Project-specific files
18 | offlineyoutube/offlineYoutubeFiles/
19 | 
20 | 

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2024 Drew Thomasson
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 

--------------------------------------------------------------------------------
/pyinstaller scripts/Apple silicon/app.spec:
--------------------------------------------------------------------------------
1 | # -*- mode: python ; coding: utf-8 -*-
2 | from PyInstaller.utils.hooks import collect_data_files
3 | 
4 | # Collect data files for Gradio and Gradio Client
5 | datas = []
6 | datas += collect_data_files('gradio')
7 | datas += collect_data_files('gradio_client')
8 | 
9 | a = Analysis(
10 |     ['app.py'],  # Your main application entry point
11 |     pathex=[],  # Add paths if necessary
12 |     binaries=[],  # Include any additional binaries if needed
13 |     datas=datas,
14 |     hiddenimports=[],  # Specify hidden imports if any
15 |     hookspath=[],  # Add hook paths if required
16 |     hooksconfig={},
17 |     runtime_hooks=[],
18 |     excludes=[],
19 |     noarchive=False,
20 |     optimize=0,  # Optimization level (0 for no optimization)
21 |     module_collection_mode={
22 |         'gradio': 'py',  # Collect Gradio as source .py files
23 |     },
24 | )
25 | 
26 | # Bundle the pure-Python modules into a PYZ archive
27 | pyz = PYZ(a.pure)
28 | 
29 | exe = EXE(
30 |     pyz,
31 |     a.scripts,
32 |     [],
33 |     exclude_binaries=True,
34 |     name='app',
35 |     debug=False,
36 |     bootloader_ignore_signals=False,
37 |     strip=False,
38 |     upx=True,
39 |     console=True,
40 |     disable_windowed_traceback=False,
41 |     argv_emulation=False,
42 |     target_arch=None,
43 |     codesign_identity=None,
44 |     entitlements_file=None,
45 | )
46 | 
47 | # Final collection step: gather the executable, binaries, and data files into a one-folder bundle
48 | coll = COLLECT(
49 |     exe,
50 |     a.binaries,
51 |     a.datas,
52 |     strip=False,
53 |     upx=True,
54 |     upx_exclude=[],
55 |     name='app',
56 | )

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import platform
3 | from setuptools import setup, find_packages
4 | 
5 | # Check for M1 Mac and Python version
6 | if platform.system() == "Darwin" and platform.processor() == "arm":
7 |     if not (sys.version_info.major == 3 and sys.version_info.minor == 10):
8 |         raise RuntimeError(
9 |             "This package requires Python 3.10 on M1 Macs. "
10 |             "Please create a Python 3.10 virtual environment and try again."
11 |         )
12 | 
13 | setup(
14 |     name="offlineyoutube",
15 |     version="2.1.9",
16 |     packages=find_packages(),
17 |     include_package_data=True,
18 |     install_requires=[
19 |         "yt-dlp",
20 |         "pandas",
21 |         "numpy",
22 |         "requests",
23 |         "faiss-cpu",
24 |         "faster-whisper",
25 |         "sentence-transformers",
26 |         "gradio==3.36.1",
27 |         "beautifulsoup4",
28 |         "pysrt",
29 |         "webvtt-py"
30 |     ],
31 |     entry_points={
32 |         "console_scripts": [
33 |             "offlineyoutube=offlineyoutube.app:main"
34 |         ]
35 |     },
36 |     python_requires=">=3.8",
37 |     author="Andrew Phillip Thomasson",
38 |     author_email="drew.thomasson100@gmail.com",
39 |     description="A YouTube video search and management tool with a Gradio interface",
40 |     long_description=open("README.md", encoding="utf-8").read(),
41 |     long_description_content_type="text/markdown",
42 |     url="https://github.com/DrewThomasson/offlineYoutube",
43 |     classifiers=[
44 |         "Programming Language :: Python :: 3",
45 |         "License :: OSI Approved :: MIT License",
46 |         "Operating System :: OS Independent",
47 |     ],
48 | )
49 | 

--------------------------------------------------------------------------------
/legacy/VectorDatabaseYoutube.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import yt_dlp
4 | import pandas as pd
5 | import numpy as np
6 | import requests
7 | import faiss
8 | from faster_whisper import WhisperModel
9 | from sentence_transformers import SentenceTransformer
10 | 
11 | # Setup directories
12 | os.makedirs('thumbnails', exist_ok=True)
13 | os.makedirs('datasets', exist_ok=True)
14 | 
15 | # Initialize models
16 | whisper_model = WhisperModel("small", device="cpu", compute_type="int8")
17 | embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
18 | 
19 | def extract_video_id_from_link(link):
20 |     video_id = re.search(r"v=([0-9A-Za-z_-]{11})", link)
21 |     return f"https://www.youtube.com/watch?v={video_id.group(1)}" if video_id else link
22 | 
23 | 
24 | # Helper function to extract YouTube video ID
25 | def get_video_id(youtube_link):
26 |     pattern = r"(?:v=|\/)([0-9A-Za-z_-]{11}).*"
27 |     match = re.search(pattern, youtube_link)
28 |     return match.group(1) if match else None
29 | 
30 | # Download thumbnail for offline use
31 | def download_thumbnail(video_id):
32 |     thumbnail_url = f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg"
33 |     thumbnail_path = f"thumbnails/{video_id}.jpg"
34 | 
35 |     if not os.path.exists(thumbnail_path):
36 |         response = requests.get(thumbnail_url, stream=True)
37 |         if response.status_code == 200:
38 |             with open(thumbnail_path, 'wb') as f:
39 |                 f.write(response.content)
40 |     return thumbnail_path
41 | 
42 | # Transcribe audio with faster-whisper
43 | def extract_transcript(video_url):
44 |     video_id = get_video_id(video_url)
45 |     print(f"Transcribing {video_id}...")
46 | 
47 |     with yt_dlp.YoutubeDL({'format': 'bestaudio'}) as ydl:
48 |         info = ydl.extract_info(video_url, download=False)
49 |         audio_url = info['url']
50 | 
51 |     segments, _ = whisper_model.transcribe(audio_url, vad_filter=True)
52 | 
53 |     sentences = []
54 |     for segment in segments:
55 |         for sentence in segment.text.split('.'):
56 |             sentence = sentence.strip()
57 |             if sentence:
58 |                 sentences.append((sentence, segment.start))
59 |     return sentences
60 | 
61 | # Process videos into a dataset
62 | def process_videos(video_links):
63 |     data = []
64 | 
65 |     for link in video_links:
66 |         video_id = get_video_id(link)
67 |         sentences = extract_transcript(link)
68 |         thumbnail_path = download_thumbnail(video_id)
69 | 
70 |         for sentence,
timestamp in sentences: 71 | data.append({ 72 | 'text': sentence, 73 | 'timestamp': timestamp, 74 | 'YouTube_link': link, 75 | 'thumbnail_path': thumbnail_path 76 | }) 77 | 78 | return pd.DataFrame(data) 79 | 80 | # Save dataset to CSV 81 | def save_dataset(data): 82 | dataset_path = 'datasets/transcript_dataset.csv' 83 | if os.path.exists(dataset_path): 84 | existing_data = pd.read_csv(dataset_path) 85 | data = pd.concat([existing_data, data], ignore_index=True) 86 | data.to_csv(dataset_path, index=False) 87 | print(f"Dataset saved to {dataset_path}") 88 | 89 | # Create a vector database using FAISS 90 | def create_vector_database(data): 91 | data['embedding'] = data['text'].apply(lambda x: embedding_model.encode(x)) 92 | 93 | dimension = len(data['embedding'].iloc[0]) 94 | index = faiss.IndexFlatL2(dimension) 95 | 96 | embeddings = np.vstack(data['embedding'].values) 97 | index.add(embeddings) 98 | 99 | # Save the FAISS index 100 | faiss.write_index(index, 'datasets/vector_index.faiss') 101 | print("Vector database created and saved.") 102 | return index 103 | 104 | # Query the vector database 105 | def query_vector_database(query, top_k=5): 106 | index = faiss.read_index('datasets/vector_index.faiss') 107 | data = pd.read_csv('datasets/transcript_dataset.csv') 108 | 109 | query_vector = embedding_model.encode(query).reshape(1, -1) 110 | distances, indices = index.search(query_vector, top_k) 111 | 112 | results = data.loc[indices[0]].copy() # Avoid SettingWithCopyWarning 113 | results['score'] = distances[0] 114 | 115 | # Extract base video link for grouping 116 | results['video_id'] = results['YouTube_link'].apply(extract_video_id_from_link) 117 | 118 | # Aggregate most relevant videos by video ID 119 | video_relevance = ( 120 | results.groupby('video_id') 121 | .agg( 122 | relevance=('score', 'mean'), # Average relevance for each video 123 | thumbnail=('thumbnail_path', 'first'), # Use the first thumbnail 124 | text=('text', 'first'), # Use the first text snippet 125 | original_link=('YouTube_link', 'first') # Use the first timestamped link 126 | ) 127 | .reset_index() 128 | .sort_values(by='relevance', ascending=True) # Sort by relevance (lower is better) 129 | .head(5) # Limit to top 5 videos 130 | ) 131 | 132 | return results[['text', 'YouTube_link', 'thumbnail_path', 'score']], video_relevance 133 | 134 | 135 | # Main function to handle video input and queries 136 | def main(): 137 | if not os.path.exists('datasets/transcript_dataset.csv'): 138 | print("No database found. 
Please add videos to create the initial database.") 139 | video_links = get_video_links() 140 | data = process_videos(video_links) 141 | save_dataset(data) 142 | create_vector_database(data) 143 | else: 144 | print("1: Add more videos\n2: Query the existing database") 145 | option = input("Select an option: ").strip() 146 | 147 | if option == '1': 148 | video_links = get_video_links() 149 | data = process_videos(video_links) 150 | save_dataset(data) 151 | create_vector_database(data) 152 | elif option == '2': 153 | query_loop() 154 | else: 155 | print("Invalid option.") 156 | 157 | def get_video_links(): 158 | print("1: Provide a playlist link\n2: Provide a list of video links") 159 | option = input("Select an option: ").strip() 160 | 161 | if option == '1': 162 | playlist_url = input("Enter YouTube playlist URL: ").strip() 163 | with yt_dlp.YoutubeDL({'extract_flat': 'in_playlist'}) as ydl: 164 | playlist_info = ydl.extract_info(playlist_url, download=False) 165 | video_links = [entry['url'] for entry in playlist_info['entries']] 166 | elif option == '2': 167 | video_links = input("Enter YouTube video links (comma-separated): ").strip().split(',') 168 | else: 169 | print("Invalid option.") 170 | return [] 171 | 172 | return video_links 173 | 174 | def query_loop(): 175 | while True: 176 | query = input("Enter your search query (or 'exit' to quit): ").strip() 177 | if query.lower() == 'exit': 178 | break 179 | 180 | results, top_videos = query_vector_database(query) 181 | 182 | # Print detailed results for each text entry 183 | print("\nDetailed Results:\n") 184 | for _, row in results.iterrows(): 185 | print(f"Text: {row['text']}") 186 | print(f"Link: {row['YouTube_link']}") 187 | print(f"Thumbnail: {row['thumbnail_path']}") 188 | print(f"Score: {row['score']:.4f}\n") 189 | 190 | # Print top-ranked videos based on relevance 191 | print("\nTop Relevant Videos:\n") 192 | for idx, row in top_videos.iterrows(): 193 | print(f"Rank {idx + 1}:") 194 | print(f"Relevance Score: {row['relevance']:.4f}") 195 | print(f"Video Link: {row['original_link']}") 196 | print(f"Thumbnail: {row['thumbnail']}") 197 | print(f"Example Text: {row['text']}\n") 198 | 199 | # Run the application 200 | if __name__ == "__main__": 201 | main() 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # **Offline YouTube Video Search Application** 2 | 3 | This application allows users to **extract transcripts from YouTube videos**, **upload their own video/audio files**, **create searchable vector databases**, and **perform semantic searches** using a **Gradio web interface** or **command-line interface (CLI)**. It's powered by `faster-whisper` for transcription, `FAISS` for vector search, and `sentence-transformers` for text embeddings. 4 | 5 | --- 6 | 7 | ## **Features** 8 | 9 | - Extract transcripts from individual videos, playlists, and entire channels. 10 | - **Upload your own video or audio files for processing.** 11 | - Automatically detect playlists, channels, and individual video links. 12 | - Automatically download video thumbnails. 13 | - Store transcripts and create a searchable vector database. 14 | - Perform semantic searches on video content. 15 | - Supports **Gradio web interface** and **CLI** for flexible usage. 16 | - Easily add more videos or your own files to the dataset. 
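
Under the hood, the searchable database behind these features is simply sentence embeddings stored in a FAISS index. The snippet below is a simplified, self-contained sketch of that idea, not the app's exact code; the sample sentences are invented stand-ins for real video transcripts:

```python
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

# Invented stand-ins for transcribed video sentences
sentences = [
    "Today we cover the basics of neural networks.",
    "Next, knead the dough for about ten minutes.",
]
embeddings = np.vstack([model.encode(s) for s in sentences]).astype('float32')

# Exact L2 index, the same index type the app builds over all transcripts
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Encode a search query and retrieve the closest transcript sentence
query = model.encode("machine learning tutorial").astype('float32').reshape(1, -1)
distances, ids = index.search(query, 1)
print(sentences[ids[0][0]])  # prints the neural-network sentence
```

Lower L2 distance means a closer semantic match, which is why the app sorts results ascending by score.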
17 | 
18 | ---
19 | 
20 | ## **Web Interface**
21 | 
22 | ### **Add Videos Tab**
23 | 
24 | - **Enter playlist, channel, and/or video URLs (comma-separated).**
25 | - **Upload your own video/audio files.**
26 | - **Option to process entire channels when a channel URL is provided.**
27 | - **Option to keep videos stored locally or not.**
28 | 
29 | *(Screenshot: Add Videos tab, 2024-11-01 at 11:14 AM)*
30 | 
31 | ### **Search Tab**
32 | 
33 | - **Enter your search query to find relevant snippets.**
34 | - **View top relevant videos with thumbnails and play local videos if available.**
35 | - **View detailed results with timestamps and direct links.**
36 | 
37 | *(Screenshot: Search tab, 2024-11-01 at 11:18 AM)*
38 | *(Screenshot: Search tab, 2024-11-01 at 12:05 PM)*
39 | 
40 | ---
41 | 
42 | ## **Installation**
43 | ![PyPI Downloads](https://static.pepy.tech/badge/offlineyoutube)
44 | 
45 | Ensure you have Python >= 3.8 installed (Apple Silicon Macs require Python 3.10),
46 | then install with pip:
47 | 
48 | ```bash
49 | pip install offlineyoutube
50 | ```
51 | 
52 | ---
53 | 
54 | ## **Usage**
55 | 
56 | The app provides **two ways to interact**:
57 | 1. **Gradio Web Interface**
58 | 2. **Command-Line Interface (CLI)**
59 | 
60 | ### **1. Running the Gradio Web Interface**
61 | 
62 | Launch the web interface:
63 | 
64 | ```bash
65 | offlineyoutube ui
66 | ```
67 | 
68 | or simply:
69 | 
70 | ```bash
71 | offlineyoutube
72 | ```
73 | 
74 | Then, open the URL (usually `http://127.0.0.1:7860`) in your browser.
75 | 
76 | #### **Gradio Interface Tabs:**
77 | 
78 | - **Add Videos:**
79 |   - Enter playlist URLs, channel URLs, and/or individual video URLs (comma-separated).
80 |   - **Upload your own video or audio files for processing.**
81 |   - **Option to process entire YouTube channels when a channel URL is provided.**
82 |   - **Option to keep videos stored locally or not.**
83 |   - The app will automatically detect whether each link is a playlist, channel, or a video.
84 |   - Videos and uploaded files will be transcribed, and the database will be updated with the content.
85 | 
86 | - **Search:**
87 |   - Enter search queries to find relevant snippets from the video transcripts.
88 |   - Results are ranked based on semantic similarity and include video thumbnails.
89 |   - **If local videos are available, you can play them directly in the interface.**
90 | 
91 | ---
92 | 
93 | ### **2. Command-Line Interface (CLI)**
94 | 
95 | The CLI provides more flexibility for programmatic use.
96 | 
97 | #### **Commands Overview**
98 | 
99 | Use the `--help` flag to view available commands and examples:
100 | 
101 | ```bash
102 | offlineyoutube --help
103 | ```
104 | 
105 | **Output:**
106 | 
107 | ```
108 | usage: offlineyoutube [-h] {add,search,ui} ...
109 | 110 | YouTube Video Search Application 111 | 112 | positional arguments: 113 | {add,search,ui} Available commands 114 | add Add videos to the database 115 | search Search the video database 116 | ui Run the Gradio web interface 117 | 118 | optional arguments: 119 | -h, --help Show this help message and exit 120 | 121 | Examples: 122 | # Add videos from a playlist and keep videos locally 123 | offlineyoutube add --input "https://www.youtube.com/playlist?list=YOUR_PLAYLIST_ID" --keep_videos 124 | 125 | # Add specific videos without keeping videos locally 126 | offlineyoutube add --input "https://www.youtube.com/watch?v=VIDEO_ID1,https://www.youtube.com/watch?v=VIDEO_ID2" 127 | 128 | # Add videos from a channel (process entire channel) 129 | offlineyoutube add --input "https://www.youtube.com/channel/CHANNEL_ID" --process_channel 130 | 131 | # Search the database with a query 132 | offlineyoutube search --query "Your search query" --top_k 5 133 | 134 | # Run the Gradio web interface 135 | offlineyoutube ui 136 | ``` 137 | 138 | --- 139 | 140 | ### **Examples of CLI Usage** 141 | 142 | #### **1. Adding Videos** 143 | 144 | - **Add Playlists and Videos:** 145 | 146 | ```bash 147 | offlineyoutube add --input "https://www.youtube.com/playlist?list=YOUR_PLAYLIST_ID,https://www.youtube.com/watch?v=VIDEO_ID" 148 | ``` 149 | 150 | - **Add Specific Videos Without Keeping Them Locally:** 151 | 152 | ```bash 153 | offlineyoutube add --input "https://www.youtube.com/watch?v=dQw4w9WgXcQ,https://www.youtube.com/watch?v=9bZkp7q19f0" 154 | ``` 155 | 156 | - **Add Videos from a Channel (Process Entire Channel):** 157 | 158 | ```bash 159 | offlineyoutube add --input "https://www.youtube.com/channel/CHANNEL_ID" --process_channel 160 | ``` 161 | 162 | - **Add Videos and Keep Videos Stored Locally:** 163 | 164 | ```bash 165 | offlineyoutube add --input "https://www.youtube.com/watch?v=VIDEO_ID" --keep_videos 166 | ``` 167 | 168 | #### **2. Searching the Database** 169 | 170 | - **Perform a Search:** 171 | 172 | ```bash 173 | offlineyoutube search --query "machine learning tutorials" --top_k 5 174 | ``` 175 | 176 | --- 177 | 178 | ### **How It Works** 179 | 180 | 1. **Adding Videos and Uploaded Files:** 181 | - The app accepts a list of links and automatically detects whether each link is a playlist, channel, or an individual video. 182 | - **You can upload your own video or audio files for processing.** 183 | - It downloads video audio (or uses uploaded files) and transcribes it using `faster-whisper`. 184 | - Thumbnails are downloaded and saved locally. 185 | - The transcript data is saved in `datasets/transcript_dataset.csv`. 186 | - A vector database is updated using FAISS with embeddings generated by `sentence-transformers`. 187 | 188 | 2. **Incremental Updating:** 189 | - Videos and uploaded files are processed one by one, and the dataset and vector database are updated incrementally. 190 | - This ensures efficient processing, especially when dealing with large datasets. 191 | 192 | 3. **Searching the Database:** 193 | - When a query is entered, the app computes its embedding and searches the FAISS index for relevant video snippets. 194 | - The top results are displayed with thumbnails, titles, and links to the videos. 195 | - **If local videos are available, you can play them directly in the interface.** 196 | 197 | --- 198 | 199 | ### **FAQ** 200 | 201 | #### **1. 
How do I add multiple playlists, channels, and videos at once?**
202 | 
203 | Simply provide a comma-separated list of URLs, and the app will automatically detect and process each link:
204 | 
205 | ```bash
206 | offlineyoutube add --input "https://www.youtube.com/playlist?list=PLAYLIST_ID1,https://www.youtube.com/watch?v=VIDEO_ID,https://www.youtube.com/channel/CHANNEL_ID"
207 | ```
208 | 
209 | If you want to process entire channels, make sure to include the `--process_channel` flag:
210 | 
211 | ```bash
212 | offlineyoutube add --input "https://www.youtube.com/channel/CHANNEL_ID" --process_channel
213 | ```
214 | 
215 | #### **2. How can I upload my own video or audio files for processing?**
216 | 
217 | In the Gradio web interface, navigate to the **Add Videos** tab. Use the **"Upload your own video/audio files"** option to upload one or multiple files. The app will process these files and add them to the database.
218 | 
219 | #### **3. Why aren’t new videos or uploaded files showing up in search results?**
220 | 
221 | Ensure that the videos or files have been fully processed and that the vector database has been updated. The app handles this automatically, but processing may take time for large videos, playlists, or channels.
222 | 
223 | #### **4. How do I prevent videos from being stored locally?**
224 | 
225 | In the CLI, videos are not kept unless you pass the `--keep_videos` flag. It is an on/off switch that takes no value, so to avoid storing videos, simply omit it:
226 | 
227 | ```bash
228 | offlineyoutube add --input "VIDEO_OR_PLAYLIST_URL"
229 | ```
230 | 
231 | In the Gradio interface, where videos are kept by default, uncheck the **"Keep videos stored locally"** option in the **Add Videos** tab.
232 | 
233 | #### **5. Can I process entire YouTube channels?**
234 | 
235 | Yes! Use the `--process_channel` flag when adding videos via the CLI:
236 | 
237 | ```bash
238 | offlineyoutube add --input "https://www.youtube.com/channel/CHANNEL_ID" --process_channel
239 | ```
240 | 
241 | In the Gradio interface, check the **"Process entire channel when a channel URL is provided"** option in the **Add Videos** tab.
242 | 
243 | #### **6. Can I search the database without launching the Gradio interface?**
244 | 
245 | Yes! Use the `search` command via the CLI:
246 | 
247 | ```bash
248 | offlineyoutube search --query "Your query" --top_k 5
249 | ```
250 | 
251 | ---
252 | 
253 | ### **Project Structure**
254 | (The `datasets/`, `thumbnails/`, `videos/`, `tmp/`, and `uploaded_files/` folders below are created at runtime under `~/offlineyoutube_files`.)
255 | ```
256 | .
257 | ├── app.py                      # Main application script (Gradio + CLI)
258 | ├── functions.py                # Helper functions for transcription, FAISS, etc.
259 | ├── datasets/
260 | │   ├── transcript_dataset.csv  # CSV file storing transcripts
261 | │   └── vector_index.faiss      # FAISS vector index
262 | ├── thumbnails/                 # Folder for storing video thumbnails
263 | ├── videos/                     # Folder for storing downloaded videos (if keep_videos is True)
264 | ├── tmp/                        # Temporary folder for videos (if keep_videos is False)
265 | ├── uploaded_files/             # Folder for storing uploaded files
266 | ```
267 | 
268 | ---
269 | 
270 | ### **Known Limitations**
271 | 
272 | - **Processing Time:** Transcribing videos and generating embeddings can be time-consuming, especially for long videos, large playlists, or channels.
273 | - **Storage Requirements:** Keeping videos stored locally will require additional disk space. Omit the `--keep_videos` flag (or uncheck the option in the web interface) if storage is a concern.
274 | - **Large Datasets:** As the dataset grows, querying may take longer. Consider optimizing the FAISS index for very large datasets; a sketch follows below.
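
To make that last point concrete: the app builds an exact `faiss.IndexFlatL2`, and for very large datasets an approximate inverted-file index (`IndexIVFFlat`) answers queries much faster at a small cost in recall. The sketch below is illustrative only and is not wired into the app; the vector count, `nlist`, and `nprobe` values are made-up assumptions you would tune for your own data:

```python
import numpy as np
import faiss

d = 384        # embedding size of all-MiniLM-L6-v2
nlist = 1024   # number of IVF clusters; a rough rule of thumb is sqrt(num_vectors)

# Stand-in for real transcript embeddings (substitute your own float32 matrix)
embeddings = np.random.rand(100_000, d).astype('float32')

quantizer = faiss.IndexFlatL2(d)                 # coarse quantizer over cluster centroids
index = faiss.IndexIVFFlat(quantizer, d, nlist)  # inverted-file index

index.train(embeddings)  # IVF indexes must be trained before vectors are added
index.add(embeddings)

index.nprobe = 16        # clusters searched per query: higher = better recall, slower
query = np.random.rand(1, d).astype('float32')
distances, ids = index.search(query, 5)          # top-5 nearest transcript snippets
```

A trained IVF index can be persisted with `faiss.write_index`, exactly like the flat index the app already writes.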
275 | 
276 | ---
277 | 
278 | ### **Contributing**
279 | 
280 | Feel free to fork the repository, open issues, or submit pull requests if you'd like to contribute to this project.
281 | 
282 | ---
283 | 
284 | ### **License**
285 | 
286 | This project is licensed under the MIT License. See the LICENSE file for details.
287 | 
288 | ---
289 | 
290 | ### **Acknowledgments**
291 | 
292 | - **faster-whisper** for fast transcription.
293 | - **FAISS** for efficient vector search.
294 | - **Gradio** for the interactive web interface.
295 | - **yt-dlp** for downloading video content.
296 | 
297 | ---
298 | 

--------------------------------------------------------------------------------
/offlineyoutube/app.py:
--------------------------------------------------------------------------------
1 | # app.py
2 | 
3 | import os
4 | import sys
5 | sys.path.append(os.path.dirname(__file__))  # Make the package's lib/ and config modules importable
6 | import multiprocessing
7 | import shutil
8 | import gradio as gr
9 | import argparse
10 | import pandas as pd
11 | from lib.functions import (
12 |     initialize_models, setup_directories, process_videos,
13 |     query_vector_database, get_video_links
14 | )
15 | from config import OFFLINE_YOUTUBE_DIR  # Base directory for all app data (~/offlineyoutube_files)
16 | 
17 | def add_videos_interface(input_text, uploaded_files, process_channel, keep_videos, video_quality):
18 |     """
19 |     Interface function for adding videos to the database.
20 |     """
21 |     # Initialize models within the function to avoid multi-processing issues
22 |     whisper_model, embedding_model = initialize_models()
23 | 
24 |     video_links = get_video_links(input_text, process_channel)
25 |     uploaded_files_paths = []
26 |     if uploaded_files:
27 |         uploaded_files_dir = os.path.join(OFFLINE_YOUTUBE_DIR, 'uploaded_files')
28 |         os.makedirs(uploaded_files_dir, exist_ok=True)
29 |         for uploaded_file in uploaded_files:
30 |             try:
31 |                 original_filename = os.path.basename(uploaded_file.name)
32 |                 file_path = os.path.join(uploaded_files_dir, original_filename)
33 | 
34 |                 shutil.copy(uploaded_file.name, file_path)
35 | 
36 |                 if os.path.getsize(file_path) == 0:
37 |                     print(f"Uploaded file {original_filename} is empty. Skipping.")
38 |                     continue
39 |                 uploaded_files_paths.append(file_path)
40 |                 print(f"Saved uploaded file {original_filename} to {file_path} ({os.path.getsize(file_path)} bytes)")
41 |             except Exception as e:
42 |                 print(f"Error saving uploaded file {original_filename}: {e}")
43 |     if not video_links and not uploaded_files_paths:
44 |         return "No valid video links or files provided."
45 |     # Process videos and uploaded files with selected video quality
46 |     data, video_titles = process_videos(
47 |         video_links, uploaded_files_paths, keep_videos=keep_videos, video_quality=video_quality
48 |     )
49 | 
50 |     # Prepare a message with the video titles
51 |     if video_titles:
52 |         titles_message = "\n".join(f"- {title}" for title in video_titles)
53 |         return f"Videos processed and database updated.\nAdded Videos:\n{titles_message}"
54 |     else:
55 |         return "No new videos were added to the database."
56 | 
57 | def search_interface(query_text, top_k):
58 |     """
59 |     Interface function for searching the database.
60 |     """
61 |     # Initialize only the embedding model within the function
62 |     _, embedding_model = initialize_models()
63 | 
64 |     index_path = os.path.join(OFFLINE_YOUTUBE_DIR, 'datasets', 'vector_index.faiss')
65 |     dataset_path = os.path.join(OFFLINE_YOUTUBE_DIR, 'datasets', 'transcript_dataset.csv')
66 | 
67 |     if not os.path.exists(index_path):
68 |         return "No database found. Please add videos first.", None
69 |     try:
70 |         results, top_videos = query_vector_database(query_text, embedding_model, top_k=top_k)
71 |     except Exception as e:
72 |         return f"Error: {e}", None
73 | 
74 |     # Prepare top videos
75 |     top_videos_html = "<h3>Top Relevant Videos:</h3>"
76 |     for idx, row in top_videos.iterrows():
77 |         rank = idx + 1  # Since idx is now sequential
78 |         # Check if local video exists
79 |         local_video_path = row['local_video_path']
80 |         if isinstance(local_video_path, str) and local_video_path and not pd.isnull(local_video_path):
81 |             local_video_exists = os.path.exists(local_video_path)
82 |         else:
83 |             local_video_exists = False
84 |         local_video_player = ''
85 |         if local_video_exists:
86 |             # Replace backslashes with forward slashes for compatibility
87 |             local_video_url = 'file/' + local_video_path.replace("\\", "/")
88 |             local_video_player = f"""
89 |             <details>
90 |                 <summary>Show Local Video</summary>
91 |                 <video width="320" controls>
92 |                     <source src="{local_video_url}" type="video/mp4">
93 |                     Your browser does not support the video tag.
94 |                 </video>
95 |             </details>
96 |             """
97 |         top_videos_html += f"""
98 |         <div>
99 |             <h4>Rank {rank}</h4>
100 |             <img src="file/{row['thumbnail']}" alt="Thumbnail" width="240">
101 |             <p>Title: {row['video_title']}</p>
102 |             <p>Relevance Score: {row['relevance']:.4f}</p>
103 |             <p>Example Text: {row['text']}</p>
104 |             <a href="{row['original_link']}" target="_blank">Watch on YouTube</a>
105 |             {local_video_player}
106 |         </div>
107 |         <hr>
108 |         """
109 | 
110 |     # Prepare detailed results
111 |     detailed_html = "<h3>Detailed Results:</h3>"
112 |     for _, row in results.iterrows():
113 |         # Check if local video exists
114 |         local_video_path = row['local_video_path']
115 |         if isinstance(local_video_path, str) and local_video_path and not pd.isnull(local_video_path):
116 |             local_video_exists = os.path.exists(local_video_path)
117 |         else:
118 |             local_video_exists = False
119 |         local_video_player = ''
120 |         if local_video_exists:
121 |             # Replace backslashes with forward slashes for compatibility
122 |             local_video_url = 'file/' + local_video_path.replace("\\", "/")
123 |             timestamp = int(row['timestamp'])
124 |             local_video_player = f"""
125 |             <details>
126 |                 <summary>Show Local Video at Timestamp</summary>
127 |                 <video width="320" controls>
128 |                     <source src="{local_video_url}#t={timestamp}" type="video/mp4">
129 |                     Your browser does not support the video tag.
130 |                 </video>
131 |             </details>
132 |             """
133 |         detailed_html += f"""
134 |         <div>
135 |             <img src="file/{row['thumbnail_path']}" alt="Thumbnail" width="240">
136 |             <p>Title: {row['video_title']}</p>
137 |             <p>Text: {row['text']}</p>
138 |             <p>Score: {row['score']:.4f}</p>
139 |             <a href="{row['YouTube_timestamped_link']}" target="_blank">Watch on YouTube at Timestamp</a>
140 |             {local_video_player}
141 |         </div>
142 |         <hr>
143 | """ 144 | return top_videos_html, detailed_html 145 | 146 | def main(): 147 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 148 | setup_directories() 149 | 150 | parser = argparse.ArgumentParser( 151 | description="YouTube Video Search Application", 152 | epilog=""" 153 | Examples: 154 | # Add videos from a playlist and keep videos locally 155 | python app.py add --input "https://www.youtube.com/playlist?list=YOUR_PLAYLIST_ID" --keep_videos 156 | 157 | # Add specific videos without keeping videos locally 158 | python app.py add --input "https://www.youtube.com/watch?v=VIDEO_ID1,https://www.youtube.com/watch?v=VIDEO_ID2" 159 | 160 | # Search the database with a query 161 | python app.py search --query "Your search query" --top_k 5 162 | 163 | # Run the Gradio web interface 164 | python app.py ui 165 | """, 166 | formatter_class=argparse.RawDescriptionHelpFormatter 167 | ) 168 | 169 | subparsers = parser.add_subparsers(dest='command') 170 | 171 | # Add videos command 172 | parser_add = subparsers.add_parser('add', help='Add videos to the database') 173 | parser_add.add_argument('--input', required=True, help='Playlist URL or comma-separated video URLs') 174 | parser_add.add_argument('--process_channel', action='store_true', help='Process entire channel when a channel URL is provided') 175 | parser_add.add_argument('--keep_videos', action='store_true', help='Keep videos stored locally') 176 | 177 | # Search command 178 | parser_search = subparsers.add_parser('search', help='Search the video database') 179 | parser_search.add_argument('--query', required=True, help='Search query') 180 | parser_search.add_argument('--top_k', type=int, default=5, help='Number of results to return') 181 | 182 | # Run Gradio interface 183 | parser_ui = subparsers.add_parser('ui', help='Run the Gradio web interface') 184 | 185 | args = parser.parse_args() 186 | 187 | if args.command == 'add': 188 | # For CLI, use the default video quality of 720p 189 | default_video_quality = "720p" 190 | status = add_videos_interface(args.input, [], args.process_channel, args.keep_videos, default_video_quality) 191 | print(status) 192 | 193 | elif args.command == 'search': 194 | top_videos_html, detailed_results = search_interface(args.query, args.top_k) 195 | if isinstance(top_videos_html, str): 196 | print(top_videos_html) 197 | else: 198 | # Extract data from HTML for console output 199 | from bs4 import BeautifulSoup 200 | 201 | # Extract top videos 202 | soup = BeautifulSoup(top_videos_html, 'html.parser') 203 | print("Top Relevant Videos:\n") 204 | for idx, div in enumerate(soup.find_all('div')): 205 | rank = div.find('h4').text 206 | title = div.find('p', text=lambda t: t and 'Title:' in t).text 207 | relevance = div.find('p', text=lambda t: t and 'Relevance Score:' in t).text 208 | example_text = div.find('p', text=lambda t: t and 'Example Text:' in t).text 209 | link = div.find('a')['href'] 210 | print(f"{rank}\n{title}\n{relevance}\n{example_text}\nLink: {link}\n") 211 | 212 | # Extract detailed results 213 | soup = BeautifulSoup(detailed_results, 'html.parser') 214 | print("Detailed Results:\n") 215 | for div in soup.find_all('div'): 216 | title = div.find('p', text=lambda t: t and 'Title:' in t).text 217 | text = div.find('p', text=lambda t: t and 'Text:' in t).text 218 | score = div.find('p', text=lambda t: t and 'Score:' in t).text 219 | link = div.find('a')['href'] 220 | print(f"{title}\n{score}\n{text}\nLink: {link}\n") 221 | 222 | else: 223 | # Run Gradio interface if no command is provided or 'ui' command 
is used 224 | with gr.Blocks(theme=gr.themes.Soft()) as demo: 225 | gr.Markdown("# 🎥 YouTube Video Search Application") 226 | 227 | with gr.Tab("Add Videos"): 228 | gr.Markdown("### Add videos to the database") 229 | input_text = gr.Textbox(lines=2, placeholder="Enter playlist, channel, and/or video URLs (comma-separated)") 230 | process_channel = gr.Checkbox(label="Process entire channel when a channel URL is provided", value=False) 231 | keep_videos = gr.Checkbox(label="Keep videos stored locally", value=True) 232 | video_quality = gr.Dropdown( 233 | label="Select Video Quality", 234 | choices=["144p", "240p", "360p", "480p", "720p", "1080p"], 235 | value="720p", 236 | info="Choose the desired video quality for downloads." 237 | ) 238 | file_upload = gr.File(label="Upload your own video/audio files", file_count="multiple", type="file") 239 | add_button = gr.Button("Add Videos") 240 | add_output = gr.Textbox(label="Status") 241 | add_button.click( 242 | add_videos_interface, 243 | inputs=[input_text, file_upload, process_channel, keep_videos, video_quality], 244 | outputs=add_output 245 | ) 246 | 247 | with gr.Tab("Search"): 248 | gr.Markdown("### Search the video database") 249 | query_text = gr.Textbox(lines=1, placeholder="Enter your search query") 250 | top_k = gr.Slider(1, 20, value=5, step=1, label="Number of Results") 251 | search_button = gr.Button("Search") 252 | top_video_results = gr.HTML() 253 | detailed_results = gr.HTML() 254 | search_button.click( 255 | search_interface, 256 | inputs=[query_text, top_k], 257 | outputs=[top_video_results, detailed_results] 258 | ) 259 | 260 | demo.launch() 261 | 262 | if __name__ == "__main__": 263 | # Fix for multiprocessing in PyInstaller 264 | multiprocessing.freeze_support() 265 | 266 | # Ensure set_start_method is only set once 267 | try: 268 | multiprocessing.set_start_method('spawn', force=True) 269 | except RuntimeError: 270 | pass 271 | 272 | main() 273 | -------------------------------------------------------------------------------- /offlineyoutube/lib/functions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import yt_dlp 4 | import pandas as pd 5 | import numpy as np 6 | import requests 7 | import faiss 8 | import shutil 9 | from faster_whisper import WhisperModel 10 | from sentence_transformers import SentenceTransformer 11 | from tqdm import tqdm 12 | import pysrt 13 | import subprocess 14 | import webvtt 15 | import tempfile 16 | from pathlib import Path 17 | from config import OFFLINE_YOUTUBE_DIR # Ensure this path is correct 18 | 19 | def initialize_models(whisper_model_size='tiny', device='cpu', compute_type='int8', embedding_model_name='all-MiniLM-L6-v2'): 20 | """ 21 | Initialize the Whisper and embedding models. 
22 | """ 23 | try: 24 | whisper_model = WhisperModel(whisper_model_size, device=device, compute_type=compute_type) 25 | print(f"Initialized WhisperModel with size='{whisper_model_size}', device='{device}', compute_type='{compute_type}'.") 26 | except Exception as e: 27 | print(f"Error initializing WhisperModel: {e}") 28 | raise e 29 | 30 | try: 31 | embedding_model = SentenceTransformer(embedding_model_name) 32 | print(f"Initialized SentenceTransformer with model='{embedding_model_name}'.") 33 | except Exception as e: 34 | print(f"Error initializing SentenceTransformer: {e}") 35 | raise e 36 | 37 | return whisper_model, embedding_model 38 | 39 | def setup_directories(): 40 | """ 41 | Create necessary directories for storing thumbnails and datasets within the base directory. 42 | """ 43 | directories = [ 44 | 'thumbnails', 45 | 'datasets', 46 | 'tmp', 47 | 'videos', 48 | 'uploaded_files' 49 | ] 50 | for directory in directories: 51 | path = os.path.join(OFFLINE_YOUTUBE_DIR, directory) 52 | os.makedirs(path, exist_ok=True) 53 | print(f"Ensured directory exists: {path}") 54 | 55 | def extract_video_id_from_link(link): 56 | """ 57 | Extract YouTube video ID from a link. 58 | """ 59 | video_id = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", link) 60 | return video_id.group(1) if video_id else None 61 | 62 | def get_video_id(youtube_link): 63 | """ 64 | Get the video ID from a YouTube link. 65 | """ 66 | pattern = r"(?:v=|\/)([0-9A-Za-z_-]{11}).*" 67 | match = re.search(pattern, youtube_link) 68 | return match.group(1) if match else None 69 | 70 | def download_thumbnail(video_id): 71 | """ 72 | Download the thumbnail image for a YouTube video. 73 | """ 74 | thumbnail_url = f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg" 75 | thumbnail_path = os.path.join(OFFLINE_YOUTUBE_DIR, 'thumbnails', f"{video_id}.jpg") 76 | 77 | if not os.path.exists(thumbnail_path): 78 | try: 79 | response = requests.get(thumbnail_url, stream=True) 80 | if response.status_code == 200: 81 | with open(thumbnail_path, 'wb') as f: 82 | shutil.copyfileobj(response.raw, f) 83 | print(f"Downloaded thumbnail for video ID {video_id} to {thumbnail_path}.") 84 | else: 85 | print(f"Failed to download thumbnail for video ID {video_id}. Status code: {response.status_code}") 86 | except Exception as e: 87 | print(f"Error downloading thumbnail for video ID {video_id}: {e}") 88 | else: 89 | print(f"Thumbnail already exists for video ID {video_id} at {thumbnail_path}.") 90 | return thumbnail_path 91 | 92 | def download_video(video_url, output_dir, keep_video=True, download_audio_only=False, video_quality="720p"): 93 | """ 94 | Download video or audio to a specified directory, attempt to download subtitles. 
95 | """ 96 | # First, attempt to download subtitles only 97 | subtitles_available, subtitle_file, video_id, video_title = download_subtitles(video_url, output_dir) 98 | 99 | # Define video quality mapping 100 | quality_mapping = { 101 | "144p": "bestvideo[height<=144][ext=mp4]+bestaudio[ext=m4a]/mp4", 102 | "240p": "bestvideo[height<=240][ext=mp4]+bestaudio[ext=m4a]/mp4", 103 | "360p": "bestvideo[height<=360][ext=mp4]+bestaudio[ext=m4a]/mp4", 104 | "480p": "bestvideo[height<=480][ext=mp4]+bestaudio[ext=m4a]/mp4", 105 | "720p": "bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]/mp4", 106 | "1080p": "bestvideo[height<=1080][ext=mp4]+bestaudio[ext=m4a]/mp4", 107 | } 108 | 109 | selected_format = quality_mapping.get(video_quality, "bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]/mp4") 110 | print(f"Selected format for download: {selected_format}") 111 | 112 | # Decide whether to download video or audio based on subtitles availability and user preference 113 | if keep_video: 114 | # Need to download the video with selected quality 115 | ydl_opts = { 116 | 'format': selected_format, 117 | 'outtmpl': os.path.join(output_dir, '%(id)s.%(ext)s'), 118 | 'quiet': True, 119 | 'no_warnings': True, 120 | 'merge_output_format': 'mp4', 121 | 'skip_download': False, 122 | } 123 | try: 124 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 125 | info_dict = ydl.extract_info(video_url, download=True) 126 | video_id = info_dict.get('id', '') 127 | video_title = info_dict.get('title', '') 128 | # Get the actual filename 129 | filename = ydl.prepare_filename(info_dict) 130 | video_file = filename 131 | print(f"Downloaded video: {video_file}") 132 | except Exception as e: 133 | print(f"Error downloading media for video {video_url}: {e}") 134 | video_file = None 135 | else: 136 | # If subtitles are available and not keeping video, we don't need to download anything 137 | if subtitles_available: 138 | print("Subtitles found. Proceeding without downloading media.") 139 | video_file = None 140 | else: 141 | # Need to download audio for transcription 142 | ydl_opts = { 143 | 'format': 'bestaudio/best', 144 | 'outtmpl': os.path.join(output_dir, '%(id)s.%(ext)s'), 145 | 'quiet': True, 146 | 'no_warnings': True, 147 | 'skip_download': False, 148 | } 149 | try: 150 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 151 | info_dict = ydl.extract_info(video_url, download=True) 152 | video_id = info_dict.get('id', '') 153 | video_title = info_dict.get('title', '') 154 | # Get the actual filename 155 | filename = ydl.prepare_filename(info_dict) 156 | video_file = filename 157 | print(f"Downloaded audio: {video_file}") 158 | except Exception as e: 159 | print(f"Error downloading audio for video {video_url}: {e}") 160 | video_file = None 161 | 162 | return video_file, video_id, video_title, subtitles_available, subtitle_file 163 | 164 | def download_subtitles(video_url, output_dir): 165 | """ 166 | Attempt to download subtitles for a video without downloading the video. 
167 | """ 168 | ydl_opts = { 169 | 'skip_download': True, 170 | 'writesubtitles': True, 171 | 'writeautomaticsub': True, 172 | 'subtitleslangs': ['en'], 173 | 'quiet': True, 174 | 'outtmpl': os.path.join(output_dir, '%(id)s'), 175 | } 176 | try: 177 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 178 | info_dict = ydl.extract_info(video_url, download=False) 179 | video_id = info_dict.get('id', '') 180 | video_title = info_dict.get('title', '') 181 | 182 | # Check for subtitle files 183 | subtitle_file = None 184 | subtitles_available = False 185 | possible_extensions = ['en.srt', 'en.vtt'] 186 | for ext_sub in possible_extensions: 187 | possible_subtitle_file = os.path.join(output_dir, f"{video_id}.{ext_sub}") 188 | if os.path.exists(possible_subtitle_file): 189 | subtitle_file = possible_subtitle_file 190 | subtitles_available = True 191 | print(f"Found subtitle file: {subtitle_file}") 192 | break 193 | 194 | # If subtitles are not available, attempt with subprocess 195 | if not subtitles_available: 196 | print("Subtitles not found. Attempting to download subtitles using alternative method.") 197 | cmd = [ 198 | 'yt-dlp', '--skip-download', '--write-sub', '--write-auto-sub', 199 | '--sub-lang', 'en', '--output', 200 | os.path.join(output_dir, '%(id)s'), 201 | video_url 202 | ] 203 | subprocess.run(cmd, check=False) 204 | # Attempt to find the subtitle file 205 | for ext_sub in possible_extensions: 206 | possible_subtitle_file = os.path.join(output_dir, f"{video_id}.{ext_sub}") 207 | if os.path.exists(possible_subtitle_file): 208 | subtitle_file = possible_subtitle_file 209 | subtitles_available = True 210 | print(f"Downloaded subtitle file: {subtitle_file}") 211 | break 212 | 213 | return subtitles_available, subtitle_file, video_id, video_title 214 | 215 | except Exception as e: 216 | print(f"Error downloading subtitles for video {video_url}: {e}") 217 | return False, None, None, None 218 | 219 | def extract_audio_from_video(video_file_path): 220 | """ 221 | Extract audio from a video file using ffmpeg and save it to a temporary file. 222 | Returns the path to the extracted audio file. 223 | """ 224 | try: 225 | temp_dir = tempfile.mkdtemp() 226 | audio_file_path = os.path.join(temp_dir, "extracted_audio.wav") 227 | cmd = [ 228 | 'ffmpeg', 229 | '-i', video_file_path, 230 | '-vn', # No video 231 | '-acodec', 'pcm_s16le', # PCM 16-bit little endian 232 | '-ar', '16000', # 16kHz 233 | '-ac', '1', # Mono 234 | audio_file_path, 235 | '-y' # Overwrite without asking 236 | ] 237 | subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True) 238 | print(f"Extracted audio to {audio_file_path}") 239 | return audio_file_path 240 | except Exception as e: 241 | print(f"Error extracting audio from {video_file_path}: {e}") 242 | return None 243 | 244 | def convert_to_mp4(input_file, output_dir): 245 | """ 246 | Convert any video or audio file to MP4 format using ffmpeg. 247 | Returns the path to the converted MP4 file. 
248 | """ 249 | try: 250 | input_path = Path(input_file) 251 | output_path = Path(output_dir) / (input_path.stem + ".mp4") 252 | if input_path.suffix.lower() != '.mp4': 253 | cmd = [ 254 | 'ffmpeg', 255 | '-i', str(input_path), 256 | '-c:v', 'libx264', 257 | '-c:a', 'aac', 258 | '-strict', 'experimental', 259 | '-b:a', '192k', 260 | '-y', # Overwrite without asking 261 | str(output_path) 262 | ] 263 | subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True) 264 | print(f"Converted {input_file} to {output_path}") 265 | return str(output_path) 266 | else: 267 | # If already mp4, just return the original path 268 | print(f"File {input_file} is already in MP4 format.") 269 | return str(input_file) 270 | except Exception as e: 271 | print(f"Error converting {input_file} to MP4: {e}") 272 | return None 273 | 274 | def extract_transcript(audio_file, whisper_model, subtitles_available=False, subtitle_file=None): 275 | """ 276 | Transcribe the audio file using faster-whisper or read subtitles. 277 | """ 278 | if subtitles_available and subtitle_file: 279 | # Read subtitles file 280 | sentences = extract_transcript_from_subtitles(subtitle_file) 281 | elif audio_file: 282 | # Transcribe using Whisper 283 | print("Using Whisper to transcribe audio.") 284 | sentences = [] 285 | try: 286 | # Reduced beam size and no VAD filter to stabilize 287 | segments, _ = whisper_model.transcribe(audio_file, vad_filter=False, beam_size=5) 288 | for segment in segments: 289 | for sentence in segment.text.split('.'): 290 | sentence = sentence.strip() 291 | if sentence: 292 | sentences.append((sentence, segment.start)) 293 | print(f"Transcription completed for {audio_file}.") 294 | except Exception as e: 295 | print(f"Error during transcription: {e}") 296 | sentences = [] 297 | else: 298 | print("No subtitles or audio file available for transcription.") 299 | sentences = [] 300 | return sentences 301 | 302 | def extract_transcript_from_subtitles(subtitle_file): 303 | """ 304 | Extract transcript from subtitles file (.srt or .vtt format). 305 | """ 306 | sentences = [] 307 | try: 308 | if subtitle_file.endswith('.srt'): 309 | subs = pysrt.open(subtitle_file) 310 | for sub in subs: 311 | text = sub.text.strip().replace('\n', ' ') 312 | start = sub.start.ordinal / 1000.0 # Convert milliseconds to seconds 313 | if text: 314 | sentences.append((text, start)) 315 | elif subtitle_file.endswith('.vtt'): 316 | subs = webvtt.read(subtitle_file) 317 | for caption in subs: 318 | text = caption.text.strip().replace('\n', ' ') 319 | start = caption.start_in_seconds 320 | if text: 321 | sentences.append((text, start)) 322 | else: 323 | print(f"Unsupported subtitle format for file: {subtitle_file}") 324 | except Exception as e: 325 | print(f"Error reading subtitles file {subtitle_file}: {e}") 326 | return sentences 327 | 328 | def query_vector_database(query, embedding_model, top_k=5): 329 | """ 330 | Query the FAISS vector database with a search query. 331 | """ 332 | index_path = os.path.join(OFFLINE_YOUTUBE_DIR, 'datasets', 'vector_index.faiss') 333 | dataset_path = os.path.join(OFFLINE_YOUTUBE_DIR, 'datasets', 'transcript_dataset.csv') 334 | 335 | if not os.path.exists(index_path): 336 | raise FileNotFoundError("Vector index not found. 
Please add videos first.") 337 | 338 | index = faiss.read_index(index_path) 339 | data = pd.read_csv(dataset_path) 340 | if 'video_id' not in data.columns: 341 | data['video_id'] = data['YouTube_link'].apply(get_video_id) 342 | data.to_csv(dataset_path, index=False) 343 | 344 | query_vector = embedding_model.encode(query).astype('float32').reshape(1, -1) 345 | distances, indices = index.search(query_vector, top_k) 346 | 347 | results = data.iloc[indices[0]].copy() 348 | results['score'] = distances[0] 349 | 350 | # Aggregate most relevant videos by video ID 351 | video_relevance = ( 352 | results.groupby('video_id') 353 | .agg( 354 | relevance=('score', 'mean'), 355 | thumbnail=('thumbnail_path', 'first'), 356 | text=('text', 'first'), 357 | original_link=('YouTube_link', 'first'), 358 | video_title=('video_title', 'first'), 359 | local_video_path=('local_video_path', 'first') 360 | ) 361 | .sort_values(by='relevance', ascending=True) 362 | .head(5) 363 | .reset_index(drop=True) 364 | ) 365 | 366 | return results, video_relevance 367 | 368 | def process_videos(video_links, uploaded_files_paths, keep_videos=False, video_quality="720p"): 369 | """ 370 | Process each YouTube video and uploaded files one by one, updating the dataset and vector database after each. 371 | """ 372 | # Initialize models within the function to avoid multi-processing issues 373 | whisper_model, embedding_model = initialize_models() 374 | 375 | # Paths for dataset and index 376 | video_titles = set() # Use a set to store unique video titles 377 | dataset_path = os.path.join(OFFLINE_YOUTUBE_DIR, 'datasets', 'transcript_dataset.csv') 378 | index_path = os.path.join(OFFLINE_YOUTUBE_DIR, 'datasets', 'vector_index.faiss') 379 | 380 | # Decide on video directory 381 | if keep_videos: 382 | video_dir = os.path.join(OFFLINE_YOUTUBE_DIR, 'videos') 383 | else: 384 | video_dir = os.path.join(OFFLINE_YOUTUBE_DIR, 'tmp') 385 | 386 | os.makedirs(video_dir, exist_ok=True) 387 | print(f"Using video directory: {video_dir}") 388 | 389 | # Load existing dataset if it exists 390 | if os.path.exists(dataset_path): 391 | data = pd.read_csv(dataset_path) 392 | if 'video_id' not in data.columns: 393 | data['video_id'] = data['YouTube_link'].apply(get_video_id) 394 | data.to_csv(dataset_path, index=False) 395 | existing_video_ids = set(data['video_id'].unique()) 396 | print(f"Loaded existing dataset with {len(existing_video_ids)} videos.") 397 | else: 398 | data = pd.DataFrame() 399 | existing_video_ids = set() 400 | print("No existing dataset found. Starting fresh.") 401 | 402 | # Load existing index if it exists 403 | if os.path.exists(index_path): 404 | try: 405 | index = faiss.read_index(index_path) 406 | print(f"Loaded existing FAISS index from {index_path}.") 407 | except Exception as e: 408 | print(f"Error loading FAISS index: {e}") 409 | index = None 410 | else: 411 | index = None 412 | print("No existing FAISS index found. A new index will be created.") 413 | 414 | # Process video links 415 | if video_links: 416 | for idx, link in enumerate(tqdm(video_links, desc="Processing Videos", unit="video")): 417 | video_id = get_video_id(link) 418 | if video_id in existing_video_ids: 419 | print(f"Video {video_id} already processed. 
Skipping.") 420 | continue # Skip already processed videos 421 | 422 | print(f"\nProcessing video {idx + 1}/{len(video_links)}: {link}") 423 | # Determine if we need to download audio-only 424 | download_audio_only = not keep_videos 425 | 426 | # Download video or audio and subtitles with selected video quality 427 | video_file, video_id, video_title, subtitles_available, subtitle_file = download_video( 428 | link, video_dir, keep_video=keep_videos, download_audio_only=download_audio_only, video_quality=video_quality 429 | ) 430 | 431 | if not subtitles_available and not video_file: 432 | print(f"Cannot process video {video_id} because neither subtitles nor audio/video are available.") 433 | continue 434 | 435 | # Transcribe audio or read subtitles 436 | print(f"Extracting transcript for video ID {video_id}...") 437 | if subtitles_available: 438 | print("Subtitles found. Using subtitles for transcript.") 439 | else: 440 | print("Subtitles not found. Using Whisper to transcribe audio.") 441 | 442 | sentences = extract_transcript(video_file, whisper_model, subtitles_available, subtitle_file) 443 | if not sentences: 444 | print(f"No transcript available for video {video_id}. Skipping.") 445 | continue 446 | thumbnail_path = download_thumbnail(video_id) 447 | 448 | new_data = [] 449 | embeddings = [] 450 | for sentence, timestamp in sentences: 451 | timestamped_link = f"https://www.youtube.com/watch?v={video_id}&t={int(timestamp)}s" 452 | local_video_path = os.path.abspath(video_file) if keep_videos and video_file else '' 453 | new_data.append({ 454 | 'video_id': video_id, 455 | 'text': sentence, 456 | 'timestamp': timestamp, 457 | 'YouTube_link': link, 458 | 'YouTube_timestamped_link': timestamped_link, 459 | 'thumbnail_path': thumbnail_path, 460 | 'video_title': video_title, 461 | 'local_video_path': local_video_path 462 | }) 463 | video_titles.add(video_title) 464 | # Encode the sentence to get embedding 465 | embedding = embedding_model.encode(sentence).astype('float32') 466 | embeddings.append(embedding) 467 | 468 | # Convert new_data to DataFrame 469 | new_data_df = pd.DataFrame(new_data) 470 | 471 | # Append new data to dataset 472 | data = pd.concat([data, new_data_df], ignore_index=True) 473 | # Save updated dataset 474 | data.to_csv(dataset_path, index=False) 475 | print(f"Updated dataset with {len(new_data_df)} new entries.") 476 | 477 | # Update the FAISS index 478 | if embeddings: 479 | embeddings = np.vstack(embeddings) 480 | dimension = embeddings.shape[1] 481 | if index is None: 482 | # Create new index 483 | index = faiss.IndexFlatL2(dimension) 484 | print(f"Created new FAISS index with dimension {dimension}.") 485 | index.add(embeddings) 486 | # Save the updated index 487 | faiss.write_index(index, index_path) 488 | print(f"Updated FAISS index with {len(embeddings)} new embeddings.") 489 | 490 | # Delete the audio/video file after processing if not keeping videos 491 | if not keep_videos and video_file and os.path.exists(video_file): 492 | os.remove(video_file) 493 | print(f"Deleted temporary video file: {video_file}") 494 | if subtitles_available and subtitle_file and os.path.exists(subtitle_file): 495 | os.remove(subtitle_file) 496 | print(f"Deleted temporary subtitle file: {subtitle_file}") 497 | 498 | # Process uploaded files 499 | if uploaded_files_paths: 500 | for idx, file_path in enumerate(tqdm(uploaded_files_paths, desc="Processing Uploaded Files", unit="file")): 501 | file_extension = os.path.splitext(file_path)[1].lower() 502 | is_video = file_extension in ['.mp4', 
'.mkv', '.avi', '.mov', '.flv', '.wmv'] 503 | is_audio = file_extension in ['.mp3', '.wav', '.aac', '.flac', '.ogg', '.m4a'] 504 | 505 | if not (is_video or is_audio): 506 | print(f"Unsupported file type for file {file_path}. Skipping.") 507 | continue 508 | 509 | video_id = os.path.splitext(os.path.basename(file_path))[0] 510 | video_title = video_id 511 | link = '' 512 | thumbnail_path = '' 513 | print(f"\nProcessing uploaded file {idx + 1}/{len(uploaded_files_paths)}: {file_path}") 514 | 515 | # Convert to MP4 if not already 516 | converted_mp4 = convert_to_mp4(file_path, os.path.join(OFFLINE_YOUTUBE_DIR, 'uploaded_files')) 517 | if not converted_mp4: 518 | print(f"Failed to convert {file_path} to MP4. Skipping.") 519 | continue 520 | 521 | # Extract audio from the converted MP4 522 | audio_file_path = extract_audio_from_video(converted_mp4) 523 | if not audio_file_path: 524 | print(f"Failed to extract audio from {converted_mp4}. Skipping.") 525 | continue 526 | 527 | # Transcribe using Whisper 528 | print(f"Transcribing uploaded file {video_id}...") 529 | sentences = extract_transcript(audio_file_path, whisper_model, subtitles_available=False, subtitle_file=None) 530 | if not sentences: 531 | print(f"No transcript available for file {video_id}. Skipping.") 532 | if os.path.exists(audio_file_path): 533 | shutil.rmtree(os.path.dirname(audio_file_path)) 534 | continue 535 | 536 | new_data = [] 537 | embeddings = [] 538 | for sentence, timestamp in sentences: 539 | timestamped_link = '' # No YouTube link for uploaded files 540 | local_video_path = os.path.abspath(converted_mp4) # Always keep uploaded files locally 541 | new_data.append({ 542 | 'video_id': video_id, 543 | 'text': sentence, 544 | 'timestamp': timestamp, 545 | 'YouTube_link': link, 546 | 'YouTube_timestamped_link': timestamped_link, 547 | 'thumbnail_path': thumbnail_path, # No thumbnail for uploaded files 548 | 'video_title': video_title, 549 | 'local_video_path': local_video_path 550 | }) 551 | video_titles.add(video_title) 552 | # Encode the sentence to get embedding 553 | embedding = embedding_model.encode(sentence).astype('float32') 554 | embeddings.append(embedding) 555 | 556 | # Convert new_data to DataFrame 557 | new_data_df = pd.DataFrame(new_data) 558 | 559 | # Append new data to dataset 560 | data = pd.concat([data, new_data_df], ignore_index=True) 561 | # Save updated dataset 562 | data.to_csv(dataset_path, index=False) 563 | print(f"Updated dataset with {len(new_data_df)} new entries from uploaded files.") 564 | 565 | # Update the FAISS index 566 | if embeddings: 567 | embeddings = np.vstack(embeddings) 568 | dimension = embeddings.shape[1] 569 | if index is None: 570 | # Create new index 571 | index = faiss.IndexFlatL2(dimension) 572 | print(f"Created new FAISS index with dimension {dimension}.") 573 | index.add(embeddings) 574 | # Save the updated index 575 | faiss.write_index(index, index_path) 576 | print(f"Updated FAISS index with {len(embeddings)} new embeddings.") 577 | 578 | # Delete the extracted audio file after processing 579 | if os.path.exists(audio_file_path): 580 | shutil.rmtree(os.path.dirname(audio_file_path)) 581 | print(f"Deleted temporary audio file directory: {os.path.dirname(audio_file_path)}") 582 | 583 | return data, video_titles 584 | 585 | def is_channel_url(url): 586 | """ 587 | Check if a URL is a YouTube channel URL. 
588 | """ 589 | return any(x in url for x in ['/channel/', '/c/', '/user/']) 590 | 591 | def get_video_links(input_text, process_channel=False): 592 | """ 593 | Get video links from a list of input links, automatically detecting playlists, channels, and individual videos. 594 | """ 595 | video_links = [] 596 | if not input_text.strip(): 597 | return video_links 598 | links = [link.strip() for link in input_text.strip().split(',') if link.strip()] 599 | for link in links: 600 | try: 601 | ydl_opts = { 602 | 'quiet': True, 603 | 'no_warnings': True, 604 | 'extract_flat': 'in_playlist', 605 | } 606 | if is_channel_url(link): 607 | if not process_channel: 608 | print(f"Channel URL detected: {link}") 609 | print("Process Channel option is not enabled. Skipping channel.") 610 | continue 611 | else: 612 | # For channels, get all videos 613 | ydl_opts['playlistend'] = None 614 | else: 615 | # For non-channels, get all videos in playlists 616 | ydl_opts['playlistend'] = None 617 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 618 | info = ydl.extract_info(link, download=False) 619 | if '_type' in info and info['_type'] == 'playlist': 620 | # It's a playlist or a channel 621 | entries = info.get('entries', []) 622 | for entry in entries: 623 | video_id = entry.get('id') 624 | if video_id: 625 | video_link = f"https://www.youtube.com/watch?v={video_id}" 626 | video_links.append(video_link) 627 | elif 'id' in info: 628 | # It's a single video 629 | video_id = info['id'] 630 | video_link = f"https://www.youtube.com/watch?v={video_id}" 631 | video_links.append(video_link) 632 | else: 633 | print(f"Unknown link type, skipped: {link}") 634 | except Exception as e: 635 | print(f"Error processing link {link}: {e}") 636 | return video_links 637 | --------------------------------------------------------------------------------