├── offlineyoutube
│   ├── lib
│   │   ├── __init__.py
│   │   ├── .DS_Store
│   │   └── functions.py
│   ├── __init__.py
│   ├── .DS_Store
│   ├── config.py
│   └── app.py
├── pyinstaller scripts
│   └── Apple silicon
│       ├── README.md
│       └── app.spec
├── requirements.txt
├── .gitignore
├── LICENSE
├── setup.py
├── legacy
│   └── VectorDatabaseYoutube.py
└── README.md
/offlineyoutube/lib/__init__.py:
--------------------------------------------------------------------------------
1 | from .functions import *
2 |
--------------------------------------------------------------------------------
/offlineyoutube/__init__.py:
--------------------------------------------------------------------------------
1 | # offline_youtube/__init__.py
2 | from .app import *
3 |
4 |
--------------------------------------------------------------------------------
/offlineyoutube/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DrewThomasson/offlineYoutube/HEAD/offlineyoutube/.DS_Store
--------------------------------------------------------------------------------
/offlineyoutube/lib/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DrewThomasson/offlineYoutube/HEAD/offlineyoutube/lib/.DS_Store
--------------------------------------------------------------------------------
/offlineyoutube/config.py:
--------------------------------------------------------------------------------
1 | # config.py
2 | import os
3 | OFFLINE_YOUTUBE_DIR = os.path.join(os.path.expanduser('~'), 'offlineyoutube_files')
4 |
5 |
--------------------------------------------------------------------------------
/pyinstaller scripts/Apple silicon/README.md:
--------------------------------------------------------------------------------
## To build as a binary on Apple Silicon, move `app.spec` to the root of the repo at `/offlineYoutube/` and run:
2 |
3 | ```bash
4 | pyinstaller --clean app.spec -y
5 | ```
6 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | yt-dlp
2 | pandas
3 | numpy
4 | requests
5 | faiss-cpu
6 | faster-whisper
7 | sentence-transformers
8 | gradio==3.36.1
# NOTE: "argparse" removed — it is part of the Python standard library; the PyPI package of the same name is an obsolete backport.
10 | beautifulsoup4
11 | pysrt
12 | webvtt-py
13 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore build artifacts
2 | build/
3 | dist/
4 | *.egg-info/
5 | __pycache__/
6 |
7 | # macOS files
8 | .DS_Store
9 |
10 | # Python cache
11 | *.pyc
12 | *.pyo
13 |
14 | # Virtual environments
15 | venv/
16 |
17 | # Project-specific files
18 | offlineyoutube/offlineYoutubeFiles/
19 |
20 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Drew Thomasson
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pyinstaller scripts/Apple silicon/app.spec:
--------------------------------------------------------------------------------
1 | # -*- mode: python ; coding: utf-8 -*-
2 | from PyInstaller.utils.hooks import collect_data_files
3 |
4 | # Collect data files for Gradio and Gradio Client
5 | datas = []
6 | datas += collect_data_files('gradio')
7 | datas += collect_data_files('gradio_client')
8 |
9 | a = Analysis(
10 | ['app.py'], # Your main application entry point
11 | pathex=[], # Add paths if necessary
12 | binaries=[], # Include any additional binaries if needed
13 | datas=datas,
14 | hiddenimports=[], # Specify hidden imports if any
15 | hookspath=[], # Add hook paths if required
16 | hooksconfig={},
17 | runtime_hooks=[],
18 | excludes=[],
19 | noarchive=False,
20 | optimize=0, # Optimization level (0 for no optimization)
21 | module_collection_mode={
22 | 'gradio': 'py', # Collect Gradio as source .py files
23 | },
24 | )
25 |
26 | # Create the executable in a single-file format
27 | pyz = PYZ(a.pure)
28 |
29 | exe = EXE(
30 | pyz,
31 | a.scripts,
32 | [],
33 | exclude_binaries=True,
34 | name='app',
35 | debug=False,
36 | bootloader_ignore_signals=False,
37 | strip=False,
38 | upx=True,
39 | console=True,
40 | disable_windowed_traceback=False,
41 | argv_emulation=False,
42 | target_arch=None,
43 | codesign_identity=None,
44 | entitlements_file=None,
45 | onefile=True, # Ensure single-file build
46 | )
47 |
48 | # Final collection step, collecting necessary files and binaries
49 | coll = COLLECT(
50 | exe,
51 | a.binaries,
52 | a.datas,
53 | strip=False,
54 | upx=True,
55 | upx_exclude=[],
56 | name='app',
57 | )
58 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import platform
3 | from setuptools import setup, find_packages
4 |
5 | # Check for M1 Mac and Python version
6 | if platform.system() == "Darwin" and platform.processor() == "arm":
7 | if not (sys.version_info.major == 3 and sys.version_info.minor == 10):
8 | raise RuntimeError(
9 | "This package requires Python 3.10 on M1 Macs. "
10 | "Please create a Python 3.10 virtual environment and try again."
11 | )
12 |
13 | setup(
14 | name="offlineyoutube",
15 | version="2.1.9",
16 | packages=find_packages(),
17 | include_package_data=True,
18 | install_requires=[
19 | "yt-dlp",
20 | "pandas",
21 | "numpy",
22 | "requests",
23 | "faiss-cpu",
24 | "faster-whisper",
25 | "sentence-transformers",
26 | "gradio==3.36.1",
27 | "argparse",
28 | "beautifulsoup4",
29 | "pysrt",
30 | "webvtt-py"
31 | ],
32 | entry_points={
33 | "console_scripts": [
34 | "offlineyoutube=offlineyoutube.app:main"
35 | ]
36 | },
37 | python_requires=">=3.8",
38 | author="Andrew Phillip Thomasson",
39 | author_email="drew.thomasson100@gmail.com",
40 | description="A YouTube video search and management tool with a Gradio interface",
41 | long_description=open("README.md").read(),
42 | long_description_content_type="text/markdown",
43 | url="https://github.com/DrewThomasson/offlineYoutube",
44 | classifiers=[
45 | "Programming Language :: Python :: 3",
46 | "License :: OSI Approved :: MIT License",
47 | "Operating System :: OS Independent",
48 | ],
49 | )
50 |
--------------------------------------------------------------------------------
/legacy/VectorDatabaseYoutube.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import yt_dlp
4 | import pandas as pd
5 | import numpy as np
6 | import requests
7 | import faiss
8 | from faster_whisper import WhisperModel
9 | from sentence_transformers import SentenceTransformer
10 |
11 | # Setup directories
12 | os.makedirs('thumbnails', exist_ok=True)
13 | os.makedirs('datasets', exist_ok=True)
14 |
15 | # Initialize models
16 | whisper_model = WhisperModel("small", device="cpu", compute_type="int8")
17 | embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
18 |
19 | def extract_video_id_from_link(link):
20 | video_id = re.search(r"v=([0-9A-Za-z_-]{11})", link)
21 | return f"https://www.youtube.com/watch?v={video_id.group(1)}" if video_id else link
22 |
23 |
24 | # Helper function to extract YouTube video ID
25 | def get_video_id(youtube_link):
26 | pattern = r"(?:v=|\/)([0-9A-Za-z_-]{11}).*"
27 | match = re.search(pattern, youtube_link)
28 | return match.group(1) if match else None
29 |
30 | # Download thumbnail for offline use
31 | def download_thumbnail(video_id):
32 | thumbnail_url = f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg"
33 | thumbnail_path = f"thumbnails/{video_id}.jpg"
34 |
35 | if not os.path.exists(thumbnail_path):
36 | response = requests.get(thumbnail_url, stream=True)
37 | if response.status_code == 200:
38 | with open(thumbnail_path, 'wb') as f:
39 | f.write(response.content)
40 | return thumbnail_path
41 |
42 | # Transcribe audio with faster-whisper
43 | def extract_transcript(video_url):
44 | video_id = get_video_id(video_url)
45 | print(f"Transcribing {video_id}...")
46 |
47 | with yt_dlp.YoutubeDL({'format': 'bestaudio'}) as ydl:
48 | info = ydl.extract_info(video_url, download=False)
49 | audio_url = info['url']
50 |
51 | segments, _ = whisper_model.transcribe(audio_url, vad_filter=True)
52 |
53 | sentences = []
54 | for segment in segments:
55 | for sentence in segment.text.split('.'):
56 | sentence = sentence.strip()
57 | if sentence:
58 | sentences.append((sentence, segment.start))
59 | return sentences
60 |
61 | # Process videos into a dataset
62 | def process_videos(video_links):
63 | data = []
64 |
65 | for link in video_links:
66 | video_id = get_video_id(link)
67 | sentences = extract_transcript(link)
68 | thumbnail_path = download_thumbnail(video_id)
69 |
70 | for sentence, timestamp in sentences:
71 | data.append({
72 | 'text': sentence,
73 | 'timestamp': timestamp,
74 | 'YouTube_link': link,
75 | 'thumbnail_path': thumbnail_path
76 | })
77 |
78 | return pd.DataFrame(data)
79 |
80 | # Save dataset to CSV
81 | def save_dataset(data):
82 | dataset_path = 'datasets/transcript_dataset.csv'
83 | if os.path.exists(dataset_path):
84 | existing_data = pd.read_csv(dataset_path)
85 | data = pd.concat([existing_data, data], ignore_index=True)
86 | data.to_csv(dataset_path, index=False)
87 | print(f"Dataset saved to {dataset_path}")
88 |
89 | # Create a vector database using FAISS
90 | def create_vector_database(data):
91 | data['embedding'] = data['text'].apply(lambda x: embedding_model.encode(x))
92 |
93 | dimension = len(data['embedding'].iloc[0])
94 | index = faiss.IndexFlatL2(dimension)
95 |
96 | embeddings = np.vstack(data['embedding'].values)
97 | index.add(embeddings)
98 |
99 | # Save the FAISS index
100 | faiss.write_index(index, 'datasets/vector_index.faiss')
101 | print("Vector database created and saved.")
102 | return index
103 |
104 | # Query the vector database
105 | def query_vector_database(query, top_k=5):
106 | index = faiss.read_index('datasets/vector_index.faiss')
107 | data = pd.read_csv('datasets/transcript_dataset.csv')
108 |
109 | query_vector = embedding_model.encode(query).reshape(1, -1)
110 | distances, indices = index.search(query_vector, top_k)
111 |
112 | results = data.loc[indices[0]].copy() # Avoid SettingWithCopyWarning
113 | results['score'] = distances[0]
114 |
115 | # Extract base video link for grouping
116 | results['video_id'] = results['YouTube_link'].apply(extract_video_id_from_link)
117 |
118 | # Aggregate most relevant videos by video ID
119 | video_relevance = (
120 | results.groupby('video_id')
121 | .agg(
122 | relevance=('score', 'mean'), # Average relevance for each video
123 | thumbnail=('thumbnail_path', 'first'), # Use the first thumbnail
124 | text=('text', 'first'), # Use the first text snippet
125 | original_link=('YouTube_link', 'first') # Use the first timestamped link
126 | )
127 | .reset_index()
128 | .sort_values(by='relevance', ascending=True) # Sort by relevance (lower is better)
129 | .head(5) # Limit to top 5 videos
130 | )
131 |
132 | return results[['text', 'YouTube_link', 'thumbnail_path', 'score']], video_relevance
133 |
134 |
135 | # Main function to handle video input and queries
136 | def main():
137 | if not os.path.exists('datasets/transcript_dataset.csv'):
138 | print("No database found. Please add videos to create the initial database.")
139 | video_links = get_video_links()
140 | data = process_videos(video_links)
141 | save_dataset(data)
142 | create_vector_database(data)
143 | else:
144 | print("1: Add more videos\n2: Query the existing database")
145 | option = input("Select an option: ").strip()
146 |
147 | if option == '1':
148 | video_links = get_video_links()
149 | data = process_videos(video_links)
150 | save_dataset(data)
151 | create_vector_database(data)
152 | elif option == '2':
153 | query_loop()
154 | else:
155 | print("Invalid option.")
156 |
157 | def get_video_links():
158 | print("1: Provide a playlist link\n2: Provide a list of video links")
159 | option = input("Select an option: ").strip()
160 |
161 | if option == '1':
162 | playlist_url = input("Enter YouTube playlist URL: ").strip()
163 | with yt_dlp.YoutubeDL({'extract_flat': 'in_playlist'}) as ydl:
164 | playlist_info = ydl.extract_info(playlist_url, download=False)
165 | video_links = [entry['url'] for entry in playlist_info['entries']]
166 | elif option == '2':
167 | video_links = input("Enter YouTube video links (comma-separated): ").strip().split(',')
168 | else:
169 | print("Invalid option.")
170 | return []
171 |
172 | return video_links
173 |
174 | def query_loop():
175 | while True:
176 | query = input("Enter your search query (or 'exit' to quit): ").strip()
177 | if query.lower() == 'exit':
178 | break
179 |
180 | results, top_videos = query_vector_database(query)
181 |
182 | # Print detailed results for each text entry
183 | print("\nDetailed Results:\n")
184 | for _, row in results.iterrows():
185 | print(f"Text: {row['text']}")
186 | print(f"Link: {row['YouTube_link']}")
187 | print(f"Thumbnail: {row['thumbnail_path']}")
188 | print(f"Score: {row['score']:.4f}\n")
189 |
190 | # Print top-ranked videos based on relevance
191 | print("\nTop Relevant Videos:\n")
192 | for idx, row in top_videos.iterrows():
193 | print(f"Rank {idx + 1}:")
194 | print(f"Relevance Score: {row['relevance']:.4f}")
195 | print(f"Video Link: {row['original_link']}")
196 | print(f"Thumbnail: {row['thumbnail']}")
197 | print(f"Example Text: {row['text']}\n")
198 |
199 | # Run the application
200 | if __name__ == "__main__":
201 | main()
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # **Offline YouTube Video Search Application**
2 |
3 | This application allows users to **extract transcripts from YouTube videos**, **upload their own video/audio files**, **create searchable vector databases**, and **perform semantic searches** using a **Gradio web interface** or **command-line interface (CLI)**. It's powered by `faster-whisper` for transcription, `FAISS` for vector search, and `sentence-transformers` for text embeddings.
4 |
5 | ---
6 |
7 | ## **Features**
8 |
9 | - Extract transcripts from individual videos, playlists, and entire channels.
10 | - **Upload your own video or audio files for processing.**
11 | - Automatically detect playlists, channels, and individual video links.
12 | - Automatically download video thumbnails.
13 | - Store transcripts and create a searchable vector database.
14 | - Perform semantic searches on video content.
15 | - Supports **Gradio web interface** and **CLI** for flexible usage.
16 | - Easily add more videos or your own files to the dataset.
17 |
18 | ---
19 |
20 | ## **Web Interface**
21 |
22 | ### **Add Videos Tab**
23 |
24 | - **Enter playlist, channel, and/or video URLs (comma-separated).**
25 | - **Upload your own video/audio files.**
26 | - **Option to process entire channels when a channel URL is provided.**
27 | - **Option to keep videos stored locally or not.**
28 |
29 |
30 |
31 | ### **Search Tab**
32 |
33 | - **Enter your search query to find relevant snippets.**
34 | - **View top relevant videos with thumbnails and play local videos if available.**
35 | - **View detailed results with timestamps and direct links.**
36 |
37 |
38 |
39 |
40 | ---
41 |
42 | ## **Installation**
43 | 
44 |
45 | Ensure you have Python installed (>= 3.8). Then, pip install:
46 | (Requires Python 3.10 for Apple Silicon Macs)
47 |
48 | ```bash
49 | pip install offlineyoutube
50 | ```
51 |
52 | ---
53 |
54 | ## **Usage**
55 |
56 | The app provides **two ways to interact**:
57 | 1. **Gradio Web Interface**
58 | 2. **Command-Line Interface (CLI)**
59 |
60 | ### **1. Running the Gradio Web Interface**
61 |
62 | Launch the web interface:
63 |
64 | ```bash
65 | offlineyoutube ui
66 | ```
67 |
68 | or simply:
69 |
70 | ```bash
71 | offlineyoutube
72 | ```
73 |
74 | Then, open the URL (usually `http://127.0.0.1:7860`) in your browser.
75 |
76 | #### **Gradio Interface Tabs:**
77 |
78 | - **Add Videos:**
79 | - Enter playlist URLs, channel URLs, and/or individual video URLs (comma-separated).
80 | - **Upload your own video or audio files for processing.**
81 | - **Option to process entire YouTube channels when a channel URL is provided.**
82 | - **Option to keep videos stored locally or not.**
83 | - The app will automatically detect whether each link is a playlist, channel, or a video.
84 | - Videos and uploaded files will be transcribed, and the database will be updated with the content.
85 |
86 | - **Search:**
87 | - Enter search queries to find relevant snippets from the video transcripts.
88 | - Results are ranked based on semantic similarity and include video thumbnails.
89 | - **If local videos are available, you can play them directly in the interface.**
90 |
91 | ---
92 |
93 | ### **2. Command-Line Interface (CLI)**
94 |
95 | The CLI provides more flexibility for programmatic use.
96 |
97 | #### **Commands Overview**
98 |
99 | Use the `--help` command to view available commands and examples:
100 |
101 | ```bash
102 | offlineyoutube --help
103 | ```
104 |
105 | **Output:**
106 |
107 | ```
108 | usage: offlineyoutube [-h] {add,search,ui} ...
109 |
110 | YouTube Video Search Application
111 |
112 | positional arguments:
113 | {add,search,ui} Available commands
114 | add Add videos to the database
115 | search Search the video database
116 | ui Run the Gradio web interface
117 |
118 | optional arguments:
119 | -h, --help Show this help message and exit
120 |
121 | Examples:
122 | # Add videos from a playlist and keep videos locally
123 | offlineyoutube add --input "https://www.youtube.com/playlist?list=YOUR_PLAYLIST_ID" --keep_videos
124 |
125 | # Add specific videos without keeping videos locally
126 | offlineyoutube add --input "https://www.youtube.com/watch?v=VIDEO_ID1,https://www.youtube.com/watch?v=VIDEO_ID2"
127 |
128 | # Add videos from a channel (process entire channel)
129 | offlineyoutube add --input "https://www.youtube.com/channel/CHANNEL_ID" --process_channel
130 |
131 | # Search the database with a query
132 | offlineyoutube search --query "Your search query" --top_k 5
133 |
134 | # Run the Gradio web interface
135 | offlineyoutube ui
136 | ```
137 |
138 | ---
139 |
140 | ### **Examples of CLI Usage**
141 |
142 | #### **1. Adding Videos**
143 |
144 | - **Add Playlists and Videos:**
145 |
146 | ```bash
147 | offlineyoutube add --input "https://www.youtube.com/playlist?list=YOUR_PLAYLIST_ID,https://www.youtube.com/watch?v=VIDEO_ID"
148 | ```
149 |
150 | - **Add Specific Videos Without Keeping Them Locally:**
151 |
152 | ```bash
153 | offlineyoutube add --input "https://www.youtube.com/watch?v=dQw4w9WgXcQ,https://www.youtube.com/watch?v=9bZkp7q19f0"
154 | ```
155 |
156 | - **Add Videos from a Channel (Process Entire Channel):**
157 |
158 | ```bash
159 | offlineyoutube add --input "https://www.youtube.com/channel/CHANNEL_ID" --process_channel
160 | ```
161 |
162 | - **Add Videos and Keep Videos Stored Locally:**
163 |
164 | ```bash
165 | offlineyoutube add --input "https://www.youtube.com/watch?v=VIDEO_ID" --keep_videos
166 | ```
167 |
168 | #### **2. Searching the Database**
169 |
170 | - **Perform a Search:**
171 |
172 | ```bash
173 | offlineyoutube search --query "machine learning tutorials" --top_k 5
174 | ```
175 |
176 | ---
177 |
178 | ### **How It Works**
179 |
180 | 1. **Adding Videos and Uploaded Files:**
181 | - The app accepts a list of links and automatically detects whether each link is a playlist, channel, or an individual video.
182 | - **You can upload your own video or audio files for processing.**
183 | - It downloads video audio (or uses uploaded files) and transcribes it using `faster-whisper`.
184 | - Thumbnails are downloaded and saved locally.
185 | - The transcript data is saved in `datasets/transcript_dataset.csv`.
186 | - A vector database is updated using FAISS with embeddings generated by `sentence-transformers`.
187 |
188 | 2. **Incremental Updating:**
189 | - Videos and uploaded files are processed one by one, and the dataset and vector database are updated incrementally.
190 | - This ensures efficient processing, especially when dealing with large datasets.
191 |
192 | 3. **Searching the Database:**
193 | - When a query is entered, the app computes its embedding and searches the FAISS index for relevant video snippets.
194 | - The top results are displayed with thumbnails, titles, and links to the videos.
195 | - **If local videos are available, you can play them directly in the interface.**
196 |
197 | ---
198 |
199 | ### **FAQ**
200 |
201 | #### **1. How do I add multiple playlists, channels, and videos at once?**
202 |
203 | Simply provide a comma-separated list of URLs, and the app will automatically detect and process each link:
204 |
205 | ```bash
206 | offlineyoutube add --input "https://www.youtube.com/playlist?list=PLAYLIST_ID1,https://www.youtube.com/watch?v=VIDEO_ID,https://www.youtube.com/channel/CHANNEL_ID"
207 | ```
208 |
209 | If you want to process entire channels, make sure to include the `--process_channel` flag:
210 |
211 | ```bash
212 | offlineyoutube add --input "https://www.youtube.com/channel/CHANNEL_ID" --process_channel
213 | ```
214 |
215 | #### **2. How can I upload my own video or audio files for processing?**
216 |
217 | In the Gradio web interface, navigate to the **Add Videos** tab. Use the **"Upload your own video/audio files"** option to upload one or multiple files. The app will process these files and add them to the database.
218 |
219 | #### **3. Why aren’t new videos or uploaded files showing up in search results?**
220 |
221 | Ensure that the videos or files have been fully processed and that the vector database has been updated. The app handles this automatically, but processing may take time for large videos, playlists, or channels.
222 |
223 | #### **4. How do I prevent videos from being stored locally?**
224 |
By default, videos are not stored locally — the `--keep_videos` flag is what enables local storage (see the `add` examples above). To avoid keeping videos, simply omit the flag:

```bash
offlineyoutube add --input "VIDEO_OR_PLAYLIST_URL"
229 | ```
230 |
231 | In the Gradio interface, uncheck the **"Keep videos stored locally"** option in the **Add Videos** tab.
232 |
233 | #### **5. Can I process entire YouTube channels?**
234 |
235 | Yes! Use the `--process_channel` flag when adding videos via the CLI:
236 |
237 | ```bash
238 | offlineyoutube add --input "https://www.youtube.com/channel/CHANNEL_ID" --process_channel
239 | ```
240 |
241 | In the Gradio interface, check the **"Process entire channel when a channel URL is provided"** option in the **Add Videos** tab.
242 |
243 | #### **6. Can I search the database without launching the Gradio interface?**
244 |
245 | Yes! Use the `search` command via the CLI:
246 |
247 | ```bash
248 | offlineyoutube search --query "Your query" --top_k 5
249 | ```
250 |
251 | ---
252 |
253 | ### **Project Structure**
254 |
255 | ```
256 | .
257 | ├── app.py # Main application script (Gradio + CLI)
258 | ├── functions.py # Helper functions for transcription, FAISS, etc.
259 | ├── datasets/
260 | │ ├── transcript_dataset.csv # CSV file storing transcripts
261 | │ └── vector_index.faiss # FAISS vector index
262 | ├── thumbnails/ # Folder for storing video thumbnails
263 | ├── videos/ # Folder for storing downloaded videos (if keep_videos is True)
264 | ├── tmp/ # Temporary folder for videos (if keep_videos is False)
265 | ├── uploaded_files/ # Folder for storing uploaded files
266 | ```
267 |
268 | ---
269 |
270 | ### **Known Limitations**
271 |
272 | - **Processing Time:** Transcribing videos and generating embeddings can be time-consuming, especially for long videos, large playlists, or channels.
- **Storage Requirements:** Keeping videos stored locally will require additional disk space. Omit the `--keep_videos` flag if storage is a concern.
274 | - **Large Datasets:** As the dataset grows, querying may take longer. Consider optimizing the FAISS index for very large datasets.
275 |
276 | ---
277 |
278 | ### **Contributing**
279 |
280 | Feel free to fork the repository, open issues, or submit pull requests if you'd like to contribute to this project.
281 |
282 | ---
283 |
284 | ### **License**
285 |
286 | This project is licensed under the MIT License. See the LICENSE file for details.
287 |
288 | ---
289 |
290 | ### **Acknowledgments**
291 |
292 | - **faster-whisper** for fast transcription.
293 | - **FAISS** for efficient vector search.
294 | - **Gradio** for the interactive web interface.
295 | - **yt-dlp** for downloading video content.
296 |
297 | ---
298 |
--------------------------------------------------------------------------------
/offlineyoutube/app.py:
--------------------------------------------------------------------------------
1 | # app.py
2 |
3 | import os
4 | import sys
5 | sys.path.append(os.path.dirname(__file__)) # Add this line here
6 | import multiprocessing
7 | import shutil
8 | import gradio as gr
9 | import argparse
10 | import pandas as pd
11 | from lib.functions import (
12 | initialize_models, setup_directories, process_videos,
13 | query_vector_database, get_video_links
14 | )
15 | from config import OFFLINE_YOUTUBE_DIR # Ensure this path is correct
16 |
17 | def add_videos_interface(input_text, uploaded_files, process_channel, keep_videos, video_quality):
18 | """
19 | Interface function for adding videos to the database.
20 | """
21 | # Initialize models within the function to avoid multi-processing issues
22 | whisper_model, embedding_model = initialize_models()
23 |
24 | video_links = get_video_links(input_text, process_channel)
25 | uploaded_files_paths = []
26 | if uploaded_files:
27 | uploaded_files_dir = os.path.join(OFFLINE_YOUTUBE_DIR, 'uploaded_files')
28 | os.makedirs(uploaded_files_dir, exist_ok=True)
29 | for uploaded_file in uploaded_files:
30 | try:
31 | original_filename = os.path.basename(uploaded_file.name)
32 | file_path = os.path.join(uploaded_files_dir, original_filename)
33 |
34 | shutil.copy(uploaded_file.name, file_path)
35 |
36 | if os.path.getsize(file_path) == 0:
37 | print(f"Uploaded file {original_filename} is empty. Skipping.")
38 | continue
39 | uploaded_files_paths.append(file_path)
40 | print(f"Saved uploaded file {original_filename} to {file_path} ({os.path.getsize(file_path)} bytes)")
41 | except Exception as e:
42 | print(f"Error saving uploaded file {original_filename}: {e}")
43 | if not video_links and not uploaded_files_paths:
44 | return "No valid video links or files provided."
45 | # Process videos and uploaded files with selected video quality
46 | data, video_titles = process_videos(
47 | video_links, uploaded_files_paths, keep_videos=keep_videos, video_quality=video_quality
48 | )
49 |
50 | # Prepare a message with the video titles
51 | if video_titles:
52 | titles_message = "\n".join(f"- {title}" for title in video_titles)
53 | return f"Videos processed and database updated.\nAdded Videos:\n{titles_message}"
54 | else:
55 | return "No new videos were added to the database."
56 |
57 | def search_interface(query_text, top_k):
58 | """
59 | Interface function for searching the database.
60 | """
61 | # Initialize only the embedding model within the function
62 | _, embedding_model = initialize_models()
63 |
64 | index_path = os.path.join(OFFLINE_YOUTUBE_DIR, 'datasets', 'vector_index.faiss')
65 | dataset_path = os.path.join(OFFLINE_YOUTUBE_DIR, 'datasets', 'transcript_dataset.csv')
66 |
67 | if not os.path.exists(index_path):
68 | return "No database found. Please add videos first.", None
69 | try:
70 | results, top_videos = query_vector_database(query_text, embedding_model, top_k=top_k)
71 | except Exception as e:
72 | return f"Error: {e}", None
73 |
74 | # Prepare top videos
75 | top_videos_html = "
Title: {row['video_title']}
102 |Relevance Score: {row['relevance']:.4f}
103 |Example Text: {row['text']}
104 | 105 | {local_video_player} 106 | 107 |Title: {row['video_title']}
137 |Text: {row['text']}
138 |Score: {row['score']:.4f}
139 | 140 | {local_video_player} 141 | 142 |