├── .gitignore
├── README.md
├── app
│   ├── __init__.py
│   ├── bd_api.py
│   ├── database.py
│   ├── main.py
│   ├── models.py
│   ├── parse_sponsor.py
│   ├── process_pool_manager.py
│   ├── schemas.py
│   └── sponsor_worker.py
├── env.sample
├── frontend
│   ├── .gitignore
│   ├── README.md
│   ├── eslint.config.js
│   ├── index.html
│   ├── package-lock.json
│   ├── package.json
│   ├── public
│   │   └── vite.svg
│   ├── src
│   │   ├── App.css
│   │   ├── App.tsx
│   │   ├── assets
│   │   │   └── react.svg
│   │   ├── components
│   │   │   └── PendingChannels.tsx
│   │   ├── index.css
│   │   ├── main.tsx
│   │   ├── pages
│   │   │   ├── VideoDetail.tsx
│   │   │   ├── VideoInput.tsx
│   │   │   └── VideoList.tsx
│   │   └── vite-env.d.ts
│   ├── tsconfig.app.json
│   ├── tsconfig.json
│   ├── tsconfig.node.json
│   └── vite.config.ts
├── requirements.txt
└── sql_app.db

/.gitignore:
--------------------------------------------------------------------------------
1 | .env
2 | env/
3 | __pycache__/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Video Scraping API
2 | 
3 | A FastAPI-based backend for video scraping and analysis. A React + TypeScript frontend lives in `frontend/` (see `frontend/README.md`).
4 | 
5 | ## Setup
6 | 
7 | 1. Create a virtual environment:
8 | ```bash
9 | python -m venv venv
10 | source venv/bin/activate  # On Windows: venv\Scripts\activate
11 | ```
12 | 
13 | 2. Install dependencies:
14 | ```bash
15 | pip install -r requirements.txt
16 | ```
17 | 
18 | 3. Copy `env.sample` to `.env` and set `BD_API_KEY` (Bright Data) and `OPENAI_API_KEY`.
19 | 
20 | 4. Run the application:
21 | ```bash
22 | uvicorn app.main:app --reload
23 | ```
24 | 
25 | The API will be available at `http://localhost:8000`.
26 | 
27 | ## API Documentation
28 | 
29 | After starting the server, visit `http://localhost:8000/docs` for the interactive API documentation.
30 | 
31 | ### Endpoints
32 | 
33 | - `POST /scrape-videos/`: Submit video or channel URLs for scraping
34 | - `GET /videos/`: Get a paginated list of videos, filterable by sponsorship and creator
35 | - `GET /videos/{video_id}`: Get detailed information about a specific video
36 | - `GET /pending-channels/`: List channels whose scraping is still pending
37 | 
38 | ## Models
39 | 
40 | - Video: Base video information
41 | - Metadata: Video transcript and metadata
42 | - VideoSummary: Generated summary of the video
43 | - VideoSponsor: Sponsorship information and brand mentions
44 | - Channel: A channel URL submitted to discover and scrape its videos
45 | 
--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/app/bd_api.py:
--------------------------------------------------------------------------------
1 | import httpx
2 | from typing import Dict, List
3 | from sqlalchemy.orm import Session
4 | import asyncio
5 | from .
import models 6 | import os 7 | from dotenv import load_dotenv 8 | from .process_pool_manager import get_pool_manager 9 | 10 | load_dotenv() 11 | 12 | API_ENDPOINT = "https://api.brightdata.com/datasets/v3" 13 | VIDEO_DATASET_ID = "gd_lk56epmy2i5g7lzu0k" 14 | SNAPSHOT_POLL_INTERVAL = 5 # seconds 15 | MAX_RETRIES = 200 # 1 minute total polling time 16 | API_KEY = os.getenv("BD_API_KEY") 17 | 18 | async def check_progress(client: httpx.AsyncClient, headers: Dict[str, str], snapshot: str) -> bool: 19 | """Check if the scraping process is complete.""" 20 | response = await client.get( 21 | f"{API_ENDPOINT}/progress/{snapshot}", 22 | headers=headers 23 | ) 24 | response.raise_for_status() 25 | progress_data = response.json() 26 | print(f"Progress: {progress_data}, Snapshot: {snapshot}") 27 | 28 | # Check if all items are processed 29 | return progress_data.get("status") == "ready" 30 | 31 | async def get_snapshot_data(client: httpx.AsyncClient, headers: Dict[str, str], snapshot: str) -> Dict: 32 | """Poll for snapshot data until it's ready or max retries reached.""" 33 | for _ in range(MAX_RETRIES): 34 | # First check progress 35 | is_complete = await check_progress(client, headers, snapshot) 36 | if not is_complete: 37 | await asyncio.sleep(SNAPSHOT_POLL_INTERVAL) 38 | continue 39 | 40 | # If complete, get snapshot data 41 | response = await client.get( 42 | f"{API_ENDPOINT}/snapshot/{snapshot}", 43 | headers=headers, 44 | params={"format": "json"} 45 | ) 46 | response.raise_for_status() 47 | data = response.json() 48 | 49 | if isinstance(data, list) and len(data) > 0: 50 | return data 51 | 52 | # Wait before next poll if data not ready 53 | await asyncio.sleep(SNAPSHOT_POLL_INTERVAL) 54 | 55 | raise TimeoutError("Snapshot data not ready after maximum retries") 56 | 57 | async def process_video_data(video_data: Dict, video: models.Video, db: Session) -> None: 58 | try: 59 | # Update video metadata 60 | metadata = models.Metadata( 61 | video_id=video.id, 62 | metadata_json=video_data, 63 | creator=video_data.get("youtuber"), 64 | ) 65 | print(video_data.get("youtuber")) 66 | db.add(metadata) 67 | video.status = "completed" 68 | db.commit() 69 | 70 | # Start sponsor processing using the process pool 71 | if video_data.get("transcript") or video_data.get("description"): 72 | # Get database URL from current session 73 | db_url = db.get_bind().url.render_as_string(hide_password=False) 74 | 75 | # Add task to the process pool 76 | pool_manager = get_pool_manager() 77 | pool_manager.add_task( 78 | video.id, 79 | video_data.get("transcript", ""), 80 | video_data.get("description", ""), 81 | db_url 82 | ) 83 | 84 | except Exception as e: 85 | video.status = "failed" 86 | db.commit() 87 | print(f"Error processing video data: {str(e)}") 88 | 89 | async def scrape_videos(urls: List[str], db: Session, type: str = "video") -> List[models.Video]: 90 | if not API_KEY: 91 | raise ValueError("BD_API_KEY environment variable not set") 92 | 93 | # Prepare the request payload 94 | if type == "video": 95 | payload = [{"url": url} for url in urls] 96 | else: 97 | payload = [{"url": url, "num_of_posts": 50} for url in urls] 98 | 99 | headers = { 100 | "Authorization": f"Bearer {API_KEY}", 101 | "Content-Type": "application/json" 102 | } 103 | 104 | try: 105 | async with httpx.AsyncClient() as client: 106 | # Step 1: Trigger the scraping 107 | if type == "video": 108 | response = await client.post( 109 | f"{API_ENDPOINT}/trigger", 110 | params={"dataset_id": VIDEO_DATASET_ID, "include_errors": "true"}, 111 | 
headers=headers, 112 | json=payload 113 | ) 114 | else: 115 | response = await client.post( 116 | f"{API_ENDPOINT}/trigger", 117 | params={"dataset_id": VIDEO_DATASET_ID, "include_errors": "true", "type": "discover_new", "discover_by": "url"}, 118 | headers=headers, 119 | json=payload 120 | ) 121 | response.raise_for_status() 122 | snapshot = response.json().get("snapshot_id") 123 | 124 | # Step 2: Poll for progress and get snapshot data 125 | video_data_list = await get_snapshot_data(client, headers, snapshot) 126 | 127 | if type == "channel": 128 | for url in urls: 129 | channel = db.query(models.Channel).filter(models.Channel.url == url).first() 130 | if channel: 131 | channel.status = "completed" 132 | db.commit() 133 | 134 | # Process videos one by one to avoid transaction conflicts 135 | videos = [] 136 | for video_data in video_data_list: 137 | # Check if video already exists 138 | video = db.query(models.Video).filter(models.Video.url == video_data["url"]).first() 139 | if not video: 140 | # Only create new video if it doesn't exist 141 | video = models.Video(url=video_data["url"]) 142 | db.add(video) 143 | db.flush() # Flush to get the video ID 144 | 145 | # Always update the video data 146 | video.status = "pending" 147 | await process_video_data(video_data, video, db) 148 | videos.append(video) 149 | 150 | return videos 151 | 152 | except Exception as e: 153 | print(f"HTTP error occurred: {e}") 154 | if type == "channel": 155 | for url in urls: 156 | channel = db.query(models.Channel).filter(models.Channel.url == url).first() 157 | if channel: 158 | channel.status = "failed" 159 | db.commit() 160 | raise 161 | -------------------------------------------------------------------------------- /app/database.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine 2 | from sqlalchemy.ext.declarative import declarative_base 3 | from sqlalchemy.orm import sessionmaker 4 | 5 | SQLALCHEMY_DATABASE_URL = "sqlite:///./sql_app.db" 6 | 7 | engine = create_engine( 8 | SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False} 9 | ) 10 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) 11 | 12 | Base = declarative_base() 13 | 14 | def get_db(): 15 | db = SessionLocal() 16 | try: 17 | yield db 18 | finally: 19 | db.close() 20 | -------------------------------------------------------------------------------- /app/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, Depends, HTTPException, BackgroundTasks 2 | from sqlalchemy.orm import Session, joinedload 3 | from typing import List, Union 4 | from . 
import models, schemas, bd_api 5 | from .database import engine, get_db 6 | import json 7 | from sqlalchemy import String, text 8 | import logging 9 | 10 | # Set up logging 11 | logging.basicConfig(level=logging.INFO) 12 | logger = logging.getLogger(__name__) 13 | 14 | models.Base.metadata.create_all(bind=engine) 15 | 16 | app = FastAPI() 17 | 18 | def model_to_dict(obj): 19 | if obj is None: 20 | return None 21 | 22 | if not hasattr(obj, '__table__'): 23 | return obj 24 | 25 | result = {} 26 | for column in obj.__table__.columns: 27 | value = getattr(obj, column.name) 28 | result[column.name] = value 29 | 30 | # Handle relationships 31 | if hasattr(obj, 'video_metadata') and obj.video_metadata: 32 | result['video_metadata'] = model_to_dict(obj.video_metadata) 33 | if hasattr(obj, 'summary') and obj.summary: 34 | result['summary'] = model_to_dict(obj.summary) 35 | if hasattr(obj, 'sponsor') and obj.sponsor: 36 | sponsor_dict = model_to_dict(obj.sponsor) 37 | if sponsor_dict and 'brands_mentioned' in sponsor_dict: 38 | try: 39 | if isinstance(sponsor_dict['brands_mentioned'], str): 40 | sponsor_dict['brands_mentioned'] = json.loads(sponsor_dict['brands_mentioned']) 41 | except: 42 | sponsor_dict['brands_mentioned'] = [] 43 | result['sponsor'] = sponsor_dict 44 | 45 | return result 46 | 47 | async def process_videos_background(urls: List[str], db: Session, type: str = "video"): 48 | try: 49 | await bd_api.scrape_videos(urls, db, type) 50 | except Exception as e: 51 | # Get videos and update their status to Failed 52 | for url in urls: 53 | if type == "channel": 54 | channel = db.query(models.Channel).filter(models.Channel.url == url).first() 55 | if channel: 56 | channel.status = "failed" 57 | else: 58 | video = db.query(models.Video).filter(models.Video.url == url).first() 59 | if video: 60 | video.status = "failed" 61 | db.commit() 62 | print(f"Error processing videos: {str(e)}") 63 | 64 | async def handle_urls(urls: List[str], db: Session, background_tasks: BackgroundTasks, scrape_type: str = "video"): 65 | results = [] 66 | 67 | for url in urls: 68 | if scrape_type == "channel": 69 | # Handle channel URL 70 | existing_channel = db.query(models.Channel).filter(models.Channel.url == url).first() 71 | if existing_channel: 72 | # Update status if it was previously failed or completed 73 | if existing_channel.status in ["failed", "completed"]: 74 | existing_channel.status = "pending" 75 | db.commit() 76 | results.append(existing_channel) 77 | else: 78 | channel = models.Channel(url=url) 79 | db.add(channel) 80 | try: 81 | db.commit() 82 | results.append(channel) 83 | except: 84 | db.rollback() 85 | raise HTTPException(status_code=500, detail="Error adding channel to database") 86 | else: 87 | # Handle video URL 88 | existing_video = db.query(models.Video).filter(models.Video.url == url).first() 89 | if existing_video: 90 | # Update status if it was previously failed or completed 91 | if existing_video.status in ["failed", "completed"]: 92 | existing_video.status = "pending" 93 | db.commit() 94 | results.append(existing_video) 95 | else: 96 | video = models.Video(url=url) 97 | db.add(video) 98 | try: 99 | db.commit() 100 | results.append(video) 101 | except: 102 | db.rollback() 103 | raise HTTPException(status_code=500, detail="Error adding video to database") 104 | 105 | # Process in background 106 | if results: 107 | background_tasks.add_task(process_videos_background, urls, db, scrape_type) 108 | 109 | # Return appropriate schema based on type 110 | if scrape_type == "channel": 111 | return 
[schemas.Channel(**model_to_dict(result)) for result in results] 112 | else: 113 | return [schemas.Video(**model_to_dict(result)) for result in results] 114 | 115 | @app.post("/scrape-videos/", response_model=Union[List[schemas.Video], List[schemas.Channel]]) 116 | async def scrape_videos( 117 | request: schemas.ScrapeVideosRequest, 118 | background_tasks: BackgroundTasks, 119 | db: Session = Depends(get_db) 120 | ): 121 | return await handle_urls(request.urls, db, background_tasks, request.scrape_type) 122 | 123 | @app.get("/videos/", response_model=schemas.VideoList) 124 | async def get_videos( 125 | page: int = 1, 126 | size: int = 10, 127 | sponsor_filter: str = None, 128 | sponsor_name: str = None, 129 | creator: str = None, 130 | sort_by: str = None, 131 | sort_order: str = "asc", 132 | db: Session = Depends(get_db) 133 | ): 134 | # Calculate offset 135 | offset = (page - 1) * size 136 | 137 | # Create base subquery for IDs only 138 | subquery = db.query(models.Video.id).select_from(models.Video) 139 | metadata_joined = False 140 | sponsor_joined = False 141 | 142 | # Apply filters to subquery 143 | if sponsor_filter or sponsor_name: 144 | subquery = subquery.join(models.VideoSponsor) 145 | sponsor_joined = True 146 | if sponsor_filter == "sponsored": 147 | subquery = subquery.filter(models.VideoSponsor.is_sponsored == True) 148 | elif sponsor_filter == "not_sponsored": 149 | subquery = subquery.filter(models.VideoSponsor.is_sponsored == False) 150 | 151 | if sponsor_name: 152 | if not sponsor_joined: 153 | subquery = subquery.join(models.VideoSponsor) 154 | sponsor_joined = True 155 | subquery = subquery.filter( 156 | models.VideoSponsor.brands_mentioned.cast(String).ilike(f'%{sponsor_name}%') 157 | ) 158 | 159 | if creator: 160 | subquery = subquery.join(models.Metadata) 161 | metadata_joined = True 162 | subquery = subquery.filter(models.Metadata.creator.ilike(f'%{creator}%')) 163 | 164 | # Apply sorting to subquery 165 | if sort_by == "creator": 166 | if not metadata_joined: 167 | subquery = subquery.join(models.Metadata) 168 | metadata_joined = True 169 | if sort_order == "desc": 170 | subquery = subquery.order_by(models.Metadata.creator.desc()) 171 | else: 172 | subquery = subquery.order_by(models.Metadata.creator.asc()) 173 | else: 174 | if sort_order == "desc": 175 | subquery = subquery.order_by(models.Video.created_at.desc()) 176 | else: 177 | subquery = subquery.order_by(models.Video.created_at.asc()) 178 | 179 | # Apply pagination to subquery 180 | subquery = subquery.offset(offset).limit(size) 181 | 182 | # Get total count 183 | total_query = db.query(models.Video.id).select_from(models.Video) 184 | metadata_joined = False 185 | sponsor_joined = False 186 | 187 | if sponsor_filter or sponsor_name: 188 | total_query = total_query.join(models.VideoSponsor) 189 | sponsor_joined = True 190 | if sponsor_filter == "sponsored": 191 | total_query = total_query.filter(models.VideoSponsor.is_sponsored == True) 192 | elif sponsor_filter == "not_sponsored": 193 | total_query = total_query.filter(models.VideoSponsor.is_sponsored == False) 194 | if sponsor_name: 195 | total_query = total_query.filter( 196 | models.VideoSponsor.brands_mentioned.cast(String).ilike(f'%{sponsor_name}%') 197 | ) 198 | 199 | if creator: 200 | if not metadata_joined: 201 | total_query = total_query.join(models.Metadata) 202 | metadata_joined = True 203 | total_query = total_query.filter(models.Metadata.creator.ilike(f'%{creator}%')) 204 | 205 | total = total_query.count() 206 | 207 | # Get the actual videos 
with their relationships 208 | videos = db.query(models.Video).options( 209 | joinedload(models.Video.sponsor), 210 | joinedload(models.Video.video_metadata) 211 | ).filter( 212 | models.Video.id.in_(subquery) 213 | ).order_by( 214 | models.Video.created_at.asc() if sort_order == "asc" else models.Video.created_at.desc() 215 | ).all() 216 | 217 | # Convert to schema format 218 | video_list = [] 219 | for video in videos: 220 | try: 221 | video_dict = model_to_dict(video) 222 | video_schema = schemas.Video(**video_dict) 223 | video_list.append(video_schema) 224 | except Exception as e: 225 | logger.error(f"Error converting video ID {video.id} to schema: {str(e)}") 226 | continue 227 | 228 | return schemas.VideoList( 229 | items=video_list, 230 | total=total, 231 | page=page, 232 | size=size 233 | ) 234 | 235 | @app.get("/videos/{video_id}", response_model=schemas.Video) 236 | async def get_video(video_id: int, db: Session = Depends(get_db)): 237 | video = db.query(models.Video).options( 238 | joinedload(models.Video.video_metadata), 239 | joinedload(models.Video.summary), 240 | joinedload(models.Video.sponsor) 241 | ).filter(models.Video.id == video_id).first() 242 | 243 | if video is None: 244 | raise HTTPException(status_code=404, detail="Video not found") 245 | 246 | return schemas.Video(**model_to_dict(video)) 247 | 248 | @app.get("/pending-channels/", response_model=List[schemas.Channel]) 249 | async def get_pending_channels(db: Session = Depends(get_db)): 250 | channels = db.query(models.Channel).filter( 251 | models.Channel.status == "pending" 252 | ).order_by(models.Channel.created_at.desc()).all() 253 | 254 | return [schemas.Channel(**model_to_dict(channel)) for channel in channels] 255 | -------------------------------------------------------------------------------- /app/models.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, JSON, Boolean 2 | from sqlalchemy.orm import relationship 3 | from .database import Base 4 | from datetime import datetime 5 | 6 | class Video(Base): 7 | __tablename__ = "videos" 8 | 9 | id = Column(Integer, primary_key=True, index=True) 10 | url = Column(String, index=True) 11 | created_at = Column(DateTime, default=datetime.now) 12 | status = Column(String, default="pending") 13 | 14 | # Relationships for additional processing 15 | video_metadata = relationship("Metadata", back_populates="video", uselist=False) 16 | summary = relationship("VideoSummary", back_populates="video", uselist=False) 17 | sponsor = relationship("VideoSponsor", back_populates="video", uselist=False) 18 | 19 | class Metadata(Base): 20 | __tablename__ = "metadata" 21 | 22 | id = Column(Integer, primary_key=True, index=True) 23 | video_id = Column(Integer, ForeignKey("videos.id")) 24 | created_at = Column(DateTime, default=datetime.now) 25 | status = Column(String, default="pending") 26 | creator = Column(String, index=True, nullable=True) 27 | metadata_json = Column(JSON, nullable=True) # Store all Bright Data response as JSON 28 | 29 | video = relationship("Video", back_populates="video_metadata") 30 | 31 | class VideoSummary(Base): 32 | __tablename__ = "video_summaries" 33 | 34 | id = Column(Integer, primary_key=True, index=True) 35 | video_id = Column(Integer, ForeignKey("videos.id")) 36 | summary = Column(String) 37 | created_at = Column(DateTime, default=datetime.now) 38 | status = Column(String, default="pending") 39 | 40 | video = relationship("Video", 
back_populates="summary") 41 | 42 | class VideoSponsor(Base): 43 | __tablename__ = "video_sponsors" 44 | 45 | id = Column(Integer, primary_key=True, index=True) 46 | video_id = Column(Integer, ForeignKey("videos.id")) 47 | is_sponsored = Column(Boolean, default=False) 48 | brands_mentioned = Column(JSON) 49 | created_at = Column(DateTime, default=datetime.now) 50 | status = Column(String, default="pending") 51 | 52 | video = relationship("Video", back_populates="sponsor") 53 | 54 | class Channel(Base): 55 | __tablename__ = "channels" 56 | 57 | id = Column(Integer, primary_key=True, index=True) 58 | url = Column(String, index=True) 59 | created_at = Column(DateTime, default=datetime.now) 60 | status = Column(String, default="pending") 61 | channel_metadata = Column(JSON, nullable=True) # Store channel metadata when scraping is complete 62 | -------------------------------------------------------------------------------- /app/parse_sponsor.py: -------------------------------------------------------------------------------- 1 | import os 2 | from openai import OpenAI 3 | from dotenv import load_dotenv 4 | 5 | load_dotenv() 6 | 7 | # Initialize OpenAI client 8 | client = OpenAI(api_key="") 9 | 10 | # Common sponsor-related keywords and phrases 11 | sponsor_keywords = [ 12 | "sponsor", "sponsored", "partnership", "partner", "brought to you by", 13 | "thanks to", "promotion", "promotional", "affiliate", "discount code", 14 | "promo code", "special offer", "check out", "sponsored by" 15 | ] 16 | 17 | def extract_context(text: str, keyword: str, window_size: int = 100) -> str: 18 | """ 19 | Extract context around a keyword with specified window size before and after. 20 | """ 21 | text = text.lower() 22 | keyword_pos = text.find(keyword.lower()) 23 | 24 | if keyword_pos == -1: 25 | return "" 26 | 27 | # Find the start and end positions for the context window 28 | start = max(0, keyword_pos - window_size) 29 | end = min(len(text), keyword_pos + len(keyword) + window_size) 30 | 31 | # Extract the context and clean it 32 | context = text[start:end].strip() 33 | return ' '.join(context.split()) # Normalize whitespace 34 | 35 | def find_sponsor(text: str, keywords: list[str]) -> list[tuple[str, str]]: 36 | """ 37 | Find potential sponsor mentions in text using keyword matching. 38 | Returns list of tuples containing (context, keyword). 39 | """ 40 | if not text: 41 | return [] 42 | 43 | found_contexts = [] 44 | 45 | for keyword in keywords: 46 | # Get all occurrences of the keyword 47 | text_lower = text.lower() 48 | start = 0 49 | while True: 50 | pos = text_lower.find(keyword, start) 51 | if pos == -1: 52 | break 53 | 54 | # Extract context around this occurrence 55 | context = extract_context(text[max(0, pos - 100):min(len(text), pos + 100 + len(keyword))], keyword) 56 | if context: 57 | found_contexts.append((context, keyword)) 58 | 59 | start = pos + len(keyword) 60 | 61 | return found_contexts 62 | 63 | def extract_sponsor_name(context: str) -> str: 64 | """ 65 | Use GPT to extract the sponsor name from the context. 66 | """ 67 | prompt = f"""Given this text, extract ONLY the company name that is sponsoring or advertising. 68 | If there is no clear sponsor, respond with 'None'. If there is a sponsor but you can't determine the exact name, respond with 'Unknown'. 69 | Respond with just the company name, no other text. 
70 | 71 | Text: {context}""" 72 | 73 | response = client.chat.completions.create( 74 | model="gpt-4o-mini-2024-07-18", 75 | messages=[ 76 | {"role": "system", "content": "You are a sponsor detection system. Extract only the company name, nothing else."}, 77 | {"role": "user", "content": prompt} 78 | ], 79 | temperature=0.1 80 | ) 81 | 82 | sponsor = response.choices[0].message.content.strip() 83 | 84 | # Clean up common formatting issues 85 | if sponsor.lower() in ['none', 'no sponsor', 'no clear sponsor']: 86 | return None 87 | if sponsor.lower() in ['unknown', "can't determine", 'unclear']: 88 | return None 89 | 90 | return sponsor 91 | 92 | def parse_sponsors(description: str, transcript: str) -> dict: 93 | """ 94 | Parse description and transcript to find potential sponsors. 95 | Returns a dictionary with is_sponsored flag and list of sponsor brands with context. 96 | """ 97 | all_text = f"{description}\n{transcript}" 98 | sponsor_contexts = find_sponsor(all_text, sponsor_keywords) 99 | 100 | if not sponsor_contexts: 101 | return { 102 | "is_sponsored": False, 103 | "brands": [] 104 | } 105 | 106 | # Extract sponsor names for contexts 107 | brands = [] 108 | seen_brands = set() 109 | 110 | for context, _ in sponsor_contexts: 111 | sponsor_name = extract_sponsor_name(context) 112 | if sponsor_name and sponsor_name.lower() not in seen_brands: 113 | seen_brands.add(sponsor_name.lower()) 114 | brands.append({ 115 | "name": sponsor_name, 116 | "context": context 117 | }) 118 | 119 | return { 120 | "is_sponsored": bool(brands), 121 | "brands": brands 122 | } 123 | -------------------------------------------------------------------------------- /app/process_pool_manager.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool, Manager 2 | from .sponsor_worker import process_sponsor_task 3 | 4 | # Default to using half of available CPU cores, with a minimum of 2 5 | import multiprocessing 6 | DEFAULT_POOL_SIZE = max(2, multiprocessing.cpu_count() // 2) 7 | 8 | class ProcessPoolManager: 9 | def __init__(self, pool_size: int = DEFAULT_POOL_SIZE): 10 | self.pool_size = pool_size 11 | self.manager = Manager() 12 | self.task_queue = self.manager.Queue() 13 | self.active_tasks = self.manager.Value('i', 0) 14 | self.pool = Pool(processes=pool_size) 15 | 16 | def add_task(self, video_id: int, transcript: str, description: str, db_url: str): 17 | """Add a new task to the queue""" 18 | task_data = { 19 | 'video_id': video_id, 20 | 'transcript': transcript, 21 | 'description': description, 22 | 'db_url': db_url 23 | } 24 | self.task_queue.put(task_data) 25 | self._process_queue() 26 | 27 | def _process_queue(self): 28 | """Process tasks from the queue if there's capacity in the pool""" 29 | while self.active_tasks.value < self.pool_size and not self.task_queue.empty(): 30 | try: 31 | task_data = self.task_queue.get_nowait() 32 | self.active_tasks.value += 1 33 | 34 | # Start the task in the pool 35 | self.pool.apply_async( 36 | process_sponsor_task, 37 | args=( 38 | task_data['video_id'], 39 | task_data['transcript'], 40 | task_data['description'], 41 | task_data['db_url'] 42 | ), 43 | callback=self._task_complete, 44 | error_callback=self._task_error 45 | ) 46 | except Exception as e: 47 | print(f"Error starting task: {str(e)}") 48 | self.active_tasks.value -= 1 49 | 50 | def _task_complete(self, result): 51 | """Callback when a task completes successfully""" 52 | self.active_tasks.value -= 1 53 | self._process_queue() 54 | 55 | def 
_task_error(self, error): 56 | """Callback when a task fails""" 57 | print(f"Task failed with error: {str(error)}") 58 | self.active_tasks.value -= 1 59 | self._process_queue() 60 | 61 | def shutdown(self): 62 | """Shutdown the pool and wait for all tasks to complete""" 63 | self.pool.close() 64 | self.pool.join() 65 | 66 | # Global instance of the pool manager 67 | _pool_manager = None 68 | 69 | def get_pool_manager(pool_size: int = DEFAULT_POOL_SIZE) -> ProcessPoolManager: 70 | """Get or create the global pool manager instance""" 71 | global _pool_manager 72 | if _pool_manager is None: 73 | _pool_manager = ProcessPoolManager(pool_size) 74 | return _pool_manager 75 | -------------------------------------------------------------------------------- /app/schemas.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, ConfigDict 2 | from typing import List, Optional, Dict, Any, Union 3 | from datetime import datetime 4 | 5 | class VideoBase(BaseModel): 6 | url: str 7 | 8 | class VideoCreate(VideoBase): 9 | pass 10 | 11 | class MetadataBase(BaseModel): 12 | metadata_json: Optional[Dict[str, Any]] = None 13 | status: Optional[str] = None 14 | creator: Optional[str] = None 15 | 16 | class MetadataCreate(MetadataBase): 17 | video_id: int 18 | 19 | class Metadata(MetadataBase): 20 | id: int 21 | video_id: int 22 | created_at: datetime 23 | 24 | model_config = ConfigDict(from_attributes=True) 25 | 26 | class VideoSummaryBase(BaseModel): 27 | summary: Optional[str] = None 28 | status: Optional[str] = None 29 | 30 | class VideoSummaryCreate(VideoSummaryBase): 31 | video_id: int 32 | 33 | class VideoSummary(VideoSummaryBase): 34 | id: int 35 | video_id: int 36 | created_at: datetime 37 | 38 | model_config = ConfigDict(from_attributes=True) 39 | 40 | class VideoSponsorBase(BaseModel): 41 | is_sponsored: Optional[bool] = None 42 | brands_mentioned: Optional[List[Dict[str, str]]] = None 43 | status: Optional[str] = None 44 | 45 | class VideoSponsorCreate(VideoSponsorBase): 46 | video_id: int 47 | 48 | class VideoSponsor(VideoSponsorBase): 49 | id: int 50 | video_id: int 51 | created_at: datetime 52 | 53 | model_config = ConfigDict(from_attributes=True) 54 | 55 | class Video(VideoBase): 56 | id: int 57 | status: Optional[str] = None 58 | created_at: datetime 59 | video_metadata: Optional[Metadata] = None 60 | summary: Optional[VideoSummary] = None 61 | sponsor: Optional[VideoSponsor] = None 62 | 63 | @property 64 | def creator(self) -> Optional[str]: 65 | return self.video_metadata.creator if self.video_metadata else None 66 | 67 | model_config = ConfigDict(from_attributes=True) 68 | 69 | class VideoList(BaseModel): 70 | items: List[Video] 71 | total: int 72 | page: int 73 | size: int 74 | 75 | class Channel(BaseModel): 76 | id: int 77 | url: str 78 | status: Optional[str] = None 79 | created_at: datetime 80 | channel_metadata: Optional[Dict[str, Any]] = None 81 | 82 | model_config = ConfigDict(from_attributes=True) 83 | 84 | class ChannelList(BaseModel): 85 | items: List[Channel] 86 | total: int 87 | page: int 88 | size: int 89 | 90 | class ScrapeVideosRequest(BaseModel): 91 | urls: List[str] 92 | scrape_type: str = "video" 93 | 94 | class ScrapeChannelRequest(BaseModel): 95 | url: str -------------------------------------------------------------------------------- /app/sponsor_worker.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine 2 | from sqlalchemy.orm import 
sessionmaker 3 | from .parse_sponsor import parse_sponsors 4 | from .models import Video, VideoSponsor 5 | from .database import Base 6 | 7 | def process_sponsor_task(video_id: int, transcript: str, description: str, db_url: str): 8 | try: 9 | # Create a new database session for this process 10 | engine = create_engine(db_url) 11 | Base.metadata.create_all(bind=engine) 12 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) 13 | db = SessionLocal() 14 | 15 | try: 16 | # Get the video 17 | video = db.query(Video).filter(Video.id == video_id).first() 18 | if not video: 19 | print(f"Video {video_id} not found") 20 | return 21 | 22 | # Get or create VideoSponsor 23 | sponsor = db.query(VideoSponsor).filter(VideoSponsor.video_id == video_id).first() 24 | if not sponsor: 25 | sponsor = VideoSponsor(video_id=video_id) 26 | db.add(sponsor) 27 | 28 | # Update status to in_progress 29 | sponsor.status = "in_progress" 30 | db.commit() 31 | 32 | # Process sponsors 33 | try: 34 | sponsor_data = parse_sponsors(description, transcript) 35 | if isinstance(sponsor_data, dict): 36 | sponsor.is_sponsored = sponsor_data.get("is_sponsored", False) 37 | sponsor.brands_mentioned = sponsor_data.get("brands", []) 38 | else: 39 | # Handle the case where parse_sponsors returns a different structure 40 | sponsor.is_sponsored = bool(sponsor_data) 41 | sponsor.brands_mentioned = sponsor_data if isinstance(sponsor_data, list) else [] 42 | sponsor.status = "completed" 43 | except Exception as e: 44 | print(f"Error processing sponsors for video {video_id}: {str(e)}") 45 | sponsor.status = "failed" 46 | 47 | db.commit() 48 | 49 | finally: 50 | db.close() 51 | 52 | except Exception as e: 53 | print(f"Worker process error for video {video_id}: {str(e)}") 54 | -------------------------------------------------------------------------------- /env.sample: -------------------------------------------------------------------------------- 1 | BD_API_KEY= 2 | OPENAI_API_KEY= -------------------------------------------------------------------------------- /frontend/.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | pnpm-debug.log* 8 | lerna-debug.log* 9 | 10 | node_modules 11 | dist 12 | dist-ssr 13 | *.local 14 | 15 | # Editor directories and files 16 | .vscode/* 17 | !.vscode/extensions.json 18 | .idea 19 | .DS_Store 20 | *.suo 21 | *.ntvs* 22 | *.njsproj 23 | *.sln 24 | *.sw? 25 | -------------------------------------------------------------------------------- /frontend/README.md: -------------------------------------------------------------------------------- 1 | # React + TypeScript + Vite 2 | 3 | This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules. 
4 | 5 | Currently, two official plugins are available: 6 | 7 | - [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react/README.md) uses [Babel](https://babeljs.io/) for Fast Refresh 8 | - [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh 9 | 10 | ## Expanding the ESLint configuration 11 | 12 | If you are developing a production application, we recommend updating the configuration to enable type aware lint rules: 13 | 14 | - Configure the top-level `parserOptions` property like this: 15 | 16 | ```js 17 | export default tseslint.config({ 18 | languageOptions: { 19 | // other options... 20 | parserOptions: { 21 | project: ['./tsconfig.node.json', './tsconfig.app.json'], 22 | tsconfigRootDir: import.meta.dirname, 23 | }, 24 | }, 25 | }) 26 | ``` 27 | 28 | - Replace `tseslint.configs.recommended` to `tseslint.configs.recommendedTypeChecked` or `tseslint.configs.strictTypeChecked` 29 | - Optionally add `...tseslint.configs.stylisticTypeChecked` 30 | - Install [eslint-plugin-react](https://github.com/jsx-eslint/eslint-plugin-react) and update the config: 31 | 32 | ```js 33 | // eslint.config.js 34 | import react from 'eslint-plugin-react' 35 | 36 | export default tseslint.config({ 37 | // Set the react version 38 | settings: { react: { version: '18.3' } }, 39 | plugins: { 40 | // Add the react plugin 41 | react, 42 | }, 43 | rules: { 44 | // other rules... 45 | // Enable its recommended rules 46 | ...react.configs.recommended.rules, 47 | ...react.configs['jsx-runtime'].rules, 48 | }, 49 | }) 50 | ``` 51 | -------------------------------------------------------------------------------- /frontend/eslint.config.js: -------------------------------------------------------------------------------- 1 | import js from '@eslint/js' 2 | import globals from 'globals' 3 | import reactHooks from 'eslint-plugin-react-hooks' 4 | import reactRefresh from 'eslint-plugin-react-refresh' 5 | import tseslint from 'typescript-eslint' 6 | 7 | export default tseslint.config( 8 | { ignores: ['dist'] }, 9 | { 10 | extends: [js.configs.recommended, ...tseslint.configs.recommended], 11 | files: ['**/*.{ts,tsx}'], 12 | languageOptions: { 13 | ecmaVersion: 2020, 14 | globals: globals.browser, 15 | }, 16 | plugins: { 17 | 'react-hooks': reactHooks, 18 | 'react-refresh': reactRefresh, 19 | }, 20 | rules: { 21 | ...reactHooks.configs.recommended.rules, 22 | 'react-refresh/only-export-components': [ 23 | 'warn', 24 | { allowConstantExport: true }, 25 | ], 26 | }, 27 | }, 28 | ) 29 | -------------------------------------------------------------------------------- /frontend/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Vite + React + TS 8 | 9 | 10 |
11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "frontend", 3 | "private": true, 4 | "version": "0.0.0", 5 | "type": "module", 6 | "scripts": { 7 | "dev": "vite", 8 | "build": "tsc -b && vite build", 9 | "lint": "eslint .", 10 | "preview": "vite preview" 11 | }, 12 | "dependencies": { 13 | "@emotion/react": "^11.14.0", 14 | "@emotion/styled": "^11.14.0", 15 | "@mui/icons-material": "^6.3.1", 16 | "@mui/material": "^6.3.1", 17 | "react": "^18.3.1", 18 | "react-dom": "^18.3.1", 19 | "react-router-dom": "^7.1.1" 20 | }, 21 | "devDependencies": { 22 | "@eslint/js": "^9.17.0", 23 | "@types/react": "^18.3.18", 24 | "@types/react-dom": "^18.3.5", 25 | "@vitejs/plugin-react": "^4.3.4", 26 | "eslint": "^9.17.0", 27 | "eslint-plugin-react-hooks": "^5.0.0", 28 | "eslint-plugin-react-refresh": "^0.4.16", 29 | "globals": "^15.14.0", 30 | "typescript": "~5.6.2", 31 | "typescript-eslint": "^8.18.2", 32 | "vite": "^6.0.5" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /frontend/public/vite.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontend/src/App.css: -------------------------------------------------------------------------------- 1 | #root { 2 | max-width: 1280px; 3 | margin: 0 auto; 4 | padding: 2rem; 5 | text-align: center; 6 | } 7 | 8 | .logo { 9 | height: 6em; 10 | padding: 1.5em; 11 | will-change: filter; 12 | transition: filter 300ms; 13 | } 14 | .logo:hover { 15 | filter: drop-shadow(0 0 2em #646cffaa); 16 | } 17 | .logo.react:hover { 18 | filter: drop-shadow(0 0 2em #61dafbaa); 19 | } 20 | 21 | @keyframes logo-spin { 22 | from { 23 | transform: rotate(0deg); 24 | } 25 | to { 26 | transform: rotate(360deg); 27 | } 28 | } 29 | 30 | @media (prefers-reduced-motion: no-preference) { 31 | a:nth-of-type(2) .logo { 32 | animation: logo-spin infinite 20s linear; 33 | } 34 | } 35 | 36 | .card { 37 | padding: 2em; 38 | } 39 | 40 | .read-the-docs { 41 | color: #888; 42 | } 43 | -------------------------------------------------------------------------------- /frontend/src/App.tsx: -------------------------------------------------------------------------------- 1 | import { BrowserRouter as Router, Routes, Route, Link, useNavigate } from 'react-router-dom'; 2 | import { Container, AppBar, Toolbar, Typography, Box, Button, CssBaseline } from '@mui/material'; 3 | import { VideoInput } from './pages/VideoInput'; 4 | import { VideoList } from './pages/VideoList'; 5 | import { VideoDetail } from './pages/VideoDetail'; 6 | 7 | // Wrapper component to handle navigation 8 | const VideoInputWrapper = () => { 9 | const navigate = useNavigate(); 10 | 11 | const handleSubmit = async (urls: string[], type: string) => { 12 | try { 13 | const response = await fetch('/api/scrape-videos/', { 14 | method: 'POST', 15 | headers: { 16 | 'Content-Type': 'application/json', 17 | }, 18 | body: JSON.stringify({ urls, scrape_type: type }), 19 | }); 20 | 21 | if (!response.ok) { 22 | throw new Error('Failed to submit videos'); 23 | } 24 | 25 | // Navigate to videos page after successful submission 26 | navigate('/videos'); 27 | } catch (error) { 28 | console.error('Error submitting videos:', error); 29 | throw error; 30 | } 31 | }; 32 | 33 | return ; 34 | }; 35 | 36 | function App() 
{ 37 | return ( 38 | 39 | 40 | 45 | 46 | 47 | 48 | YouTube Video Scraper 49 | 50 | 53 | 56 | 57 | 58 | 59 | 67 | 68 | 69 | } /> 70 | } /> 71 | } /> 72 | 73 | 74 | 75 | 76 | 77 | ); 78 | } 79 | 80 | export default App; 81 | -------------------------------------------------------------------------------- /frontend/src/assets/react.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontend/src/components/PendingChannels.tsx: -------------------------------------------------------------------------------- 1 | import { useEffect, useState } from 'react'; 2 | import { Box, Typography, Paper, CircularProgress, List, ListItem, ListItemText } from '@mui/material'; 3 | 4 | interface Channel { 5 | id: number; 6 | url: string; 7 | status: string; 8 | created_at: string; 9 | } 10 | 11 | export const PendingChannels = () => { 12 | const [channels, setChannels] = useState([]); 13 | const [isLoading, setIsLoading] = useState(true); 14 | const [error, setError] = useState(''); 15 | 16 | const fetchPendingChannels = async () => { 17 | try { 18 | const response = await fetch('/api/pending-channels/'); 19 | if (!response.ok) { 20 | throw new Error('Failed to fetch pending channels'); 21 | } 22 | const data = await response.json(); 23 | setChannels(data); 24 | } catch (err) { 25 | setError('Failed to load pending channels'); 26 | console.error('Error fetching pending channels:', err); 27 | } finally { 28 | setIsLoading(false); 29 | } 30 | }; 31 | 32 | useEffect(() => { 33 | fetchPendingChannels(); 34 | // Poll for updates every 10 seconds 35 | const interval = setInterval(fetchPendingChannels, 10000); 36 | return () => clearInterval(interval); 37 | }, []); 38 | 39 | if (isLoading) { 40 | return ( 41 | 42 | 43 | 44 | ); 45 | } 46 | 47 | if (error) { 48 | return ( 49 | 50 | {error} 51 | 52 | ); 53 | } 54 | 55 | if (channels.length === 0) { 56 | return null; 57 | } 58 | 59 | return ( 60 | 61 | 62 | Pending Channels 63 | 64 | 65 | {channels.map((channel) => ( 66 | 67 | 71 | 72 | 73 | ))} 74 | 75 | 76 | ); 77 | }; 78 | -------------------------------------------------------------------------------- /frontend/src/index.css: -------------------------------------------------------------------------------- 1 | * { 2 | margin: 0; 3 | padding: 0; 4 | box-sizing: border-box; 5 | } 6 | 7 | :root { 8 | font-family: Inter, system-ui, Avenir, Helvetica, Arial, sans-serif; 9 | line-height: 1.5; 10 | font-weight: 400; 11 | color-scheme: light dark; 12 | color: #213547; 13 | background-color: #ffffff; 14 | font-synthesis: none; 15 | text-rendering: optimizeLegibility; 16 | -webkit-font-smoothing: antialiased; 17 | -moz-osx-font-smoothing: grayscale; 18 | } 19 | 20 | html, body { 21 | margin: 0; 22 | padding: 0; 23 | width: 100%; 24 | height: 100%; 25 | overflow-x: hidden; 26 | } 27 | 28 | body { 29 | display: flex; 30 | flex-direction: column; 31 | place-items: center; 32 | min-width: 320px; 33 | } 34 | 35 | #root { 36 | display: flex; 37 | flex-direction: column; 38 | min-height: 100vh; 39 | width: 100%; 40 | } 41 | 42 | a { 43 | font-weight: 500; 44 | color: #646cff; 45 | text-decoration: inherit; 46 | } 47 | a:hover { 48 | color: #535bf2; 49 | } 50 | 51 | h1 { 52 | font-size: 3.2em; 53 | line-height: 1.1; 54 | } 55 | 56 | button { 57 | border-radius: 8px; 58 | border: 1px solid transparent; 59 | padding: 0.6em 1.2em; 60 | font-size: 1em; 61 | font-weight: 500; 62 | font-family: inherit; 63 | 
background-color: #1a1a1a; 64 | cursor: pointer; 65 | transition: border-color 0.25s; 66 | } 67 | button:hover { 68 | border-color: #646cff; 69 | } 70 | button:focus, 71 | button:focus-visible { 72 | outline: 4px auto -webkit-focus-ring-color; 73 | } 74 | 75 | @media (prefers-color-scheme: dark) { 76 | :root { 77 | color: rgba(255, 255, 255, 0.87); 78 | background-color: #242424; 79 | } 80 | a:hover { 81 | color: #747bff; 82 | } 83 | button { 84 | background-color: #f9f9f9; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /frontend/src/main.tsx: -------------------------------------------------------------------------------- 1 | import { StrictMode } from 'react' 2 | import { createRoot } from 'react-dom/client' 3 | import './index.css' 4 | import App from './App.tsx' 5 | 6 | createRoot(document.getElementById('root')!).render( 7 | 8 | 9 | , 10 | ) 11 | -------------------------------------------------------------------------------- /frontend/src/pages/VideoDetail.tsx: -------------------------------------------------------------------------------- 1 | import { useEffect, useState } from 'react'; 2 | import { useParams } from 'react-router-dom'; 3 | import { 4 | Paper, 5 | Typography, 6 | Box, 7 | Grid, 8 | Chip, 9 | CircularProgress, 10 | Card, 11 | CardContent, 12 | Link, 13 | Accordion, 14 | AccordionSummary, 15 | AccordionDetails, 16 | Tooltip 17 | } from '@mui/material'; 18 | import ExpandMoreIcon from '@mui/icons-material/ExpandMore'; 19 | import MonetizationOnIcon from '@mui/icons-material/MonetizationOn'; 20 | 21 | interface Video { 22 | id: number; 23 | url: string; 24 | status: string; 25 | created_at: string; 26 | sponsor?: { 27 | status: string; 28 | is_sponsored: boolean; 29 | brands_mentioned: Array<{ 30 | name: string; 31 | context: string; 32 | }>; 33 | }; 34 | video_metadata?: { 35 | metadata_json?: { 36 | title?: string; 37 | description?: string; 38 | views?: number; 39 | likes?: number; 40 | date_posted?: string; 41 | youtuber?: string; 42 | channel_url?: string; 43 | subscribers?: number; 44 | video_length?: number; 45 | preview_image?: string; 46 | transcript?: string; 47 | }; 48 | }; 49 | } 50 | 51 | export const VideoDetail = () => { 52 | const { id } = useParams<{ id: string }>(); 53 | const [video, setVideo] = useState