├── contentannotation ├── gemini_prompt.txt └── video2annotation.py ├── contentselection ├── content_taxonomy.json └── oracle.py ├── dataset-creation.png ├── dynamicfilters ├── videodynamismfiltering │ ├── Dockerfile │ └── check_static.py └── worddensityfiltering.py ├── finealignment └── video_alignment.py ├── finevideo.gif ├── logo.png ├── rawdataset ├── filter-yt-commons.py └── ytdlps3 │ ├── Dockerfile │ └── download_and_upload.py ├── readme.md └── videocategorization ├── content_taxonomy.json ├── create_prompts.py ├── launchTGI-Slurm.sh └── tgi_inference_client.py /contentannotation/gemini_prompt.txt: -------------------------------------------------------------------------------- 1 | Study the video and provide the following details about the video and the semantic scenes that compose it: 2 | 3 | - characterList: a list of characters that appear in the whole video and a visual description that should allow me to identify them just seeing an image of them. 4 | - scenes: a list of the scenes with the following properties: 5 | - start/end timestamps of the scene 6 | - list of all the characters that appear in the scene 7 | - list of all activities and their timestamps 8 | - list of all props and their timestamps 9 | - list of all video editing details and their start/end timestamps. Details include transitions, effects, music as well as suggestions like segments of the scene that could be removed and why 10 | - scene mood with notes on how the visuals, audio and context contribute to it. Use the following taxonomy returning only the name in your answer {"moods":{"Positive":[{"name":"Happy","description":"Feeling joyful, content, or delighted."},{"name":"Excited","description":"Feeling enthusiastic, energetic, or eager."},{"name":"Calm","description":"Feeling peaceful, relaxed, or serene."},{"name":"Grateful","description":"Feeling appreciative or thankful."},{"name":"Proud","description":"Feeling satisfied with one's achievements or the achievements of others."}],"Negative":[{"name":"Sad","description":"Feeling down, unhappy, or sorrowful."},{"name":"Angry","description":"Feeling irritated, frustrated, or furious."},{"name":"Anxious","description":"Feeling nervous, worried, or uneasy."},{"name":"Lonely","description":"Feeling isolated, disconnected, or abandoned."},{"name":"Bored","description":"Feeling uninterested, disengaged, or restless."}],"Neutral":[{"name":"Indifferent","description":"Feeling neither particularly positive nor negative."},{"name":"Content","description":"Feeling satisfied but not overly excited."},{"name":"Curious","description":"Feeling interested or inquisitive without strong emotion."},{"name":"Confused","description":"Feeling uncertain or unclear but without strong negative feelings."},{"name":"Pensive","description":"Feeling thoughtful or reflective without strong emotional engagement."}]}} 11 | - specific mood changing moments inside the scene, report the timestamp and what we transition from/to in any of the dimensions (visual / auditive) 12 | - scene narrative progression and plot development 13 | - specific narrative moments inside the scene. Report the timestamp and what happened 14 | - character interaction and dynamics descriptions and their start/end timestamps 15 | - specific thematic elements and descriptions 16 | - specific relevant happenings to create deeper meanings and subtexts not explicitly stated that contribute to the richness and depth of the content, timestamp and descriptions 17 | - dynamism score of the scene. Score between 0 and 1. 
1 is highly dynamic 18 | - audio - visual correlation score. Score between 0 and 1. 0 what we see is not correlated with the speech and 1 is highly correlated 19 | 20 | - storylines: a list of the different storylines found and which scenes belong to it. 21 | - Specify where is the climax (scene and timestamp) and if the content is being presented a narrative story, or is it more like a collection of facts or non-narrative information 22 | - if there are scenes not matching storylines, explain how those scenes contribute to the video 23 | - looking at the overall video and the storylines, which segments of the video could be trimmed to make it more dynamic? 24 | - q&a: a list of 5 questions/answers about the video that focus on fine details (objects and or activities), overall story reasoning and mood. Focus on Q&A aspects captured on the audio and the video whenever possible difficult to get only by looking at the transcription. -------------------------------------------------------------------------------- /contentannotation/video2annotation.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import google.generativeai as genai 3 | from openai import OpenAI 4 | import json 5 | import time 6 | from datetime import datetime 7 | from typing import List, Dict, Any, Optional, TypedDict 8 | import instructor 9 | import os 10 | import google.api_core.exceptions 11 | import argparse 12 | 13 | # 14 | # Given an input list of videos, this script downloads them from S3 and annotates the videos with Gemini 15 | # and structures the data with instructor using GPT4o underneath. 16 | # 17 | # The code is prepared to run as a standalone application: 18 | # The first parameter is size_chunk: it basically divide the list of videos in sublists of length size_chunk 19 | # The worker_number decides in which sublist of size size_chunk the current execution will be working on 20 | # --video-list is to specify the json file that contains a list of videoids as a JSON list. 
If that is not provided, it defaults to oracle_videos_server.json 21 | # 22 | 23 | 24 | ### CONFIG ### 25 | 26 | # Directories to download input videos and output annotation results 27 | input_directory = 'videos_minioracle/' 28 | output_directory = 'videos_minioracle_results/' 29 | bucket_name = '' 30 | 31 | GEMINI_PATH="/path/to/your/key/file" 32 | OPENAI_PATH="/path/to/your/key/file" 33 | ### 34 | 35 | 36 | ### Data Schema ### 37 | 38 | class Character(TypedDict): 39 | characterId: str 40 | name: str 41 | description: str 42 | 43 | class Timestamps(TypedDict): 44 | start_timestamp: str 45 | end_timestamp: str 46 | 47 | class Activity(TypedDict): 48 | description: str 49 | timestamp: Timestamps 50 | 51 | class Prop(TypedDict): 52 | name: str 53 | timestamp: Timestamps 54 | 55 | class VideoEditingDetail(TypedDict): 56 | description: str 57 | timestamps: Timestamps 58 | 59 | class KeyMoment(TypedDict): 60 | timestamp: str 61 | changeDescription: str 62 | 63 | class Mood(TypedDict): 64 | description: str 65 | keyMoments: List[KeyMoment] 66 | 67 | class NarrativeProgression(TypedDict): 68 | description: str 69 | timestamp: str 70 | 71 | class CharacterInteraction(TypedDict): 72 | characters: List[str] 73 | description: str 74 | 75 | class Scene(TypedDict): 76 | sceneId: int 77 | title: str 78 | timestamps: Timestamps 79 | cast: List[str] 80 | activities: List[Activity] 81 | props: List[Prop] 82 | videoEditingDetails: List[VideoEditingDetail] 83 | mood: Mood 84 | narrativeProgression: List[NarrativeProgression] 85 | characterInteraction: List[CharacterInteraction] 86 | thematicElements: str 87 | contextualRelevance: str 88 | dynamismScore: float 89 | audioVisualCorrelation: float 90 | 91 | class Climax(TypedDict): 92 | description: str 93 | timestamp: str 94 | 95 | class Storyline(TypedDict): 96 | description: str 97 | scenes: List[int] 98 | climax: Climax 99 | 100 | class QAndA(TypedDict): 101 | question: str 102 | answer: str 103 | 104 | class TrimmingSuggestion(TypedDict, total=False): 105 | timestamps: Timestamps 106 | description: str 107 | 108 | class Schema(TypedDict): 109 | title: str 110 | description: str 111 | characterList: List[Character] 112 | scenes: List[Scene] 113 | storylines: Storyline 114 | qAndA: List[QAndA] 115 | trimmingSuggestions: List[TrimmingSuggestion] 116 | 117 | ### 118 | 119 | class VideoProcessor: 120 | def __init__(self, gemini_api_key_path: str, openai_api_key_path: str): 121 | # Initialize API keys and clients 122 | self.gemini_apikey = self._read_api_key(gemini_api_key_path) 123 | self.openai_apikey = self._read_api_key(openai_api_key_path) 124 | genai.configure(api_key=self.gemini_apikey) 125 | self.clientOpenAI = OpenAI(api_key=self.openai_apikey) 126 | 127 | def _read_api_key(self, path: str) -> str: 128 | with open(path, "r") as file: 129 | return file.read().strip() 130 | 131 | 132 | def upload_video(self, file_path: str) -> Dict[str, Any]: 133 | print(f"Uploading file... 
{file_path}") 134 | try: 135 | video_file = genai.upload_file(path=file_path) 136 | 137 | while video_file.state.name == "PROCESSING": 138 | time.sleep(10) 139 | video_file = genai.get_file(video_file.name) 140 | 141 | if video_file.state.name == "FAILED": 142 | return {"error": "Upload failed", "video_file": None} 143 | return {"video_file": video_file} 144 | 145 | except Exception as e: 146 | return {"error": str(e), "video_file": None} 147 | 148 | 149 | 150 | def process_video(self, video_file: Any, addition_to_prompt=None) -> Dict[str, Optional[str]]: 151 | if "error" in video_file: 152 | return {"error": "video_file error: " + video_file["error"], "gemini_text": None} 153 | 154 | max_retries = 5 155 | attempt = 0 156 | delay = 2 # in seconds 157 | 158 | while attempt < max_retries: 159 | try: 160 | print(f"Processing {video_file['video_file'].display_name} (Attempt {attempt + 1})") 161 | prompt = open("gemini_prompt.txt", "r").read() 162 | if addition_to_prompt: 163 | print(f"\t adding addition to prompt: {addition_to_prompt}") 164 | prompt = prompt + addition_to_prompt 165 | model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest") 166 | response = model.generate_content( 167 | [video_file['video_file'], prompt], 168 | request_options={"timeout": 600}, 169 | safety_settings=[ 170 | {"category": genai.types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, 171 | "threshold": genai.types.HarmBlockThreshold.BLOCK_NONE}, 172 | {"category": genai.types.HarmCategory.HARM_CATEGORY_HARASSMENT, 173 | "threshold": genai.types.HarmBlockThreshold.BLOCK_NONE}, 174 | {"category": genai.types.HarmCategory.HARM_CATEGORY_HATE_SPEECH, 175 | "threshold": genai.types.HarmBlockThreshold.BLOCK_NONE}, 176 | {"category": genai.types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, 177 | "threshold": genai.types.HarmBlockThreshold.BLOCK_NONE} 178 | ] 179 | ) 180 | 181 | if not response.candidates: 182 | return {"error": "No candidates returned. Feedback: " + str(response.prompt_feedback), "gemini_text": None} 183 | #Cleaning up the analyzed file 184 | genai.delete_file(video_file['video_file'].name) 185 | 186 | return {"gemini_text": response.text} 187 | 188 | except google.api_core.exceptions.InternalServerError as e: 189 | print(f"InternalServerError occurred: {e}. Retrying in {delay} seconds...") 190 | attempt += 1 191 | time.sleep(delay) 192 | delay *= 2 # Exponential backoff 193 | 194 | except Exception as e: 195 | if "The read operation timed out" in str(e) or "record layer failure" in str(e): 196 | print(f"Gemini error: {str(e)}. 
Retrying in {delay} seconds...") 197 | attempt +=1 198 | time.sleep(delay) 199 | delay *= 2 200 | else: 201 | return {"error": str(e), "gemini_text": None} 202 | 203 | # If all retries fail 204 | return {"error": f"Failed after {max_retries} attempts due to InternalServerError / timeouts / SSL.", "gemini_text": None} 205 | 206 | 207 | 208 | def obtain_json(self, gemini_answer: Optional[str]) -> Dict[str, Optional[str]]: 209 | 210 | if gemini_answer is None or (isinstance(gemini_answer, dict) and "error" in gemini_answer): 211 | return {"error": gemini_answer.get("error") if gemini_answer else "No Gemini answer", "json_result": None} 212 | 213 | try: 214 | # Patch the OpenAI client 215 | client = instructor.from_openai(self.clientOpenAI) 216 | promptOpenAI = gemini_answer 217 | completion = client.chat.completions.create( 218 | model="gpt-4o-2024-08-06", 219 | response_model=Schema, 220 | messages=[ 221 | {"role": "user", "content": promptOpenAI}, 222 | ] 223 | ) 224 | 225 | return {"json_result": completion.json()} 226 | except Exception as e: 227 | return {"error": str(e), "json_result": None} 228 | def prep_return(self, final_answer=None, gemini_error = None, gemini_raw_result=None, 229 | instructor_error = None, instructor_raw_result=None): 230 | return { 231 | "final_answer": final_answer, 232 | "gemini": { 233 | "error": gemini_error, 234 | "raw_result": gemini_raw_result 235 | }, 236 | "instructor": { 237 | "error": instructor_error, 238 | "raw_result": instructor_raw_result 239 | } 240 | } 241 | 242 | def process(self, file_path: str) -> Dict[str, Any]: 243 | 244 | # Upload video to Gemini 245 | gemini_result = self.upload_video(file_path) 246 | if gemini_result.get("error"): 247 | return self.prep_return(gemini_error=gemini_result['error']) 248 | 249 | # Process video with Gemini 250 | gemini_text = self.process_video(gemini_result) 251 | if gemini_text.get("error"): 252 | return self.prep_return(gemini_error=gemini_text['error']) 253 | 254 | gemini_out_text = gemini_text.get("gemini_text") 255 | if gemini_out_text is None or gemini_text == "": 256 | return self.prep_return(gemini_error="Empty gemini answer", 257 | gemini_raw_result=gemini_out_text) 258 | 259 | 260 | # Obtain JSON from the instructor 261 | instructor_result = self.obtain_json(gemini_out_text) 262 | 263 | if "IncompleteOutputException" in instructor_result.get("error", "") or "ValidationError" in instructor_result.get("error", ""): 264 | print(f"\tRetrying full pipeline due to Instructor exception: {instructor_result['error']}") 265 | # Retry processing the video 266 | gemini_result = self.upload_video(file_path) 267 | gemini_text = self.process_video(gemini_result, addition_to_prompt=" be concise with your answer") 268 | print("\t retry completed") 269 | 270 | # Check if there was an error during reprocessing 271 | if gemini_text.get("error"): 272 | return self.prep_return(gemini_error=gemini_text['error']) 273 | gemini_out_text = gemini_text.get("gemini_text") 274 | if gemini_out_text is None or gemini_text == "": 275 | return self.prep_return(gemini_error="Empty gemini answer", 276 | gemini_raw_result=gemini_out_text) 277 | 278 | # Retry obtaining JSON from the instructor with the new Gemini text 279 | instructor_result = self.obtain_json(gemini_text.get("gemini_text")) 280 | 281 | 282 | 283 | # Prepare the final response 284 | final_answer = json.loads(instructor_result["json_result"]) if instructor_result["json_result"] and not instructor_result.get("error") else None 285 | return 
self.prep_return(final_answer=final_answer, 286 | gemini_raw_result=gemini_text.get("gemini_text"), 287 | instructor_raw_result=instructor_result.get("json_result",None), 288 | instructor_error=instructor_result.get("error",None)) 289 | 290 | # AWS S3 Configuration 291 | session = boto3.Session() 292 | s3_client = session.client('s3') 293 | 294 | # Ensure the input and output directories exist 295 | os.makedirs(input_directory, exist_ok=True) 296 | os.makedirs(output_directory, exist_ok=True) 297 | 298 | # Function to check if a result or error file exists for a video 299 | def result_exists(video_filename): 300 | video_id = os.path.splitext(video_filename)[0] 301 | result_file = os.path.join(output_directory, f"{video_id}.json") 302 | error_file = os.path.join(output_directory, f"errors_{video_id}.json") 303 | return os.path.exists(result_file) or os.path.exists(error_file) 304 | 305 | # Function to download video from S3 306 | def download_video_from_s3(video_key, local_path): 307 | bucket_name = '' 308 | try: 309 | s3_client.download_file(bucket_name, video_key, local_path) 310 | print(f"Downloaded {video_key} to {local_path}") 311 | return True 312 | except Exception as e: 313 | print(f"Failed to download {video_key} from S3: {e}") 314 | return False 315 | 316 | def process_single_video(video_id, worker_number): 317 | videos_path = 'path/' 318 | video_key = f'{videos_path}/{video_id}.mp4' 319 | video_filename = f'{video_id}.mp4' 320 | local_path = os.path.join(input_directory, video_filename) 321 | 322 | if result_exists(video_filename): 323 | print(f"Skipping {video_filename}, result already exists.") 324 | return 325 | 326 | # Download video from S3 327 | if not download_video_from_s3(video_key, local_path): 328 | # Handle download failure 329 | error_data = {"error": "File not found in S3"} 330 | error_file_path = os.path.join(output_directory, f"errors_{video_id}.json") 331 | with open(error_file_path, "w") as f: 332 | json.dump(error_data, f, indent=4) 333 | 334 | # Update status report for download failure 335 | timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 336 | status_report = f"{timestamp} - {video_id} - failed - File not found in S3\n" 337 | print(status_report) 338 | with open("status.txt", "a") as f: 339 | f.write(status_report) 340 | 341 | return 342 | 343 | # Process the video using VideoProcessor class 344 | processor = VideoProcessor(gemini_api_key_path=GEMINI_PATH, openai_api_key_path=OPENAI_PATH) 345 | result = processor.process(local_path) 346 | video_id = os.path.splitext(os.path.basename(local_path))[0] 347 | 348 | # Save final answer to JSON if available 349 | if result.get("final_answer") is not None: 350 | with open(os.path.join(output_directory, f"{video_id}.json"), "w") as f: 351 | json.dump(result["final_answer"], f, indent=4) 352 | status = "successful" 353 | else: 354 | status = "failed" 355 | 356 | # Save errors to JSON if any errors exist 357 | errors = {} 358 | if (result.get("gemini", {}).get("error") is not None) or (result.get("instructor", {}).get("error") is not None): 359 | gemini_raw = result["gemini"].get("raw_result") 360 | errors = { 361 | "gemini_error": result["gemini"].get("error"), 362 | "instructor_error": result["instructor"].get("error"), 363 | "gemini_raw_result": gemini_raw 364 | } 365 | with open(os.path.join(output_directory, f"errors_{video_id}.json"), "w") as f: 366 | json.dump(errors, f, indent=4) 367 | 368 | # Prepare the status report 369 | timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 370 | error_details = 
', '.join(filter(None, [result.get("gemini", {}).get("error"), result.get("instructor", {}).get("error")])) 371 | status_report = f"{timestamp} - {video_id} - {status} - {error_details if error_details else 'None'}\n" 372 | print(status_report) 373 | 374 | # Append the status report to status.txt 375 | if worker_number is None: 376 | with open("status.txt", "a") as f: 377 | f.write(status_report) 378 | else: 379 | with open(f"status/status_{worker_number}.txt", "a") as f: 380 | f.write(status_report) 381 | 382 | # Remove the video file after processing 383 | os.remove(local_path) 384 | print(f"Deleted local file {local_path} after processing.") 385 | 386 | def process_chunk(videos_to_process, size_chunk, worker_number): 387 | # Calculate start and end indices for this worker's chunk 388 | start_index = worker_number * size_chunk 389 | end_index = min(start_index + size_chunk, len(videos_to_process)) 390 | 391 | # Process videos in this worker's chunk 392 | for video_id in videos_to_process[start_index:end_index]: 393 | process_single_video(video_id, worker_number) 394 | 395 | if __name__ == "__main__": 396 | # Parse command-line arguments 397 | parser = argparse.ArgumentParser(description='Process videos in chunks.') 398 | parser.add_argument('size_chunk', type=int, help='Size of each chunk to process') 399 | parser.add_argument('worker_number', type=int, help='Worker number (zero-indexed)') 400 | parser.add_argument('--video_list', type=str, help='Optional video list file in JSON format') 401 | args = parser.parse_args() 402 | 403 | # Load the list of videos 404 | if args.video_list: 405 | with open(args.video_list, 'r') as f: 406 | videos_to_process = json.load(f) 407 | print(f"Using provided video list: {args.video_list}") 408 | else: 409 | with open('oracle_videos_server.json', 'r') as f: 410 | videos_to_process = json.load(f) 411 | print("Using default video list: oracle_videos_server.json") 412 | 413 | # Process the assigned chunk 414 | process_chunk(videos_to_process, args.size_chunk, args.worker_number) 415 | 416 | -------------------------------------------------------------------------------- /contentselection/content_taxonomy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Entertainment": { 3 | "Comedy": { 4 | "Stand-up": {}, 5 | "Sketches": {}, 6 | "Parodies": {} 7 | }, 8 | "Music": { 9 | "Music Videos": {}, 10 | "Covers": {}, 11 | "Remixes": {}, 12 | "Lyric Videos": {} 13 | }, 14 | "Movies & Trailers": { 15 | "Film Trailers": {}, 16 | "Short Films": {}, 17 | "Movie Reviews": {} 18 | }, 19 | "Gaming": { 20 | "Let's Plays": {}, 21 | "Game Reviews": {}, 22 | "Walkthroughs": {}, 23 | "Game Commentary": {} 24 | }, 25 | "Vlogs": { 26 | "Daily Vlogs": {}, 27 | "Travel Vlogs": {}, 28 | "Storytime": {} 29 | }, 30 | "Livestreams": { 31 | "Gaming Livestreams": {}, 32 | "Q&A Sessions": {}, 33 | "Event Livestreams": {} 34 | } 35 | }, 36 | "Education": { 37 | "Tutorials": { 38 | "Software Tutorials": {}, 39 | "DIY & Crafts": {}, 40 | "Cooking Tutorials": {} 41 | }, 42 | "Lectures & Talks": { 43 | "Academic Lectures": {}, 44 | "TED Talks": {}, 45 | "Motivational Talks": {} 46 | }, 47 | "Science & Technology": { 48 | "Science Explainers": {}, 49 | "Tech Reviews": {}, 50 | "Engineering Projects": {} 51 | }, 52 | "Language Learning": { 53 | "Language Lessons": {}, 54 | "Pronunciation Guides": {} 55 | }, 56 | "History & Culture": { 57 | "Documentaries": {}, 58 | "Cultural Explainers": {}, 59 | "Historical Analysis": {} 60 | }, 61 | "Business & 
Finance": { 62 | "Entrepreneurship": {}, 63 | "Investment Guides": {}, 64 | "Marketing Strategies": {} 65 | } 66 | }, 67 | "Lifestyle": { 68 | "Health & Fitness": { 69 | "Workout Routines": {}, 70 | "Nutrition Guides": {}, 71 | "Mental Health Tips": {} 72 | }, 73 | "Fashion & Beauty": { 74 | "Makeup Tutorials": {}, 75 | "Fashion Hauls": {}, 76 | "Skincare Routines": {} 77 | }, 78 | "Travel": { 79 | "Destination Guides": {}, 80 | "Travel Tips": {}, 81 | "Travel Vlogs": {} 82 | }, 83 | "Food & Cooking": { 84 | "Recipe Videos": {}, 85 | "Cooking Shows": {}, 86 | "Food Reviews": {} 87 | }, 88 | "Home & Garden": { 89 | "Home Improvement": {}, 90 | "Gardening Tips": {}, 91 | "Interior Design": {} 92 | }, 93 | "Parenting & Family": { 94 | "Parenting Tips": {}, 95 | "Family Vlogs": {}, 96 | "Childcare Advice": {} 97 | } 98 | }, 99 | "News & Politics": { 100 | "News Reports": { 101 | "Breaking News": {}, 102 | "Political News": {}, 103 | "World News": {} 104 | }, 105 | "Opinion & Commentary": { 106 | "Political Commentary": {}, 107 | "Social Commentary": {}, 108 | "Editorials": {} 109 | }, 110 | "Interviews": { 111 | "Celebrity Interviews": {}, 112 | "Political Interviews": {}, 113 | "Expert Interviews": {} 114 | }, 115 | "Debates": { 116 | "Political Debates": {}, 117 | "Social Issue Debates": {} 118 | } 119 | }, 120 | "Sports": { 121 | "Highlights & Replays": { 122 | "Game Highlights": {}, 123 | "Match Replays": {} 124 | }, 125 | "Sports Commentary": { 126 | "Analysis Shows": {}, 127 | "Sports Talk Shows": {} 128 | }, 129 | "Athlete Profiles": { 130 | "Career Highlights": {}, 131 | "Documentary Profiles": {} 132 | }, 133 | "Fitness & Training": { 134 | "Athlete Workouts": {}, 135 | "Training Techniques": {} 136 | } 137 | }, 138 | "Art & Creativity": { 139 | "Visual Arts": { 140 | "Painting Tutorials": {}, 141 | "Drawing Tutorials": {}, 142 | "Art Exhibitions": {} 143 | }, 144 | "Photography & Film": { 145 | "Photography Tips": {}, 146 | "Cinematography": {}, 147 | "Short Films": {} 148 | }, 149 | "Crafts & DIY": { 150 | "Home Crafts": {}, 151 | "DIY Projects": {}, 152 | "Upcycling": {} 153 | }, 154 | "Writing & Literature": { 155 | "Writing Tips": {}, 156 | "Book Reviews": {}, 157 | "Poetry Readings": {} 158 | } 159 | }, 160 | "Science & Technology": { 161 | "Tech Reviews": { 162 | "Gadget Reviews": {}, 163 | "Software Reviews": {} 164 | }, 165 | "Science Explainers": { 166 | "Physics": {}, 167 | "Biology": {}, 168 | "Chemistry": {} 169 | }, 170 | "Space Exploration": { 171 | "Astronomy": {}, 172 | "Space Missions": {} 173 | }, 174 | "Engineering": { 175 | "Mechanical Engineering": {}, 176 | "Electrical Engineering": {} 177 | }, 178 | "Environmental Science": { 179 | "Climate Change": {}, 180 | "Conservation Efforts": {} 181 | }, 182 | "Artificial Intelligence": { 183 | "AI Concepts": {}, 184 | "Machine Learning Tutorials": {} 185 | } 186 | }, 187 | "Automotive": { 188 | "Car Reviews": {}, 189 | "Car Modifications": {}, 190 | "Driving Tutorials": {}, 191 | "Motorsports": { 192 | "Racing Highlights": {}, 193 | "Motorsport Commentary": {} 194 | }, 195 | "Off-Roading": {} 196 | }, 197 | "Hobbies & Interests": { 198 | "Collecting": { 199 | "Toy Collections": {}, 200 | "Stamp Collections": {}, 201 | "Memorabilia": {} 202 | }, 203 | "Board Games & Puzzles": { 204 | "Gameplay Tutorials": {}, 205 | "Game Reviews": {} 206 | }, 207 | "Outdoor Activities": { 208 | "Camping": {}, 209 | "Hiking": {}, 210 | "Fishing": {} 211 | }, 212 | "Arts & Crafts": { 213 | "Knitting": {}, 214 | "Pottery": {}, 215 | 
"Scrapbooking": {} 216 | }, 217 | "Listicles & Rankings": { 218 | "Top 10 Videos": {}, 219 | "Best of [Category]": {}, 220 | "Ranked Lists": {}, 221 | "Must-See [Topic]": {}, 222 | "Buyer’s Guides": {} 223 | }, 224 | "Miscellaneous": { 225 | "ASMR": {}, 226 | "Unboxing Videos": {}, 227 | "Reaction Videos": {}, 228 | "Pranks": {}, 229 | "Social Experiments": {} 230 | } 231 | } 232 | } 233 | -------------------------------------------------------------------------------- /contentselection/oracle.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.preprocessing import MinMaxScaler 4 | import json 5 | 6 | # 7 | # Given a pandas dataframe with a list of videos and the metadata extracted from YT-Commons, 8 | # this script creates a new dataframe with a list of videoids that the target the hours of video that we want to collect. 9 | # 10 | 11 | ### CONFIG ### 12 | input_pkl = 'path_to_your_current_videos_df.pkl' 13 | output_pkl = 'path_to_your_output_df.pkl' 14 | taxonomy_path = 'content_taxonomy.json' 15 | target_hours = 4500 16 | ### 17 | 18 | # Step 1: Preprocess the Data 19 | def preprocess_df(df): 20 | # Fill NaNs with 0 or suitable values 21 | df['comment_count'] = df['comment_count'].fillna(0) 22 | df['view_count'] = df['view_count'].fillna(0) 23 | df['like_count'] = df['like_count'].fillna(0) 24 | df['channel_follower_count'] = df['channel_follower_count'].fillna(0) 25 | df['duration_seconds'] = df['duration_seconds'].fillna(0) 26 | 27 | # Normalize numerical columns for fair weighting 28 | scaler = MinMaxScaler() 29 | df[['comment_count', 'view_count', 'like_count']] = scaler.fit_transform( 30 | df[['comment_count', 'view_count', 'like_count']] 31 | ) 32 | 33 | return df 34 | 35 | # Step 2: Compute User Activity Score 36 | def compute_user_activity(df, weights=(0.2, 0.5, 0.3)): 37 | # Weights: 0.2 for comments, 0.5 for views, 0.3 for likes 38 | df['user_activity_score'] = ( 39 | weights[0] * df['comment_count'] + 40 | weights[1] * df['view_count'] + 41 | weights[2] * df['like_count'] 42 | ) 43 | return df 44 | 45 | # Step 3: Map Inferred Categories to Higher Taxonomy Levels 46 | # Note: this was not used in the final version of the content selection algorithm but is useful data that we let in the dataset. 47 | def map_to_parent_categories(df, taxonomy): 48 | """ 49 | Maps each inferred category in the DataFrame to its top-level parent category 50 | in the hierarchical taxonomy. 51 | 52 | :param df: DataFrame containing video data with an 'inferred_category' column. 53 | :param taxonomy: A nested dictionary representing the hierarchical taxonomy. 54 | :return: DataFrame with an added 'parent_category' column representing the top-level parent category. 55 | """ 56 | 57 | # Helper function to find the top-level parent category 58 | def find_top_parent_category(leaf_name, taxonomy): 59 | """ 60 | Finds the top-level parent category of a given leaf in the hierarchical taxonomy. 61 | 62 | :param leaf_name: A string representing the leaf node to search for. 63 | :param taxonomy: A dictionary representing the full hierarchical taxonomy. 64 | :return: The top-level parent category of the given leaf if found, else None. 
65 | """ 66 | def recursive_search(taxonomy, leaf_name, current_top_category): 67 | for category, subcategories in taxonomy.items(): 68 | if category == leaf_name: 69 | # Found the leaf node; return the top-level category 70 | return current_top_category 71 | if isinstance(subcategories, dict): 72 | # Continue searching deeper 73 | found_category = recursive_search(subcategories, leaf_name, current_top_category) 74 | if found_category: 75 | return found_category 76 | return None 77 | 78 | # Start the search with top-level categories 79 | for top_category, subcategories in taxonomy.items(): 80 | result = recursive_search(subcategories, leaf_name, top_category) 81 | if result: 82 | return result 83 | 84 | return None 85 | 86 | # Map each inferred category to its top-level parent category 87 | df['parent_category'] = df['inferred_category'].apply(lambda x: find_top_parent_category(x, taxonomy)) 88 | 89 | return df 90 | 91 | 92 | # Step 4: Select Videos for Diversity and Total Duration 93 | def select_videos(df, target_hours=4500): 94 | target_seconds = target_hours * 3600 # Convert hours to seconds 95 | selected_videos = pd.DataFrame() 96 | 97 | # Calculate the total number of inferred categories 98 | inferred_categories = df['inferred_category'].unique() 99 | total_categories = len(inferred_categories) 100 | 101 | # Calculate the initial target seconds per inferred category 102 | target_seconds_per_category = target_seconds / total_categories 103 | 104 | # Shuffle rows to mix categories and channels 105 | df = df.sample(frac=1, random_state=42).reset_index(drop=True) 106 | 107 | # Initialize dictionary to keep track of selected durations per inferred category 108 | category_durations = {category: 0 for category in inferred_categories} 109 | 110 | # Define a progressive penalty for repeated channels 111 | channel_penalty_increment = 0.1 # Incremental penalty for each additional video from the same channel 112 | 113 | # Process each inferred category 114 | for inferred_category in inferred_categories: 115 | category_df = df[df['inferred_category'] == inferred_category] 116 | 117 | # Sort by user activity score and channel follower count in reverse order 118 | category_df = category_df.sort_values( 119 | by=['user_activity_score', 'channel_follower_count'], 120 | ascending=[False, True] 121 | ) 122 | 123 | current_duration = 0 124 | channel_counter = {} 125 | 126 | for _, row in category_df.iterrows(): 127 | if current_duration >= target_seconds_per_category: 128 | break 129 | 130 | channel = row['channel'] 131 | 132 | # Calculate the penalty based on the number of videos already selected from this channel 133 | penalty_factor = 1 - (channel_counter.get(channel, 0) * channel_penalty_increment) 134 | penalty_factor = max(penalty_factor, 0) # Ensure penalty factor doesn't go negative 135 | 136 | # Apply penalty by using a probability check 137 | if np.random.rand() < penalty_factor: 138 | selected_videos = pd.concat([selected_videos, pd.DataFrame([row])]) 139 | current_duration += row['duration_seconds'] 140 | category_durations[inferred_category] += row['duration_seconds'] 141 | channel_counter[channel] = channel_counter.get(channel, 0) + 1 142 | 143 | # Update target duration if some categories can't meet the target 144 | remaining_seconds = target_seconds - selected_videos['duration_seconds'].sum() 145 | remaining_categories = total_categories - len(selected_videos['inferred_category'].unique()) 146 | if remaining_categories > 0: 147 | target_seconds_per_category = remaining_seconds / 
remaining_categories 148 | 149 | # Adjust to match exactly the target duration or close 150 | selected_videos = selected_videos.sort_values(by='duration_seconds', ascending=True) 151 | 152 | final_selected = pd.DataFrame() 153 | total_duration = 0 154 | 155 | for _, row in selected_videos.iterrows(): 156 | if total_duration + row['duration_seconds'] <= target_seconds: 157 | final_selected = pd.concat([final_selected, pd.DataFrame([row])]) 158 | total_duration += row['duration_seconds'] 159 | 160 | return final_selected 161 | 162 | def main_algorithm(df, taxonomy_file, target_hours = 4500): 163 | df = preprocess_df(df) 164 | df = compute_user_activity(df) 165 | 166 | # Load taxonomy from JSON file 167 | with open(taxonomy_file, 'r') as file: 168 | taxonomy = json.load(file) 169 | 170 | # Map inferred categories to their parent categories 171 | df = map_to_parent_categories(df, taxonomy) 172 | 173 | # Select videos based on updated criteria 174 | selected_videos = select_videos(df, target_hours=target_hours) 175 | 176 | print(f"Total selected videos: {len(selected_videos)}") 177 | print(f"Total duration (seconds): {selected_videos['duration_seconds'].sum()}") 178 | 179 | return selected_videos 180 | 181 | # Run the algorithm 182 | df = pd.read_pickle(input_pkl) 183 | selected_videos_df = main_algorithm(df, taxonomy_path, target_hours=target_hours) 184 | selected_videos_df.to_pickle(output_pkl) -------------------------------------------------------------------------------- /dataset-creation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/fineVideo/b961b6ade22910d041aa75451afa94e454bca372/dataset-creation.png -------------------------------------------------------------------------------- /dynamicfilters/videodynamismfiltering/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Python image with necessary packages 2 | FROM python:3.9-slim 3 | 4 | # Install ffmpeg and other dependencies 5 | RUN apt-get update && \ 6 | apt-get install -y ffmpeg && \ 7 | apt-get clean && \ 8 | rm -rf /var/lib/apt/lists/* 9 | 10 | # Set the working directory 11 | WORKDIR /app 12 | 13 | # Copy the Python script into the container 14 | COPY check_static.py . 
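# Example usage (a minimal sketch with hypothetical image and bucket names, not part of the repo):
# check_static.py reads its configuration from the VIDEO_BUCKET, BUCKET_VIDEO_FOLDER_PATH and VIDEO_IDS
# environment variables (normally injected by AWS Batch), so a local test run could look like:
#   docker build -t videodynamism-filter .
#   docker run -e VIDEO_BUCKET=my-bucket -e BUCKET_VIDEO_FOLDER_PATH=videos \
#              -e VIDEO_IDS=videoid1,videoid2 videodynamism-filter
# AWS credentials for boto3 must also be available inside the container (e.g. an IAM role or mounted credentials).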
15 | 16 | # Install Python dependencies 17 | RUN pip install boto3 ffmpeg-python 18 | 19 | # Command to run the script 20 | CMD ["python", "check_static.py"] 21 | -------------------------------------------------------------------------------- /dynamicfilters/videodynamismfiltering/check_static.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import subprocess 3 | import os 4 | import math 5 | 6 | # Initialize the S3 client 7 | s3 = boto3.client('s3') 8 | 9 | def download_video_from_s3(bucket, path, video_id): 10 | """Download a video from S3 given a video ID.""" 11 | # Safely handle filenames with hyphens and special characters 12 | video_file = f"./{video_id}.mp4" 13 | s3_key = f"{path}/{video_id}.mp4" 14 | try: 15 | s3.download_file(bucket, s3_key, video_file) 16 | print(f"Downloaded {video_file} from s3://{bucket}/{s3_key}") 17 | return video_file 18 | except Exception as e: 19 | print(f"Error downloading {video_file}: {e}") 20 | return None 21 | 22 | def check_static_video(video_file, segment_duration=60, freeze_n=0.05, freeze_d=50, threshold=0.4): 23 | """Use ffmpeg freezedetect to check if a video has significant static content.""" 24 | 25 | # Get video duration using ffprobe 26 | try: 27 | result = subprocess.run( 28 | ["ffprobe", "-v", "error", "-show_entries", "format=duration", 29 | "-of", "default=noprint_wrappers=1:nokey=1", video_file], 30 | capture_output=True, text=True 31 | ) 32 | video_duration = float(result.stdout.strip()) 33 | except Exception as e: 34 | print(f"Error getting video duration for {video_file}: {e}") 35 | return None 36 | 37 | # Calculate the number of segments to analyze 38 | num_segments = math.ceil(video_duration / segment_duration) 39 | freeze_count = 0 40 | 41 | # Analyze video in segments 42 | for start_time in range(0, int(video_duration), segment_duration): 43 | try: 44 | command = [ 45 | "ffmpeg", "-hide_banner", "-ss", str(start_time), "-i", video_file, 46 | "-t", str(segment_duration), "-vf", f"freezedetect=n={freeze_n}:d={freeze_d}", "-an", "-f", "null", "-" 47 | ] 48 | result = subprocess.run(command, capture_output=True, text=True) 49 | 50 | # Check the stderr output for freeze detection 51 | if "freezedetect" in result.stderr: 52 | print(f"Static content detected in segment starting at {start_time} of {video_file}.") 53 | freeze_count += 1 54 | except Exception as e: 55 | print(f"Error processing segment starting at {start_time} of {video_file}: {e}") 56 | return None 57 | 58 | # Calculate the percentage of segments with freezes 59 | freeze_percentage = freeze_count / num_segments 60 | 61 | print(f"Freeze percentage for {video_file}: {freeze_percentage:.2%}") 62 | 63 | # Determine if the video is considered static based on threshold 64 | return freeze_percentage >= threshold 65 | 66 | def upload_result_to_s3(bucket, video_id, is_static): 67 | """Upload the result to S3 based on whether the video is static or dynamic.""" 68 | s3_key = f"{'static' if is_static else 'dynamic'}/{video_id}.txt" 69 | try: 70 | s3.put_object(Bucket=bucket, Key=s3_key, Body="") 71 | print(f"Uploaded result to s3://{bucket}/{s3_key}") 72 | except Exception as e: 73 | print(f"Error uploading result for {video_id}: {e}") 74 | 75 | def main(): 76 | # Environment variables set in AWS Batch 77 | bucket = os.environ.get("VIDEO_BUCKET") 78 | video_ids = os.environ.get("VIDEO_IDS").split(",") 79 | video_path = os.environ.get("BUCKET_VIDEO_FOLDER_PATH") 80 | 81 | for video_id in video_ids: 82 | # Download video from S3 
83 | video_file = download_video_from_s3(bucket, video_path, video_id) 84 | if not video_file: 85 | continue 86 | 87 | # Check if the video is static 88 | is_static = check_static_video(video_file) 89 | if is_static is None: 90 | continue 91 | 92 | # Upload result to S3 93 | upload_result_to_s3(bucket, video_id, is_static) 94 | 95 | # Clean up downloaded video file 96 | os.remove(video_file) 97 | 98 | if __name__ == "__main__": 99 | main() 100 | -------------------------------------------------------------------------------- /dynamicfilters/worddensityfiltering.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # 4 | # Given a pandas dataframe with a list of videos and the metadata extracted from YT-Commons, 5 | # this script creates the columns duration_seconds and word_density with the goal to study word_density across the dataset 6 | # Finally it drops all entries in the dataframe with word density < 0.5 7 | # 8 | 9 | ### CONFIG ### 10 | input_pkl = 'path_to_your_input_df.pkl' 11 | output_pkl = 'path_to_your_output_df.pkl' 12 | visualize = False # Toggle to true to inspect some results close to 1 and 0.5 word density values. 13 | ### 14 | 15 | 16 | 17 | df = pd.read_pickle(input_pkl) 18 | 19 | #Adding word_density and duration_seconds to the dataframe 20 | def duration_to_seconds(duration): 21 | if pd.isnull(duration): 22 | return 0 # or np.nan or another default 23 | parts = duration.split(':') 24 | parts = [int(p) for p in parts] 25 | if len(parts) == 3: # hh:mm:ss 26 | return parts[0] * 3600 + parts[1] * 60 + parts[2] 27 | elif len(parts) == 2: # mm:ss 28 | return parts[0] * 60 + parts[1] 29 | elif len(parts) == 1: # ss 30 | return parts[0] 31 | else: 32 | return 0 # or np.nan if format is unrecognized 33 | 34 | # Apply the conversion function to the 'duration_string' column 35 | df['duration_seconds'] = df['duration_string'].apply(duration_to_seconds) 36 | 37 | # Calculate word density 38 | # Word density is the number of words per second, so we divide word_count by duration_seconds 39 | df['word_density'] = df.apply(lambda row: row['word_count'] / row['duration_seconds'] 40 | if row['duration_seconds'] > 0 else 0, axis=1) 41 | 42 | 43 | 44 | if visualize: 45 | from tabulate import tabulate 46 | #Visualizing some results 47 | def get_samples_near_target(df, target, range_width=0.1, num_samples=3): 48 | """ 49 | Get samples from the DataFrame that have 'word_density' close to the target value. 50 | 51 | :param df: DataFrame to sample from. 52 | :param target: The target word density to find samples around. 53 | :param range_width: The width of the range around the target value. 54 | :param num_samples: Number of samples to return. 55 | :return: A DataFrame with samples close to the target density. 
56 | """ 57 | # Define the range around the target 58 | lower_bound = target - range_width 59 | upper_bound = target + range_width 60 | 61 | # Filter and sample 62 | samples = df[(df['word_density'] >= lower_bound) & (df['word_density'] <= upper_bound)].sample(n=num_samples, random_state=1) 63 | return samples 64 | 65 | close_to_1 = get_samples_near_target(df, 1, num_samples = 100)[['video_id', 'duration_string', 'title']] 66 | print(tabulate(close_to_1,headers='keys', tablefmt='pretty', showindex=False)) 67 | 68 | close_to_05 = get_samples_near_target(df, 0.5, num_samples = 100)[['video_id', 'duration_string', 'title']] 69 | print(tabulate(close_to_05,headers='keys', tablefmt='pretty', showindex=False)) 70 | 71 | 72 | # We cut at 0.5 73 | df = df.loc[df['word_density'] > 0.5] 74 | print(f"Total videos: {len(df)}") 75 | df.to_pickle(output_pkl) -------------------------------------------------------------------------------- /finealignment/video_alignment.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List 4 | from scenedetect import VideoManager, SceneManager 5 | from scenedetect.detectors import ContentDetector 6 | from scenedetect.frame_timecode import FrameTimecode 7 | import argparse 8 | import re 9 | import json 10 | import boto3 11 | from datetime import datetime 12 | 13 | # 14 | # Given an input list of videos, this script downloads them from S3 and aligns the metadata from those videos generated with video2annotation.py with the videos itself. 15 | # 16 | # The code is prepared to run as a standalone application: 17 | # The first parameter is size_chunk: it basically divide the list of videos in sublists of length size_chunk 18 | # The worker_number decides in which sublist of size size_chunk the current execution will be working on 19 | # --video-list is to specify the json file that contains a list of videoids as a JSON list. 
If that is not provided, it defaults to video_alignment_to_process.json 20 | # 21 | 22 | 23 | ### CONFIG ### 24 | bucket_name = '' 25 | video_folder_path = 'videos_minioracle/' 26 | json_folder_path = 'videos_minioracle_results/' 27 | output_folder_path = 'results_minioracle_aligned/' 28 | ### 29 | 30 | # AWS S3 Configuration - specify your personal profile 31 | session = boto3.Session() 32 | s3_client = session.client('s3') 33 | 34 | # Function to download video from S3 35 | def download_video_from_s3(video_key, local_path): 36 | try: 37 | s3_client.download_file(bucket_name, video_key, local_path) 38 | print(f"Downloaded {video_key} to {local_path}") 39 | return True 40 | except Exception as e: 41 | print(f"Failed to download {video_key} from S3: {e}") 42 | return False 43 | 44 | 45 | def handle_error(video_id: str, error_message: str, output_folder_path: str, worker_number: str): 46 | """Handle errors by creating an error file and updating the status report.""" 47 | error_data = { 48 | "error": error_message, 49 | "video_id": video_id, 50 | "worker_number": worker_number 51 | } 52 | error_file_path = os.path.join(output_folder_path, f"errors_{video_id}.json") 53 | with open(error_file_path, "w") as f: 54 | json.dump(error_data, f, indent=4) 55 | 56 | # Update status report for failure 57 | timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 58 | status_report = f"{timestamp} - {video_id} - failed - {error_message}\n" 59 | print(status_report) 60 | with open(f"status/status_alignment_{worker_number}.txt", "a") as f: 61 | f.write(status_report) 62 | 63 | 64 | def time_to_frametimecode(time_str: str, fps: float, scene_end_time: FrameTimecode = None, filename: str = "unknown_file", worker_number: str = None) -> str: 65 | """Convert mm:ss or ss time format to FrameTimecode, or handle special cases like 'end'.""" 66 | # Define special cases 67 | if time_str == "end": 68 | if scene_end_time is not None: 69 | return scene_end_time.get_timecode() 70 | else: 71 | raise ValueError("time_str is end and no replacement for scene_end_time provided") 72 | 73 | special_cases = ["", "n/a", "varies", "throughout scene", "throughout the scene", 74 | "end", "throughout", "not present", "not applicable"] 75 | if time_str.lower() in special_cases or re.match(r"scene\s\d+", time_str.lower()): 76 | return None 77 | 78 | match = re.match(r"(\d+)s$", time_str.lower()) 79 | if match: 80 | time_str = match.group(1) 81 | if 'around ' in time_str: 82 | time_str = time_str.split('around ')[0] 83 | if '~' in time_str: 84 | time_str = time_str.split('~')[0] 85 | if '+' in time_str: 86 | time_str = time_str.split('+')[0] 87 | if '-' in time_str: 88 | time_str = time_str.split("-")[0] 89 | if ' ' in time_str and ":" in time_str: 90 | time_str = time_str.split(" ")[0] 91 | if ":" in time_str: 92 | parts = time_str.split(":") 93 | if len(parts) == 3: 94 | hours, minutes, seconds = parts 95 | elif len(parts) == 2: 96 | hours = 0 97 | minutes, seconds = parts 98 | elif len(parts) == 1: 99 | hours = 0 100 | minutes = 0 101 | seconds = parts[0] 102 | else: 103 | raise ValueError(f"Invalid timestamp format: {time_str}") 104 | 105 | if '.' 
in seconds: 106 | seconds = seconds.split(".")[0] 107 | 108 | match = re.match(r"^\d+", seconds) 109 | if match: 110 | seconds = int(match.group()) 111 | else: 112 | raise ValueError(f"Invalid timestamp format: {time_str}") 113 | 114 | 115 | total_seconds = float(hours) * 3600 + float(minutes) * 60 + float(seconds) 116 | else: 117 | try: 118 | total_seconds = float(time_str) 119 | except ValueError: 120 | raise ValueError(f"Invalid timestamp format: {time_str}") 121 | return FrameTimecode(timecode=total_seconds, fps=fps).get_timecode() 122 | 123 | 124 | def adjust_scene_boundaries(video_path, initial_scenes, video_id, worker_number): 125 | """Adjust scene boundaries based on scene detection.""" 126 | # Initialize video manager and scene manager 127 | video_manager = VideoManager([video_path]) 128 | scene_manager = SceneManager() 129 | scene_manager.add_detector(ContentDetector(threshold=15.0)) # Adjust threshold for sensitivity 130 | 131 | # Start the video manager and obtain FPS 132 | video_manager.start() 133 | fps = video_manager.get_framerate() # Get FPS from VideoManager 134 | # print(f"Detected FPS: {fps}") 135 | 136 | # Get total frames using duration in seconds and fps 137 | duration_seconds = video_manager.get_duration()[0].get_seconds() 138 | total_frames = int(duration_seconds * fps) 139 | last_frame_timecode = FrameTimecode(timecode=total_frames, fps=fps).get_timecode().split(".")[0].split(":") 140 | last_frame_timecode = last_frame_timecode[1] + ":" + last_frame_timecode[2] 141 | 142 | adjusted_scenes = [] 143 | 144 | for idx, initial_scene in enumerate(initial_scenes): 145 | 146 | if idx == len(initial_scenes) - 1: 147 | #Hack to avoid issues with answers that signal the last timestamp as 'end' 148 | initial_scene['timestamps']['end_timestamp'] = last_frame_timecode 149 | # print(last_frame_timecode) 150 | 151 | start_timecode = time_to_frametimecode(initial_scene['timestamps']['start_timestamp'], fps, filename=video_id, worker_number = worker_number) 152 | end_timecode = time_to_frametimecode(initial_scene['timestamps']['end_timestamp'], fps, filename=video_id, worker_number = worker_number) 153 | 154 | # Ensure all FrameTimecode objects use the same fps 155 | start_frame_number = int(max(0, FrameTimecode(timecode=start_timecode, fps=fps).get_frames() - 2 * fps)) 156 | end_frame_number = int(min(total_frames, FrameTimecode(timecode=end_timecode, fps=fps).get_frames() + 2 * fps)) 157 | 158 | search_start = FrameTimecode(timecode=start_frame_number, fps=fps) 159 | search_end = FrameTimecode(timecode=end_frame_number, fps=fps) 160 | 161 | # Seek to the start frame for detection using FrameTimecode 162 | video_manager.seek(search_start) 163 | scene_manager.detect_scenes(frame_source=video_manager, end_time=search_end.get_seconds()) 164 | 165 | detected_scenes = scene_manager.get_scene_list() 166 | 167 | # Find closest detected boundaries, default to original timecodes if no match found 168 | adjusted_start_timecode = start_timecode 169 | adjusted_end_timecode = end_timecode 170 | 171 | if detected_scenes: 172 | closest_start = min(detected_scenes, key=lambda x: abs(x[0].get_frames() - FrameTimecode(timecode=start_timecode, fps=fps).get_frames()), default=None) 173 | closest_end = min(detected_scenes, key=lambda x: abs(x[1].get_frames() - FrameTimecode(timecode=end_timecode, fps=fps).get_frames()), default=None) 174 | 175 | if closest_start and abs(closest_start[0].get_frames() - FrameTimecode(timecode=start_timecode, fps=fps).get_frames()) < 2 * fps: 176 | 
adjusted_start_timecode = closest_start[0].get_timecode() 177 | distance = abs(closest_start[0].get_seconds() - FrameTimecode(timecode=start_timecode, fps=fps).get_seconds()) 178 | if distance > 2: 179 | print(f"\t adjusting start timestamp by {distance:.2f} seconds") 180 | print(f"\t\tFrom: {start_timecode} to {adjusted_start_timecode}" ) 181 | if distance >=5: 182 | raise ValueError(f"Large start timestamp adjustment ({distance:.2f} seconds) required for scene {idx+1}") 183 | 184 | if closest_end and abs(closest_end[1].get_frames() - FrameTimecode(timecode=end_timecode, fps=fps).get_frames()) < 2 * fps: 185 | distance = abs(closest_end[1].get_seconds() - FrameTimecode(timecode=end_timecode, fps=fps).get_seconds()) 186 | adjusted_end_timecode = closest_end[1].get_timecode() 187 | if distance > 2: 188 | print(f"\t adjusting end timestamp by {distance:.2f} seconds") 189 | print(f"\t\tFrom: {end_timecode} to {adjusted_end_timecode}" ) 190 | if distance >=5: 191 | raise ValueError(f"Large start timestamp adjustment ({distance:.2f} seconds) required for scene {idx+1}") 192 | 193 | # Update the JSON with FrameTimecode formatted as HH:MM:SS:FF 194 | initial_scene['timestamps']['start_timestamp'] = adjusted_start_timecode 195 | initial_scene['timestamps']['end_timestamp'] = adjusted_end_timecode 196 | 197 | adjusted_scenes.append(initial_scene) 198 | 199 | # Ensure continuity between scenes 200 | if idx > 0: 201 | previous_scene_end = FrameTimecode(timecode=adjusted_scenes[idx - 1]['timestamps']['end_timestamp'], fps=fps) 202 | current_scene_start = FrameTimecode(timecode=adjusted_start_timecode, fps=fps) 203 | 204 | # if current_scene_start.get_frames() <= previous_scene_end.get_frames(): 205 | # Set start of current scene to be exactly the frame after the end of the previous scene 206 | new_start_timecode = previous_scene_end.get_frames() + 1 207 | adjusted_scenes[idx]['timestamps']['start_timestamp'] = FrameTimecode(timecode=new_start_timecode, fps=fps).get_timecode() 208 | 209 | frame_adjustment = abs(current_scene_start.get_frames() - new_start_timecode) 210 | if frame_adjustment > 25: 211 | print(f"\t\tWARNING: adjusting a scene start by {frame_adjustment} frames") 212 | if frame_adjustment > 125: 213 | raise ValueError(f"Large frame adjustment ({frame_adjustment} frames) required for scene {idx+1}") 214 | 215 | 216 | video_manager.release() 217 | return fps, adjusted_scenes 218 | 219 | def update_timestamps_in_json(data: dict, fps: float, video_id: str, worker_number: str) -> dict: 220 | """Update all timestamp fields in the JSON data to FrameTimecode format and ensure they stay within scene boundaries.""" 221 | # Update timestamps in scenes 222 | for scene in data.get('scenes', []): 223 | scene_start = FrameTimecode(timecode=scene['timestamps']['start_timestamp'], fps=fps) 224 | scene_end = FrameTimecode(timecode=scene['timestamps']['end_timestamp'], fps=fps) 225 | 226 | def enforce_within_boundaries(timestamp, start, end): 227 | if timestamp is None: 228 | return None 229 | frame_timecode = FrameTimecode(timecode=timestamp, fps=fps) 230 | if frame_timecode.get_frames() < start.get_frames(): 231 | return start.get_timecode() 232 | elif frame_timecode.get_frames() > end.get_frames(): 233 | return end.get_timecode() 234 | else: 235 | return timestamp 236 | 237 | # Update activities timestamps 238 | for activity in scene.get('activities', []): 239 | if 'timestamp' in activity: 240 | if 'start_timestamp' in activity['timestamp']: 241 | activity['timestamp']['start_timestamp'] = 
enforce_within_boundaries( 242 | time_to_frametimecode(activity['timestamp']['start_timestamp'], fps, filename=video_id, scene_end_time=scene_end, worker_number = worker_number), scene_start, scene_end 243 | ) 244 | if 'end_timestamp' in activity['timestamp']: 245 | activity['timestamp']['end_timestamp'] = enforce_within_boundaries( 246 | time_to_frametimecode(activity['timestamp']['end_timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number), scene_start, scene_end 247 | ) 248 | 249 | # Update props timestamps 250 | for prop in scene.get('props', []): 251 | if 'timestamp' in prop: 252 | if 'start_timestamp' in prop['timestamp']: 253 | prop['timestamp']['start_timestamp'] = enforce_within_boundaries( 254 | time_to_frametimecode(prop['timestamp']['start_timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number), scene_start, scene_end 255 | ) 256 | if 'end_timestamp' in prop['timestamp']: 257 | prop['timestamp']['end_timestamp'] = enforce_within_boundaries( 258 | time_to_frametimecode(prop['timestamp']['end_timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number), scene_start, scene_end 259 | ) 260 | 261 | # Update video editing details timestamps 262 | for video_editing in scene.get('videoEditingDetails', []): 263 | if 'timestamps' in video_editing: 264 | if 'start_timestamp' in video_editing['timestamps']: 265 | video_editing['timestamps']['start_timestamp'] = enforce_within_boundaries( 266 | time_to_frametimecode(video_editing['timestamps']['start_timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number), scene_start, scene_end 267 | ) 268 | if 'end_timestamp' in video_editing['timestamps']: 269 | video_editing['timestamps']['end_timestamp'] = enforce_within_boundaries( 270 | time_to_frametimecode(video_editing['timestamps']['end_timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number), scene_start, scene_end 271 | ) 272 | 273 | # Update mood key moments timestamps 274 | for key_moment in scene.get('mood', {}).get('keyMoments', []): 275 | if 'timestamp' in key_moment: 276 | key_moment['timestamp'] = enforce_within_boundaries( 277 | time_to_frametimecode(key_moment['timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number), scene_start, scene_end 278 | ) 279 | 280 | # Update narrative progression timestamps 281 | for narrative in scene.get('narrativeProgression', []): 282 | if 'timestamp' in narrative: 283 | narrative['timestamp'] = enforce_within_boundaries( 284 | time_to_frametimecode(narrative['timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number), scene_start, scene_end 285 | ) 286 | 287 | # Update storylines climax timestamps 288 | if 'storylines' in data and 'climax' in data['storylines'] and 'timestamp' in data['storylines']['climax']: 289 | data['storylines']['climax']['timestamp'] = time_to_frametimecode(data['storylines']['climax']['timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number) 290 | 291 | # Update trimming suggestions timestamps 292 | for trimming in data.get('trimmingSuggestions', []): 293 | if 'timestamps' in trimming: 294 | if 'start_timestamp' in trimming['timestamps']: 295 | trimming['timestamps']['start_timestamp'] = enforce_within_boundaries( 296 | time_to_frametimecode(trimming['timestamps']['start_timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = 
worker_number), scene_start, scene_end 297 | ) 298 | if 'end_timestamp' in trimming['timestamps']: 299 | trimming['timestamps']['end_timestamp'] = enforce_within_boundaries( 300 | time_to_frametimecode(trimming['timestamps']['end_timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number), scene_start, scene_end 301 | ) 302 | 303 | return data 304 | 305 | def result_exists(video_filename,output_directory): 306 | video_id = os.path.splitext(video_filename)[0] 307 | result_file = os.path.join(output_directory, f"{video_id}.json") 308 | error_file = os.path.join(output_directory, f"errors_{video_id}.json") 309 | return os.path.exists(result_file) or os.path.exists(error_file) 310 | 311 | def process_single_video(video_id, worker_number): 312 | s3_folder_videos = 'videos/' 313 | video_key = f'{s3_folder_videos}/{video_id}.mp4' 314 | video_filename = f'{video_id}.mp4' 315 | video_local_path = os.path.join(video_folder_path, video_filename) 316 | if result_exists(video_filename,output_folder_path): 317 | print(f"Skipping {video_filename}, result already exists.") 318 | return 319 | 320 | # Download video from S3 321 | if not download_video_from_s3(video_key, video_local_path): 322 | # Handle download failure 323 | error_data = {"error": "File not found in S3"} 324 | error_file_path = os.path.join(output_folder_path, f"errors_{video_id}.json") 325 | with open(error_file_path, "w") as f: 326 | json.dump(error_data, f, indent=4) 327 | 328 | # Update status report for download failure 329 | timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 330 | status_report = f"{timestamp} - {video_id} - failed - File not found in S3\n" 331 | print(status_report) 332 | with open(f"status/status_alignment_{worker_number}.txt", "a") as f: 333 | f.write(status_report) 334 | 335 | return 336 | 337 | # Construct paths 338 | json_path = os.path.join(json_folder_path, f"{video_id}.json") 339 | json_result_path = os.path.join(output_folder_path, f"{video_id}.json") 340 | 341 | # Load JSON file 342 | with open(json_path, 'r') as json_file: 343 | video_data = json.load(json_file) 344 | 345 | try: 346 | # Adjust scene boundaries using PySceneDetect to determine FPS 347 | fps, adjusted_scenes = adjust_scene_boundaries(video_local_path, video_data['scenes'], video_id, str(worker_number)) 348 | 349 | # Update scenes in the original data 350 | video_data['scenes'] = adjusted_scenes 351 | video_data['fps'] = fps 352 | 353 | # Update all timestamps to FrameTimecode format 354 | video_data = update_timestamps_in_json(video_data, fps, video_id, str(worker_number)) 355 | 356 | # Write updated JSON back to file 357 | with open(json_result_path, 'w') as json_file: 358 | json.dump(video_data, json_file, indent=4) 359 | 360 | print(f"Processed video {video_id}.") 361 | 362 | # Prepare the status report 363 | timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 364 | status_report = f"{timestamp} - {video_id} - complete\n" 365 | print(status_report) 366 | 367 | # Append the status report to status.txt 368 | if worker_number is None: 369 | with open("status_alignment.txt", "a") as f: 370 | f.write(status_report) 371 | else: 372 | with open(f"status/status_alignment_{worker_number}.txt", "a") as f: 373 | f.write(status_report) 374 | 375 | except Exception as e: 376 | # Handle any errors in adjusting scenes or updating timestamps 377 | error_data = { 378 | "error": str(e), 379 | "video_id": video_id, 380 | "worker_number": worker_number 381 | } 382 | error_file_path = 
os.path.join(output_folder_path, f"errors_{video_id}.json") 383 | with open(error_file_path, "w") as f: 384 | json.dump(error_data, f, indent=4) 385 | 386 | # Update status report for failure 387 | timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 388 | status_report = f"{timestamp} - {video_id} - failed - Error during processing: {str(e)}\n" 389 | print(status_report) 390 | with open(f"status/status_alignment_{worker_number}.txt", "a") as f: 391 | f.write(status_report) 392 | 393 | finally: 394 | # Remove the video file after processing, even if an error occurred 395 | if os.path.exists(video_local_path): 396 | os.remove(video_local_path) 397 | print(f"Deleted local file {video_local_path} after processing.") 398 | 399 | 400 | def process_chunk(videos_to_process, size_chunk, worker_number): 401 | # Calculate start and end indices for this worker's chunk 402 | start_index = worker_number * size_chunk 403 | end_index = min(start_index + size_chunk, len(videos_to_process)) 404 | 405 | # Process videos in this worker's chunk 406 | for video_id in videos_to_process[start_index:end_index]: 407 | process_single_video(video_id, worker_number) 408 | 409 | if __name__ == "__main__": 410 | # Parse command-line arguments 411 | parser = argparse.ArgumentParser(description='Process videos in chunks.') 412 | parser.add_argument('size_chunk', type=int, help='Size of each chunk to process') 413 | parser.add_argument('worker_number', type=int, help='Worker number (zero-indexed)') 414 | parser.add_argument('--video_list', type=str, help='Optional video list file in JSON format') 415 | args = parser.parse_args() 416 | 417 | # Load the list of videos 418 | if args.video_list: 419 | with open(args.video_list, 'r') as f: 420 | videos_to_process = json.load(f) 421 | print(f"Using provided video list: {args.video_list}") 422 | else: 423 | with open('video_alignment_to_process.json', 'r') as f: 424 | videos_to_process = json.load(f) 425 | print("Using default video list: video_alignment_to_process.json") 426 | 427 | # Process the assigned chunk 428 | process_chunk(videos_to_process, args.size_chunk, args.worker_number) 429 | 430 | 431 | 432 | 433 | -------------------------------------------------------------------------------- /finevideo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/fineVideo/b961b6ade22910d041aa75451afa94e454bca372/finevideo.gif -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/fineVideo/b961b6ade22910d041aa75451afa94e454bca372/logo.png -------------------------------------------------------------------------------- /rawdataset/filter-yt-commons.py: -------------------------------------------------------------------------------- 1 | from huggingface_hub import snapshot_download 2 | import pandas as pd 3 | import pyarrow.parquet as pq 4 | import os 5 | 6 | # 7 | # This script downloads YTCommons dataset from Hugging Face and parses some relevant fields of each video to finally store them in a dataframe 8 | # Be careful - this script requires a decent amount of RAM to work. 
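# (Most of the memory is used by pq.read_table() loading each Parquet file in full before any filtering;
#  a lower-memory variant could pass a columns= list to pq.read_table() so only the needed fields are read.)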
9 | # 10 | 11 | 12 | ### CONFIG ### 13 | dataset_path = './Youtube-Commons/' 14 | output_pkl = 'en_ycommons.pkl' 15 | ### 16 | 17 | 18 | 19 | def read_filtered_parquet_files(folder_path, fields, filters=None): 20 | """ 21 | Reads specified fields from all Parquet files in a folder with filtering and combines them into a single DataFrame. 22 | 23 | Parameters: 24 | folder_path (str): The path to the folder containing Parquet files. 25 | fields (list): List of fields to read from the Parquet files. 26 | filters (list): List of tuples for filtering, e.g., [('column_name', '==', value)] 27 | 28 | Returns: 29 | pd.DataFrame: A DataFrame containing the specified fields from all filtered Parquet files. 30 | """ 31 | # List to store DataFrames 32 | dataframes = [] 33 | 34 | # Iterate over all files in the folder 35 | for file_name in os.listdir(folder_path): 36 | if file_name.endswith('.parquet'): 37 | file_path = os.path.join(folder_path, file_name) 38 | print(f"Processing file: {file_path}") 39 | 40 | # Read the entire Parquet file 41 | df = pq.read_table(file_path).to_pandas() 42 | 43 | # Apply filters if provided 44 | if filters: 45 | for column, operator, value in filters: 46 | if operator == '==': 47 | df = df[df[column] == value] 48 | elif operator == '>': 49 | df = df[df[column] > value] 50 | elif operator == '<': 51 | df = df[df[column] < value] 52 | # Add other operators as needed 53 | 54 | # Check if 'word_count' column exists and filter rows with word_count > 50 55 | if 'word_count' in df.columns: 56 | df = df[df['word_count'] > 50] 57 | 58 | # Handle 'source_language' and 'language_id_method' fields 59 | if 'source_language' not in df.columns and 'language_id_method' in df.columns: 60 | df['source_language'] = df['language_id_method'] 61 | elif 'source_language' in df.columns: 62 | pass # 'source_language' already exists, no action needed 63 | 64 | # Ensure 'source_language' is in the fields to select 65 | if 'source_language' not in fields: 66 | fields.append('source_language') 67 | 68 | # Select only the specified fields 69 | df = df[fields] 70 | dataframes.append(df) 71 | 72 | # Concatenate all DataFrames 73 | combined_df = pd.concat(dataframes, ignore_index=True) 74 | return combined_df 75 | 76 | 77 | fields = ['acodec', 'age_limit', 'categories', 'channel', 'channel_follower_count', 'channel_id', 'character_count', 'comment_count', 'date', 'description', 'duration_string', 'language', 'license', 'like_count', 'original_language', 'resolution', 'tags', 'text', 'title', 'transcription_language', 'upload_date', 'vcodec', 'video_id', 'video_link', 'view_count', 'word_count'] 78 | filters = [('original_language', '==', 'en'), ('transcription_language', '==', 'en')] 79 | 80 | folder = snapshot_download("PleIAs/YouTube-Commons", 81 | repo_type='dataset', 82 | local_dir=dataset_path) 83 | 84 | 85 | df = read_filtered_parquet_files(dataset_path, fields, filters=filters) 86 | 87 | print(df.head()) 88 | print(f"Total videos: {len(df)}") 89 | df.to_pickle(output_pkl) -------------------------------------------------------------------------------- /rawdataset/ytdlps3/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | ENV PYTHONDONTWRITEBYTECODE 1 4 | ENV PYTHONUNBUFFERED 1 5 | 6 | # Install required packages 7 | RUN apt-get update && apt-get install -y \ 8 | wget \ 9 | ffmpeg \ 10 | && apt-get clean 11 | 12 | # Install yt-dlp (a fork of youtube-dl with more features and better maintenance) 13 | RUN pip install yt-dlp 
boto3 14 | 15 | # Create a directory for the application 16 | WORKDIR /app 17 | 18 | # Copy the script into the Docker image 19 | COPY download_and_upload.py /app/download_and_upload.py 20 | 21 | # Set the entry point to the script 22 | ENTRYPOINT ["python", "/app/download_and_upload.py"] 23 | -------------------------------------------------------------------------------- /rawdataset/ytdlps3/download_and_upload.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import boto3 4 | from yt_dlp import YoutubeDL 5 | 6 | def download_youtube_video(video_id, output_path): 7 | ydl_opts = { 8 | 'format': 'best', 9 | 'writesubtitles': True, 10 | 'subtitleslangs': ['en'], 11 | 'subtitlesformat': 'vtt', 12 | 'writeinfojson': True, 13 | 'skip_download': False, 14 | 'outtmpl': os.path.join(output_path, f'{video_id}.%(ext)s'), 15 | } 16 | with YoutubeDL(ydl_opts) as ydl: 17 | info_dict = ydl.extract_info(video_id, download=True) 18 | 19 | # Get the correct subtitle file path from the info_dict 20 | subtitle_file_path = None 21 | subtitles = info_dict.get('subtitles') 22 | if subtitles and 'en' in subtitles: 23 | subtitle_data = subtitles['en'][0] # Get the first English subtitle entry 24 | subtitle_file_path = ydl.prepare_filename(info_dict).replace('.mp4', '.en.vtt') 25 | 26 | return info_dict, subtitle_file_path 27 | 28 | def upload_to_s3(local_file_path, s3_bucket, s3_key): 29 | s3_client = boto3.client('s3') 30 | s3_client.upload_file(local_file_path, s3_bucket, s3_key) 31 | 32 | def log_failure(video_id, error_message, s3_bucket, s3_path): 33 | error_file_path = f"/tmp/{video_id}.txt" 34 | with open(error_file_path, 'w') as f: 35 | f.write(error_message) 36 | 37 | # Upload the error file to S3 in the failed/ subfolder 38 | s3_client = boto3.client('s3') 39 | s3_client.upload_file(error_file_path, s3_bucket, f"failed/{video_id}.txt") 40 | 41 | def process_video(video_id, s3_bucket, s3_path): 42 | try: 43 | # Create a temporary directory to store downloaded files 44 | download_path = '/tmp/youtube_downloads' 45 | os.makedirs(download_path, exist_ok=True) 46 | 47 | # Download the video, subtitles (if available), and metadata 48 | info_dict, subtitle_file_path = download_youtube_video(video_id, download_path) 49 | 50 | # Define file paths 51 | video_file = os.path.join(download_path, f'{video_id}.mp4') 52 | metadata_file = os.path.join(download_path, f'{video_id}.info.json') 53 | 54 | # Upload each file to the specified S3 path if it exists 55 | if os.path.exists(video_file): 56 | upload_to_s3(video_file, s3_bucket, os.path.join(s3_path, f'{video_id}.mp4')) 57 | if os.path.exists(metadata_file): 58 | upload_to_s3(metadata_file, s3_bucket, os.path.join(s3_path, f'{video_id}.json')) 59 | if subtitle_file_path and os.path.exists(subtitle_file_path): 60 | upload_to_s3(subtitle_file_path, s3_bucket, os.path.join(s3_path, f'{video_id}.en.vtt')) 61 | 62 | # Cleanup 63 | for file_name in os.listdir(download_path): 64 | os.remove(os.path.join(download_path, file_name)) 65 | 66 | except Exception as e: 67 | error_message = str(e) 68 | log_failure(video_id, error_message, s3_bucket, s3_path) 69 | 70 | def main(video_ids, s3_bucket, s3_path): 71 | for video_id in video_ids: 72 | process_video(video_id, s3_bucket, s3_path) 73 | 74 | if __name__ == "__main__": 75 | if len(sys.argv) < 4: 76 | print("Usage: python download_and_upload.py <s3_bucket> <s3_path> <video_id> [<video_id> ...]") 77 | sys.exit(1) 78 | 79 | s3_bucket = sys.argv[1] 80 | s3_path = sys.argv[2] 81 | video_ids = sys.argv[3:]
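# Example invocation (hypothetical bucket, prefix and video ids -- adjust to your own setup):
#   python download_and_upload.py my-video-bucket videos VIDEO_ID_1 VIDEO_ID_2
# For each id this uploads <s3_path>/<video_id>.mp4, <s3_path>/<video_id>.json and, when available,
# <s3_path>/<video_id>.en.vtt; failures are recorded as failed/<video_id>.txt in the same bucket.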
82 | 83 | main(video_ids, s3_bucket, s3_path) 84 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ![Fine Video](logo.png) 2 | 3 | ## Introduction 4 | 5 | We recently released [FineVideo](https://huggingface.co/spaces/HuggingFaceFV/FineVideo-Explorer), a dataset with 43k+ videos (3.4k hours) annotated with rich descriptions, narrative details, scene splits and QA pairs. 6 | 7 | We could not be more excited about the community's response! If you have not seen FineVideo yet, take a look at it through the [dataset explorer page](https://huggingface.co/spaces/HuggingFaceFV/FineVideo-Explorer). 8 | 9 | ![FineVideo Explorer page](finevideo.gif) 10 | 11 | 12 | If you are interested in more technical details about the pipeline, we invite you to take a look at our [blog post](https://huggingface.co/). 13 | 14 | 15 | ## Content of the repository 16 | 17 | This repository contains the code that we used in FineVideo to gather videos and annotate them. These scripts cover all the steps of the pipeline shown below. 18 | ![Dataset creation pipeline](dataset-creation.png) 19 | 20 | The scripts are grouped in folders, and each folder represents one or more steps of the pipeline: 21 | 22 | ``` 23 | ├── rawdataset 24 | │ ├── filter-yt-commons.py 25 | │ └── ytdlps3 26 | │ ├── Dockerfile 27 | │ └── download_and_upload.py 28 | ├── dynamicfilters 29 | │ ├── videodynamismfiltering 30 | │ │ ├── Dockerfile 31 | │ │ └── check_static.py 32 | │ └── worddensityfiltering.py 33 | ├── videocategorization 34 | │ ├── content_taxonomy.json 35 | │ ├── create_prompts.py 36 | │ ├── launchTGI-Slurm.sh 37 | │ └── tgi_inference_client.py 38 | ├── contentselection 39 | │ ├── content_taxonomy.json 40 | │ └── oracle.py 41 | ├── contentannotation 42 | │ ├── gemini_prompt.txt 43 | │ └── video2annotation.py 44 | ├── finealignment 45 | └── video_alignment.py 46 | 47 | ``` 48 | 49 | Given the amount of content to scan and/or annotate, every part of the pipeline that needs to scale is either packaged as a Docker container that can be launched in a distributed way, or written to split its work queue into chunks so that multiple instances of the same script can run in parallel. 50 | 51 | For example: 52 | * video download `ytdlps3` and video dynamism filtering `videodynamismfiltering` are packaged as Docker containers. 53 | * video id gathering for the raw dataset `filter-yt-commons.py`, content selection `oracle.py` and word density filtering `worddensityfiltering.py` are scripts that process all the content at once. 54 | * content annotation `video2annotation.py`, video categorization `tgi_inference_client.py` & `create_prompts.py` and video-metadata alignment `video_alignment.py` are prepared to process chunks of a queue so that you can launch multiple instances of the same script (see the launcher sketch below).
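For instance, a minimal launcher sketch for the chunked scripts (hypothetical chunk size, worker count and video list file; adapt it to your own scheduler or infrastructure) could look like this:

```
# launch_workers.py - hypothetical helper, not part of the released pipeline.
# Starts several local instances of video2annotation.py, each processing its own chunk.
import subprocess

SIZE_CHUNK = 500    # videos per worker (hypothetical value)
NUM_WORKERS = 4     # number of parallel instances (hypothetical value)

processes = [
    subprocess.Popen([
        "python", "video2annotation.py",
        str(SIZE_CHUNK), str(worker_number),
        "--video_list", "videos_to_process.json",  # hypothetical JSON list of video ids
    ])
    for worker_number in range(NUM_WORKERS)
]

for p in processes:
    p.wait()
```

The same chunking pattern applies to `video_alignment.py`, while `tgi_inference_client.py` parallelizes through its block number argument instead.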
-------------------------------------------------------------------------------- /videocategorization/content_taxonomy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Entertainment": { 3 | "Comedy": { 4 | "Stand-up": {}, 5 | "Sketches": {}, 6 | "Parodies": {} 7 | }, 8 | "Music": { 9 | "Music Videos": {}, 10 | "Covers": {}, 11 | "Remixes": {}, 12 | "Lyric Videos": {} 13 | }, 14 | "Movies & Trailers": { 15 | "Film Trailers": {}, 16 | "Short Films": {}, 17 | "Movie Reviews": {} 18 | }, 19 | "Gaming": { 20 | "Let's Plays": {}, 21 | "Game Reviews": {}, 22 | "Walkthroughs": {}, 23 | "Game Commentary": {} 24 | }, 25 | "Vlogs": { 26 | "Daily Vlogs": {}, 27 | "Travel Vlogs": {}, 28 | "Storytime": {} 29 | }, 30 | "Livestreams": { 31 | "Gaming Livestreams": {}, 32 | "Q&A Sessions": {}, 33 | "Event Livestreams": {} 34 | } 35 | }, 36 | "Education": { 37 | "Tutorials": { 38 | "Software Tutorials": {}, 39 | "DIY & Crafts": {}, 40 | "Cooking Tutorials": {} 41 | }, 42 | "Lectures & Talks": { 43 | "Academic Lectures": {}, 44 | "TED Talks": {}, 45 | "Motivational Talks": {} 46 | }, 47 | "Science & Technology": { 48 | "Science Explainers": {}, 49 | "Tech Reviews": {}, 50 | "Engineering Projects": {} 51 | }, 52 | "Language Learning": { 53 | "Language Lessons": {}, 54 | "Pronunciation Guides": {} 55 | }, 56 | "History & Culture": { 57 | "Documentaries": {}, 58 | "Cultural Explainers": {}, 59 | "Historical Analysis": {} 60 | }, 61 | "Business & Finance": { 62 | "Entrepreneurship": {}, 63 | "Investment Guides": {}, 64 | "Marketing Strategies": {} 65 | } 66 | }, 67 | "Lifestyle": { 68 | "Health & Fitness": { 69 | "Workout Routines": {}, 70 | "Nutrition Guides": {}, 71 | "Mental Health Tips": {} 72 | }, 73 | "Fashion & Beauty": { 74 | "Makeup Tutorials": {}, 75 | "Fashion Hauls": {}, 76 | "Skincare Routines": {} 77 | }, 78 | "Travel": { 79 | "Destination Guides": {}, 80 | "Travel Tips": {}, 81 | "Travel Vlogs": {} 82 | }, 83 | "Food & Cooking": { 84 | "Recipe Videos": {}, 85 | "Cooking Shows": {}, 86 | "Food Reviews": {} 87 | }, 88 | "Home & Garden": { 89 | "Home Improvement": {}, 90 | "Gardening Tips": {}, 91 | "Interior Design": {} 92 | }, 93 | "Parenting & Family": { 94 | "Parenting Tips": {}, 95 | "Family Vlogs": {}, 96 | "Childcare Advice": {} 97 | } 98 | }, 99 | "News & Politics": { 100 | "News Reports": { 101 | "Breaking News": {}, 102 | "Political News": {}, 103 | "World News": {} 104 | }, 105 | "Opinion & Commentary": { 106 | "Political Commentary": {}, 107 | "Social Commentary": {}, 108 | "Editorials": {} 109 | }, 110 | "Interviews": { 111 | "Celebrity Interviews": {}, 112 | "Political Interviews": {}, 113 | "Expert Interviews": {} 114 | }, 115 | "Debates": { 116 | "Political Debates": {}, 117 | "Social Issue Debates": {} 118 | } 119 | }, 120 | "Sports": { 121 | "Highlights & Replays": { 122 | "Game Highlights": {}, 123 | "Match Replays": {} 124 | }, 125 | "Sports Commentary": { 126 | "Analysis Shows": {}, 127 | "Sports Talk Shows": {} 128 | }, 129 | "Athlete Profiles": { 130 | "Career Highlights": {}, 131 | "Documentary Profiles": {} 132 | }, 133 | "Fitness & Training": { 134 | "Athlete Workouts": {}, 135 | "Training Techniques": {} 136 | } 137 | }, 138 | "Art & Creativity": { 139 | "Visual Arts": { 140 | "Painting Tutorials": {}, 141 | "Drawing Tutorials": {}, 142 | "Art Exhibitions": {} 143 | }, 144 | "Photography & Film": { 145 | "Photography Tips": {}, 146 | "Cinematography": {}, 147 | "Short Films": {} 148 | }, 149 | "Crafts & DIY": { 150 | "Home 
Crafts": {}, 151 | "DIY Projects": {}, 152 | "Upcycling": {} 153 | }, 154 | "Writing & Literature": { 155 | "Writing Tips": {}, 156 | "Book Reviews": {}, 157 | "Poetry Readings": {} 158 | } 159 | }, 160 | "Science & Technology": { 161 | "Tech Reviews": { 162 | "Gadget Reviews": {}, 163 | "Software Reviews": {} 164 | }, 165 | "Science Explainers": { 166 | "Physics": {}, 167 | "Biology": {}, 168 | "Chemistry": {} 169 | }, 170 | "Space Exploration": { 171 | "Astronomy": {}, 172 | "Space Missions": {} 173 | }, 174 | "Engineering": { 175 | "Mechanical Engineering": {}, 176 | "Electrical Engineering": {} 177 | }, 178 | "Environmental Science": { 179 | "Climate Change": {}, 180 | "Conservation Efforts": {} 181 | }, 182 | "Artificial Intelligence": { 183 | "AI Concepts": {}, 184 | "Machine Learning Tutorials": {} 185 | } 186 | }, 187 | "Automotive": { 188 | "Car Reviews": {}, 189 | "Car Modifications": {}, 190 | "Driving Tutorials": {}, 191 | "Motorsports": { 192 | "Racing Highlights": {}, 193 | "Motorsport Commentary": {} 194 | }, 195 | "Off-Roading": {} 196 | }, 197 | "Hobbies & Interests": { 198 | "Collecting": { 199 | "Toy Collections": {}, 200 | "Stamp Collections": {}, 201 | "Memorabilia": {} 202 | }, 203 | "Board Games & Puzzles": { 204 | "Gameplay Tutorials": {}, 205 | "Game Reviews": {} 206 | }, 207 | "Outdoor Activities": { 208 | "Camping": {}, 209 | "Hiking": {}, 210 | "Fishing": {} 211 | }, 212 | "Arts & Crafts": { 213 | "Knitting": {}, 214 | "Pottery": {}, 215 | "Scrapbooking": {} 216 | }, 217 | "Listicles & Rankings": { 218 | "Top 10 Videos": {}, 219 | "Best of [Category]": {}, 220 | "Ranked Lists": {}, 221 | "Must-See [Topic]": {}, 222 | "Buyer’s Guides": {} 223 | }, 224 | "Miscellaneous": { 225 | "ASMR": {}, 226 | "Unboxing Videos": {}, 227 | "Reaction Videos": {}, 228 | "Pranks": {}, 229 | "Social Experiments": {} 230 | } 231 | } 232 | } 233 | -------------------------------------------------------------------------------- /videocategorization/create_prompts.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pandas as pd 4 | from concurrent.futures import ThreadPoolExecutor 5 | 6 | 7 | # 8 | # Given a pandas dataframe with a list of videos, this script will generate custom prompts for your videos and by default store 9 | # them in a subfolder 'prompts' 10 | # 11 | 12 | ### CONFIG ### 13 | df_path = 'current_videos.pkl' 14 | ### 15 | 16 | 17 | 18 | # prompt_template = """ 19 | # Given those categories: {leaves} 20 | # Classify a youtube video given its closed captioning and some metadata details. RETURN ONLY the selected category and nothing else! 21 | # Title: {title} 22 | # Description: {description} 23 | # Categories: {categories} 24 | # Tags: {tags} 25 | # Channel: {channel} 26 | # Closed Caption: {closed_caption} 27 | # """ 28 | prompt_template = """ 29 | Given those categories: {leaves} 30 | Classify a youtube video given its closed captioning and some metadata details. RETURN ONLY the selected category and nothing else! 
31 | Title: {title} 32 | Description: {description} 33 | Channel: {channel} 34 | Closed Caption: {closed_caption} 35 | """ 36 | 37 | def get_leaves(taxonomy): 38 | leaves = [] 39 | for key, value in taxonomy.items(): 40 | if isinstance(value, dict) and value: # If it's a non-empty dictionary 41 | leaves.extend(get_leaves(value)) 42 | else: # If it's an empty dictionary, consider it as a leaf 43 | if not value: # Check if the value is an empty dictionary 44 | leaves.append(key) 45 | return leaves 46 | 47 | def generate_prompt(row, leaves): 48 | return prompt_template.format( 49 | leaves=json.dumps(leaves, indent=2), 50 | title=row['title'], 51 | description=row['description'], 52 | # categories=row['categories'], 53 | tags=row['tags'], 54 | channel=row['channel'], 55 | closed_caption=row['text'][:5000] # Trim closed captions 56 | ) 57 | 58 | def save_prompts_to_file(prompts, output_file): 59 | """Save prompts to the output JSON file, overwriting it.""" 60 | with open(output_file, 'w', encoding='utf-8') as file: 61 | json.dump(prompts, file, indent=4, ensure_ascii=False) 62 | 63 | def process_row(row, leaves): 64 | video_id = row['video_id'] 65 | 66 | # Generate the prompt 67 | prompt = generate_prompt(row, leaves) 68 | return {"video_id": video_id, "prompt": prompt} 69 | 70 | def generate_prompts_and_save(df_path, output_dir='prompts', max_workers=None, chunksize=1000): 71 | # Ensure the output directory exists 72 | os.makedirs(output_dir, exist_ok=True) 73 | 74 | # Load the taxonomy content 75 | with open('content_taxonomy.json', 'r') as file: 76 | taxonomy_content = json.load(file) 77 | 78 | leaves = get_leaves(taxonomy_content) 79 | 80 | # Load the entire DataFrame first (ensure this fits in memory) 81 | df = pd.read_pickle(df_path) 82 | 83 | # Process in chunks 84 | chunk_index = 0 85 | for start in range(0, len(df), chunksize): 86 | chunk = df.iloc[start:start + chunksize] 87 | prompts = [] 88 | 89 | # Use ThreadPoolExecutor for file I/O-bound operations 90 | with ThreadPoolExecutor(max_workers=max_workers) as executor: 91 | results = executor.map( 92 | process_row, 93 | (row for _, row in chunk.iterrows()), 94 | [leaves] * len(chunk) 95 | ) 96 | 97 | # Collect results and filter out None results 98 | results = [result for result in results if result is not None] 99 | 100 | # Save results to file in chunks 101 | if results: 102 | chunk_file = os.path.join(output_dir, f'prompts_{chunk_index}.json') 103 | save_prompts_to_file(results, chunk_file) 104 | print(f"Saved chunk {chunk_index} to {chunk_file}") 105 | chunk_index += 1 106 | 107 | print("Completed processing.") 108 | 109 | # Specify the number of workers, for example, 8 110 | generate_prompts_and_save(df_path, max_workers=8) 111 | -------------------------------------------------------------------------------- /videocategorization/launchTGI-Slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tgi-tests 3 | #SBATCH --partition hopper-prod 4 | #SBATCH --gpus=8 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --mem-per-cpu=11G 7 | #SBATCH -o slurm/logs/%x_%j.out 8 | #SBATCH --qos=high 9 | 10 | export HF_TOKEN=XXXXX 11 | export PORT=1456 12 | srun --container-image='ghcr.io#huggingface/text-generation-inference' \ 13 | --container-env=HUGGING_FACE_HUB_TOKEN,PORT \ 14 | --container-mounts="/scratch:/data" \ 15 | --container-workdir='/usr/src' \ 16 | --no-container-mount-home \ 17 | --qos normal \ 18 | --gpus=8 \ 19 | /usr/local/bin/text-generation-launcher
--model-id meta-llama/Meta-Llama-3.1-70B-Instruct -------------------------------------------------------------------------------- /videocategorization/tgi_inference_client.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import requests 4 | from tqdm import tqdm 5 | from transformers import AutoTokenizer 6 | import re 7 | import sys 8 | from math import ceil 9 | 10 | # 11 | # This script will run the defined prompts against one or more TGI services. 12 | # The prompts are stored in chunks in a folder called prompts/ 13 | 14 | # The script is called with 3 parameters: 15 | # python tgi_inference_client.py <server_address> <port> <block_number> 16 | # block_number is a number between 0 and 3 (both inclusive). Those blocks are 4 subdivisions of the prompts in prompts/ 17 | # and by specifying the block number we run inference on a different block, which allows us to parallelize inference. 18 | # 19 | 20 | 21 | 22 | 23 | # Ensure the output directory exists 24 | os.makedirs("processed", exist_ok=True) 25 | 26 | # Function to load prompts from a single JSON file 27 | def load_prompts_from_file(file_path): 28 | with open(file_path, "r", encoding="utf-8") as file: 29 | tasks = json.load(file) 30 | return tasks 31 | 32 | # Function to process a single file's tasks and save results 33 | def process_file(file_path, tokenizer, endpoint_url): 34 | # Load tasks from the current file 35 | tasks = load_prompts_from_file(file_path) 36 | results = [] 37 | 38 | # Headers for the HTTP request 39 | headers = { 40 | "Content-Type": "application/json", 41 | } 42 | 43 | # Process each task 44 | for task in tqdm(tasks, desc="Processing tasks"): 45 | video_id = task['video_id'] 46 | input_text = task['prompt'] 47 | input_text = input_text.replace("Given those categories:", "Given this taxonomy:") 48 | pattern = r"Categories: \[.*?\]\n?" 49 | input_text = re.sub(pattern, '', input_text) 50 | pattern = r"Tags: \[.*?\]\n?" 51 | input_text = re.sub(pattern, '', input_text) 52 | pattern = r"Description: \[.*?\]\n?"
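# (Together with the Categories and Tags patterns above, this strips bracketed metadata lines from the
#  prompt so that classification relies mainly on the title, channel and closed captions.)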
53 | input_text = re.sub(pattern, '', input_text) 54 | input_text = input_text + "RETURN A CATEGORY FROM THE TAXONOMY PROVIDED: " 55 | 56 | prompt_tokens = tokenizer.apply_chat_template( 57 | [ 58 | {"role": "user", "content": input_text}, 59 | ], 60 | tokenize=False, 61 | add_generation_prompt=True 62 | ) 63 | 64 | # Prepare the data for the request 65 | data = { 66 | "inputs": prompt_tokens, 67 | "parameters": { 68 | "max_new_tokens": 20, # Adjust as needed 69 | }, 70 | } 71 | 72 | # Make a synchronous request to the model endpoint 73 | response = requests.post(endpoint_url, headers=headers, json=data) 74 | if response.status_code == 200: 75 | response_data = response.json() 76 | completion = response_data.get('generated_text', '') 77 | else: 78 | completion = "Error: Unable to get response" 79 | 80 | # Append the result 81 | results.append({"video_id": video_id, "completion": completion}) 82 | 83 | # Save results to file after processing all tasks in the file 84 | output_filename = os.path.splitext(os.path.basename(file_path))[0] 85 | with open(f"processed/{output_filename}_results.json", "w", encoding="utf-8") as f: 86 | json.dump(results, f, ensure_ascii=False, indent=4) 87 | 88 | # Main function to process a subset of files 89 | def main(): 90 | # Get server address, port, and block number from command-line arguments 91 | if len(sys.argv) != 4: 92 | print("Usage: python tgi_inference_client.py <server_address> <port> <block_number>") 93 | sys.exit(1) 94 | 95 | server_address = sys.argv[1] 96 | port = sys.argv[2] 97 | block_number = int(sys.argv[3]) 98 | 99 | # Validate block number 100 | if block_number < 0 or block_number > 3: 101 | print("Error: block_number must be between 0 and 3.") 102 | sys.exit(1) 103 | 104 | # Construct endpoint URL 105 | endpoint_url = f"http://{server_address}:{port}/generate" 106 | 107 | # Initialize tokenizer 108 | tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-70B-Instruct") 109 | 110 | # List all JSON files in the prompts directory 111 | files = [f for f in os.listdir("prompts") if f.endswith(".json")] 112 | 113 | # Sort files to ensure consistent partitioning 114 | files.sort() 115 | 116 | # Divide files into 4 blocks 117 | total_files = len(files) 118 | block_size = ceil(total_files / 4) 119 | 120 | # Determine start and end indices for the current block 121 | start_index = block_number * block_size 122 | end_index = min(start_index + block_size, total_files) 123 | 124 | # Process only the files in the current block 125 | for i in range(start_index, end_index): 126 | file_path = os.path.join("prompts", files[i]) 127 | process_file(file_path, tokenizer, endpoint_url) 128 | 129 | # Run the main function 130 | if __name__ == "__main__": 131 | main() --------------------------------------------------------------------------------