├── contentannotation ├── gemini_prompt.txt └── video2annotation.py ├── contentselection ├── content_taxonomy.json └── oracle.py ├── dataset-creation.png ├── dynamicfilters ├── videodynamismfiltering │ ├── Dockerfile │ └── check_static.py └── worddensityfiltering.py ├── finealignment └── video_alignment.py ├── finevideo.gif ├── logo.png ├── rawdataset ├── filter-yt-commons.py └── ytdlps3 │ ├── Dockerfile │ └── download_and_upload.py ├── readme.md └── videocategorization ├── content_taxonomy.json ├── create_prompts.py ├── launchTGI-Slurm.sh └── tgi_inference_client.py /contentannotation/gemini_prompt.txt: -------------------------------------------------------------------------------- 1 | Study the video and provide the following details about the video and the semantic scenes that compose it: 2 | 3 | - characterList: a list of characters that appear in the whole video and a visual description that should allow me to identify them just seeing an image of them. 4 | - scenes: a list of the scenes with the following properties: 5 | - start/end timestamps of the scene 6 | - list of all the characters that appear in the scene 7 | - list of all activities and their timestamps 8 | - list of all props and their timestamps 9 | - list of all video editing details and their start/end timestamps. Details include transitions, effects, music as well as suggestions like segments of the scene that could be removed and why 10 | - scene mood with notes on how the visuals, audio and context contribute to it. Use the following taxonomy returning only the name in your answer {"moods":{"Positive":[{"name":"Happy","description":"Feeling joyful, content, or delighted."},{"name":"Excited","description":"Feeling enthusiastic, energetic, or eager."},{"name":"Calm","description":"Feeling peaceful, relaxed, or serene."},{"name":"Grateful","description":"Feeling appreciative or thankful."},{"name":"Proud","description":"Feeling satisfied with one's achievements or the achievements of others."}],"Negative":[{"name":"Sad","description":"Feeling down, unhappy, or sorrowful."},{"name":"Angry","description":"Feeling irritated, frustrated, or furious."},{"name":"Anxious","description":"Feeling nervous, worried, or uneasy."},{"name":"Lonely","description":"Feeling isolated, disconnected, or abandoned."},{"name":"Bored","description":"Feeling uninterested, disengaged, or restless."}],"Neutral":[{"name":"Indifferent","description":"Feeling neither particularly positive nor negative."},{"name":"Content","description":"Feeling satisfied but not overly excited."},{"name":"Curious","description":"Feeling interested or inquisitive without strong emotion."},{"name":"Confused","description":"Feeling uncertain or unclear but without strong negative feelings."},{"name":"Pensive","description":"Feeling thoughtful or reflective without strong emotional engagement."}]}} 11 | - specific mood changing moments inside the scene, report the timestamp and what we transition from/to in any of the dimensions (visual / auditive) 12 | - scene narrative progression and plot development 13 | - specific narrative moments inside the scene. Report the timestamp and what happened 14 | - character interaction and dynamics descriptions and their start/end timestamps 15 | - specific thematic elements and descriptions 16 | - specific relevant happenings to create deeper meanings and subtexts not explicitly stated that contribute to the richness and depth of the content, timestamp and descriptions 17 | - dynamism score of the scene. Score between 0 and 1. 
1 is highly dynamic 18 | - audio - visual correlation score. Score between 0 and 1. 0 what we see is not correlated with the speech and 1 is highly correlated 19 | 20 | - storylines: a list of the different storylines found and which scenes belong to it. 21 | - Specify where is the climax (scene and timestamp) and if the content is being presented a narrative story, or is it more like a collection of facts or non-narrative information 22 | - if there are scenes not matching storylines, explain how those scenes contribute to the video 23 | - looking at the overall video and the storylines, which segments of the video could be trimmed to make it more dynamic? 24 | - q&a: a list of 5 questions/answers about the video that focus on fine details (objects and or activities), overall story reasoning and mood. Focus on Q&A aspects captured on the audio and the video whenever possible difficult to get only by looking at the transcription. -------------------------------------------------------------------------------- /contentannotation/video2annotation.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import google.generativeai as genai 3 | from openai import OpenAI 4 | import json 5 | import time 6 | from datetime import datetime 7 | from typing import List, Dict, Any, Optional, TypedDict 8 | import instructor 9 | import os 10 | import google.api_core.exceptions 11 | import argparse 12 | 13 | # 14 | # Given an input list of videos, this script downloads them from S3 and annotates the videos with Gemini 15 | # and structures the data with instructor using GPT4o underneath. 16 | # 17 | # The code is prepared to run as a standalone application: 18 | # The first parameter is size_chunk: it basically divide the list of videos in sublists of length size_chunk 19 | # The worker_number decides in which sublist of size size_chunk the current execution will be working on 20 | # --video-list is to specify the json file that contains a list of videoids as a JSON list. 
If that is not provided, it defaults to oracle_videos_server.json 21 | # 22 | 23 | 24 | ### CONFIG ### 25 | 26 | # Directories to download input videos and output annotation results 27 | input_directory = 'videos_minioracle/' 28 | output_directory = 'videos_minioracle_results/' 29 | bucket_name = '' 30 | 31 | GEMINI_PATH="/path/to/your/key/file" 32 | OPENAI_PATH="/path/to/your/key/file" 33 | ### 34 | 35 | 36 | ### Data Schema ### 37 | 38 | class Character(TypedDict): 39 | characterId: str 40 | name: str 41 | description: str 42 | 43 | class Timestamps(TypedDict): 44 | start_timestamp: str 45 | end_timestamp: str 46 | 47 | class Activity(TypedDict): 48 | description: str 49 | timestamp: Timestamps 50 | 51 | class Prop(TypedDict): 52 | name: str 53 | timestamp: Timestamps 54 | 55 | class VideoEditingDetail(TypedDict): 56 | description: str 57 | timestamps: Timestamps 58 | 59 | class KeyMoment(TypedDict): 60 | timestamp: str 61 | changeDescription: str 62 | 63 | class Mood(TypedDict): 64 | description: str 65 | keyMoments: List[KeyMoment] 66 | 67 | class NarrativeProgression(TypedDict): 68 | description: str 69 | timestamp: str 70 | 71 | class CharacterInteraction(TypedDict): 72 | characters: List[str] 73 | description: str 74 | 75 | class Scene(TypedDict): 76 | sceneId: int 77 | title: str 78 | timestamps: Timestamps 79 | cast: List[str] 80 | activities: List[Activity] 81 | props: List[Prop] 82 | videoEditingDetails: List[VideoEditingDetail] 83 | mood: Mood 84 | narrativeProgression: List[NarrativeProgression] 85 | characterInteraction: List[CharacterInteraction] 86 | thematicElements: str 87 | contextualRelevance: str 88 | dynamismScore: float 89 | audioVisualCorrelation: float 90 | 91 | class Climax(TypedDict): 92 | description: str 93 | timestamp: str 94 | 95 | class Storyline(TypedDict): 96 | description: str 97 | scenes: List[int] 98 | climax: Climax 99 | 100 | class QAndA(TypedDict): 101 | question: str 102 | answer: str 103 | 104 | class TrimmingSuggestion(TypedDict, total=False): 105 | timestamps: Timestamps 106 | description: str 107 | 108 | class Schema(TypedDict): 109 | title: str 110 | description: str 111 | characterList: List[Character] 112 | scenes: List[Scene] 113 | storylines: Storyline 114 | qAndA: List[QAndA] 115 | trimmingSuggestions: List[TrimmingSuggestion] 116 | 117 | ### 118 | 119 | class VideoProcessor: 120 | def __init__(self, gemini_api_key_path: str, openai_api_key_path: str): 121 | # Initialize API keys and clients 122 | self.gemini_apikey = self._read_api_key(gemini_api_key_path) 123 | self.openai_apikey = self._read_api_key(openai_api_key_path) 124 | genai.configure(api_key=self.gemini_apikey) 125 | self.clientOpenAI = OpenAI(api_key=self.openai_apikey) 126 | 127 | def _read_api_key(self, path: str) -> str: 128 | with open(path, "r") as file: 129 | return file.read().strip() 130 | 131 | 132 | def upload_video(self, file_path: str) -> Dict[str, Any]: 133 | print(f"Uploading file... 
{file_path}") 134 | try: 135 | video_file = genai.upload_file(path=file_path) 136 | 137 | while video_file.state.name == "PROCESSING": 138 | time.sleep(10) 139 | video_file = genai.get_file(video_file.name) 140 | 141 | if video_file.state.name == "FAILED": 142 | return {"error": "Upload failed", "video_file": None} 143 | return {"video_file": video_file} 144 | 145 | except Exception as e: 146 | return {"error": str(e), "video_file": None} 147 | 148 | 149 | 150 | def process_video(self, video_file: Any, addition_to_prompt=None) -> Dict[str, Optional[str]]: 151 | if "error" in video_file: 152 | return {"error": "video_file error: " + video_file["error"], "gemini_text": None} 153 | 154 | max_retries = 5 155 | attempt = 0 156 | delay = 2 # in seconds 157 | 158 | while attempt < max_retries: 159 | try: 160 | print(f"Processing {video_file['video_file'].display_name} (Attempt {attempt + 1})") 161 | prompt = open("gemini_prompt.txt", "r").read() 162 | if addition_to_prompt: 163 | print(f"\t adding addition to prompt: {addition_to_prompt}") 164 | prompt = prompt + addition_to_prompt 165 | model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest") 166 | response = model.generate_content( 167 | [video_file['video_file'], prompt], 168 | request_options={"timeout": 600}, 169 | safety_settings=[ 170 | {"category": genai.types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, 171 | "threshold": genai.types.HarmBlockThreshold.BLOCK_NONE}, 172 | {"category": genai.types.HarmCategory.HARM_CATEGORY_HARASSMENT, 173 | "threshold": genai.types.HarmBlockThreshold.BLOCK_NONE}, 174 | {"category": genai.types.HarmCategory.HARM_CATEGORY_HATE_SPEECH, 175 | "threshold": genai.types.HarmBlockThreshold.BLOCK_NONE}, 176 | {"category": genai.types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, 177 | "threshold": genai.types.HarmBlockThreshold.BLOCK_NONE} 178 | ] 179 | ) 180 | 181 | if not response.candidates: 182 | return {"error": "No candidates returned. Feedback: " + str(response.prompt_feedback), "gemini_text": None} 183 | #Cleaning up the analyzed file 184 | genai.delete_file(video_file['video_file'].name) 185 | 186 | return {"gemini_text": response.text} 187 | 188 | except google.api_core.exceptions.InternalServerError as e: 189 | print(f"InternalServerError occurred: {e}. Retrying in {delay} seconds...") 190 | attempt += 1 191 | time.sleep(delay) 192 | delay *= 2 # Exponential backoff 193 | 194 | except Exception as e: 195 | if "The read operation timed out" in str(e) or "record layer failure" in str(e): 196 | print(f"Gemini error: {str(e)}. 
Retrying in {delay} seconds...") 197 | attempt +=1 198 | time.sleep(delay) 199 | delay *= 2 200 | else: 201 | return {"error": str(e), "gemini_text": None} 202 | 203 | # If all retries fail 204 | return {"error": f"Failed after {max_retries} attempts due to InternalServerError / timeouts / SSL.", "gemini_text": None} 205 | 206 | 207 | 208 | def obtain_json(self, gemini_answer: Optional[str]) -> Dict[str, Optional[str]]: 209 | 210 | if gemini_answer is None or (isinstance(gemini_answer, dict) and "error" in gemini_answer): 211 | return {"error": gemini_answer.get("error") if gemini_answer else "No Gemini answer", "json_result": None} 212 | 213 | try: 214 | # Patch the OpenAI client 215 | client = instructor.from_openai(self.clientOpenAI) 216 | promptOpenAI = gemini_answer 217 | completion = client.chat.completions.create( 218 | model="gpt-4o-2024-08-06", 219 | response_model=Schema, 220 | messages=[ 221 | {"role": "user", "content": promptOpenAI}, 222 | ] 223 | ) 224 | 225 | return {"json_result": completion.json()} 226 | except Exception as e: 227 | return {"error": str(e), "json_result": None} 228 | def prep_return(self, final_answer=None, gemini_error = None, gemini_raw_result=None, 229 | instructor_error = None, instructor_raw_result=None): 230 | return { 231 | "final_answer": final_answer, 232 | "gemini": { 233 | "error": gemini_error, 234 | "raw_result": gemini_raw_result 235 | }, 236 | "instructor": { 237 | "error": instructor_error, 238 | "raw_result": instructor_raw_result 239 | } 240 | } 241 | 242 | def process(self, file_path: str) -> Dict[str, Any]: 243 | 244 | # Upload video to Gemini 245 | gemini_result = self.upload_video(file_path) 246 | if gemini_result.get("error"): 247 | return self.prep_return(gemini_error=gemini_result['error']) 248 | 249 | # Process video with Gemini 250 | gemini_text = self.process_video(gemini_result) 251 | if gemini_text.get("error"): 252 | return self.prep_return(gemini_error=gemini_text['error']) 253 | 254 | gemini_out_text = gemini_text.get("gemini_text") 255 | if gemini_out_text is None or gemini_text == "": 256 | return self.prep_return(gemini_error="Empty gemini answer", 257 | gemini_raw_result=gemini_out_text) 258 | 259 | 260 | # Obtain JSON from the instructor 261 | instructor_result = self.obtain_json(gemini_out_text) 262 | 263 | if "IncompleteOutputException" in instructor_result.get("error", "") or "ValidationError" in instructor_result.get("error", ""): 264 | print(f"\tRetrying full pipeline due to Instructor exception: {instructor_result['error']}") 265 | # Retry processing the video 266 | gemini_result = self.upload_video(file_path) 267 | gemini_text = self.process_video(gemini_result, addition_to_prompt=" be concise with your answer") 268 | print("\t retry completed") 269 | 270 | # Check if there was an error during reprocessing 271 | if gemini_text.get("error"): 272 | return self.prep_return(gemini_error=gemini_text['error']) 273 | gemini_out_text = gemini_text.get("gemini_text") 274 | if gemini_out_text is None or gemini_text == "": 275 | return self.prep_return(gemini_error="Empty gemini answer", 276 | gemini_raw_result=gemini_out_text) 277 | 278 | # Retry obtaining JSON from the instructor with the new Gemini text 279 | instructor_result = self.obtain_json(gemini_text.get("gemini_text")) 280 | 281 | 282 | 283 | # Prepare the final response 284 | final_answer = json.loads(instructor_result["json_result"]) if instructor_result["json_result"] and not instructor_result.get("error") else None 285 | return 
self.prep_return(final_answer=final_answer, 286 | gemini_raw_result=gemini_text.get("gemini_text"), 287 | instructor_raw_result=instructor_result.get("json_result",None), 288 | instructor_error=instructor_result.get("error",None)) 289 | 290 | # AWS S3 Configuration 291 | session = boto3.Session() 292 | s3_client = session.client('s3') 293 | 294 | # Ensure the input and output directories exist 295 | os.makedirs(input_directory, exist_ok=True) 296 | os.makedirs(output_directory, exist_ok=True) 297 | 298 | # Function to check if a result or error file exists for a video 299 | def result_exists(video_filename): 300 | video_id = os.path.splitext(video_filename)[0] 301 | result_file = os.path.join(output_directory, f"{video_id}.json") 302 | error_file = os.path.join(output_directory, f"errors_{video_id}.json") 303 | return os.path.exists(result_file) or os.path.exists(error_file) 304 | 305 | # Function to download video from S3 306 | def download_video_from_s3(video_key, local_path): 307 | bucket_name = '' 308 | try: 309 | s3_client.download_file(bucket_name, video_key, local_path) 310 | print(f"Downloaded {video_key} to {local_path}") 311 | return True 312 | except Exception as e: 313 | print(f"Failed to download {video_key} from S3: {e}") 314 | return False 315 | 316 | def process_single_video(video_id, worker_number): 317 | videos_path = 'path/' 318 | video_key = f'{videos_path}/{video_id}.mp4' 319 | video_filename = f'{video_id}.mp4' 320 | local_path = os.path.join(input_directory, video_filename) 321 | 322 | if result_exists(video_filename): 323 | print(f"Skipping {video_filename}, result already exists.") 324 | return 325 | 326 | # Download video from S3 327 | if not download_video_from_s3(video_key, local_path): 328 | # Handle download failure 329 | error_data = {"error": "File not found in S3"} 330 | error_file_path = os.path.join(output_directory, f"errors_{video_id}.json") 331 | with open(error_file_path, "w") as f: 332 | json.dump(error_data, f, indent=4) 333 | 334 | # Update status report for download failure 335 | timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 336 | status_report = f"{timestamp} - {video_id} - failed - File not found in S3\n" 337 | print(status_report) 338 | with open("status.txt", "a") as f: 339 | f.write(status_report) 340 | 341 | return 342 | 343 | # Process the video using VideoProcessor class 344 | processor = VideoProcessor(gemini_api_key_path=GEMINI_PATH, openai_api_key_path=OPENAI_PATH) 345 | result = processor.process(local_path) 346 | video_id = os.path.splitext(os.path.basename(local_path))[0] 347 | 348 | # Save final answer to JSON if available 349 | if result.get("final_answer") is not None: 350 | with open(os.path.join(output_directory, f"{video_id}.json"), "w") as f: 351 | json.dump(result["final_answer"], f, indent=4) 352 | status = "successful" 353 | else: 354 | status = "failed" 355 | 356 | # Save errors to JSON if any errors exist 357 | errors = {} 358 | if (result.get("gemini", {}).get("error") is not None) or (result.get("instructor", {}).get("error") is not None): 359 | gemini_raw = result["gemini"].get("raw_result") 360 | errors = { 361 | "gemini_error": result["gemini"].get("error"), 362 | "instructor_error": result["instructor"].get("error"), 363 | "gemini_raw_result": gemini_raw 364 | } 365 | with open(os.path.join(output_directory, f"errors_{video_id}.json"), "w") as f: 366 | json.dump(errors, f, indent=4) 367 | 368 | # Prepare the status report 369 | timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 370 | error_details = 
', '.join(filter(None, [result.get("gemini", {}).get("error"), result.get("instructor", {}).get("error")])) 371 | status_report = f"{timestamp} - {video_id} - {status} - {error_details if error_details else 'None'}\n" 372 | print(status_report) 373 | 374 | # Append the status report to status.txt 375 | if worker_number is None: 376 | with open("status.txt", "a") as f: 377 | f.write(status_report) 378 | else: 379 | with open(f"status/status_{worker_number}.txt", "a") as f: 380 | f.write(status_report) 381 | 382 | # Remove the video file after processing 383 | os.remove(local_path) 384 | print(f"Deleted local file {local_path} after processing.") 385 | 386 | def process_chunk(videos_to_process, size_chunk, worker_number): 387 | # Calculate start and end indices for this worker's chunk 388 | start_index = worker_number * size_chunk 389 | end_index = min(start_index + size_chunk, len(videos_to_process)) 390 | 391 | # Process videos in this worker's chunk 392 | for video_id in videos_to_process[start_index:end_index]: 393 | process_single_video(video_id, worker_number) 394 | 395 | if __name__ == "__main__": 396 | # Parse command-line arguments 397 | parser = argparse.ArgumentParser(description='Process videos in chunks.') 398 | parser.add_argument('size_chunk', type=int, help='Size of each chunk to process') 399 | parser.add_argument('worker_number', type=int, help='Worker number (zero-indexed)') 400 | parser.add_argument('--video_list', type=str, help='Optional video list file in JSON format') 401 | args = parser.parse_args() 402 | 403 | # Load the list of videos 404 | if args.video_list: 405 | with open(args.video_list, 'r') as f: 406 | videos_to_process = json.load(f) 407 | print(f"Using provided video list: {args.video_list}") 408 | else: 409 | with open('oracle_videos_server.json', 'r') as f: 410 | videos_to_process = json.load(f) 411 | print("Using default video list: oracle_videos_server.json") 412 | 413 | # Process the assigned chunk 414 | process_chunk(videos_to_process, args.size_chunk, args.worker_number) 415 | 416 | -------------------------------------------------------------------------------- /contentselection/content_taxonomy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Entertainment": { 3 | "Comedy": { 4 | "Stand-up": {}, 5 | "Sketches": {}, 6 | "Parodies": {} 7 | }, 8 | "Music": { 9 | "Music Videos": {}, 10 | "Covers": {}, 11 | "Remixes": {}, 12 | "Lyric Videos": {} 13 | }, 14 | "Movies & Trailers": { 15 | "Film Trailers": {}, 16 | "Short Films": {}, 17 | "Movie Reviews": {} 18 | }, 19 | "Gaming": { 20 | "Let's Plays": {}, 21 | "Game Reviews": {}, 22 | "Walkthroughs": {}, 23 | "Game Commentary": {} 24 | }, 25 | "Vlogs": { 26 | "Daily Vlogs": {}, 27 | "Travel Vlogs": {}, 28 | "Storytime": {} 29 | }, 30 | "Livestreams": { 31 | "Gaming Livestreams": {}, 32 | "Q&A Sessions": {}, 33 | "Event Livestreams": {} 34 | } 35 | }, 36 | "Education": { 37 | "Tutorials": { 38 | "Software Tutorials": {}, 39 | "DIY & Crafts": {}, 40 | "Cooking Tutorials": {} 41 | }, 42 | "Lectures & Talks": { 43 | "Academic Lectures": {}, 44 | "TED Talks": {}, 45 | "Motivational Talks": {} 46 | }, 47 | "Science & Technology": { 48 | "Science Explainers": {}, 49 | "Tech Reviews": {}, 50 | "Engineering Projects": {} 51 | }, 52 | "Language Learning": { 53 | "Language Lessons": {}, 54 | "Pronunciation Guides": {} 55 | }, 56 | "History & Culture": { 57 | "Documentaries": {}, 58 | "Cultural Explainers": {}, 59 | "Historical Analysis": {} 60 | }, 61 | "Business & 
Finance": { 62 | "Entrepreneurship": {}, 63 | "Investment Guides": {}, 64 | "Marketing Strategies": {} 65 | } 66 | }, 67 | "Lifestyle": { 68 | "Health & Fitness": { 69 | "Workout Routines": {}, 70 | "Nutrition Guides": {}, 71 | "Mental Health Tips": {} 72 | }, 73 | "Fashion & Beauty": { 74 | "Makeup Tutorials": {}, 75 | "Fashion Hauls": {}, 76 | "Skincare Routines": {} 77 | }, 78 | "Travel": { 79 | "Destination Guides": {}, 80 | "Travel Tips": {}, 81 | "Travel Vlogs": {} 82 | }, 83 | "Food & Cooking": { 84 | "Recipe Videos": {}, 85 | "Cooking Shows": {}, 86 | "Food Reviews": {} 87 | }, 88 | "Home & Garden": { 89 | "Home Improvement": {}, 90 | "Gardening Tips": {}, 91 | "Interior Design": {} 92 | }, 93 | "Parenting & Family": { 94 | "Parenting Tips": {}, 95 | "Family Vlogs": {}, 96 | "Childcare Advice": {} 97 | } 98 | }, 99 | "News & Politics": { 100 | "News Reports": { 101 | "Breaking News": {}, 102 | "Political News": {}, 103 | "World News": {} 104 | }, 105 | "Opinion & Commentary": { 106 | "Political Commentary": {}, 107 | "Social Commentary": {}, 108 | "Editorials": {} 109 | }, 110 | "Interviews": { 111 | "Celebrity Interviews": {}, 112 | "Political Interviews": {}, 113 | "Expert Interviews": {} 114 | }, 115 | "Debates": { 116 | "Political Debates": {}, 117 | "Social Issue Debates": {} 118 | } 119 | }, 120 | "Sports": { 121 | "Highlights & Replays": { 122 | "Game Highlights": {}, 123 | "Match Replays": {} 124 | }, 125 | "Sports Commentary": { 126 | "Analysis Shows": {}, 127 | "Sports Talk Shows": {} 128 | }, 129 | "Athlete Profiles": { 130 | "Career Highlights": {}, 131 | "Documentary Profiles": {} 132 | }, 133 | "Fitness & Training": { 134 | "Athlete Workouts": {}, 135 | "Training Techniques": {} 136 | } 137 | }, 138 | "Art & Creativity": { 139 | "Visual Arts": { 140 | "Painting Tutorials": {}, 141 | "Drawing Tutorials": {}, 142 | "Art Exhibitions": {} 143 | }, 144 | "Photography & Film": { 145 | "Photography Tips": {}, 146 | "Cinematography": {}, 147 | "Short Films": {} 148 | }, 149 | "Crafts & DIY": { 150 | "Home Crafts": {}, 151 | "DIY Projects": {}, 152 | "Upcycling": {} 153 | }, 154 | "Writing & Literature": { 155 | "Writing Tips": {}, 156 | "Book Reviews": {}, 157 | "Poetry Readings": {} 158 | } 159 | }, 160 | "Science & Technology": { 161 | "Tech Reviews": { 162 | "Gadget Reviews": {}, 163 | "Software Reviews": {} 164 | }, 165 | "Science Explainers": { 166 | "Physics": {}, 167 | "Biology": {}, 168 | "Chemistry": {} 169 | }, 170 | "Space Exploration": { 171 | "Astronomy": {}, 172 | "Space Missions": {} 173 | }, 174 | "Engineering": { 175 | "Mechanical Engineering": {}, 176 | "Electrical Engineering": {} 177 | }, 178 | "Environmental Science": { 179 | "Climate Change": {}, 180 | "Conservation Efforts": {} 181 | }, 182 | "Artificial Intelligence": { 183 | "AI Concepts": {}, 184 | "Machine Learning Tutorials": {} 185 | } 186 | }, 187 | "Automotive": { 188 | "Car Reviews": {}, 189 | "Car Modifications": {}, 190 | "Driving Tutorials": {}, 191 | "Motorsports": { 192 | "Racing Highlights": {}, 193 | "Motorsport Commentary": {} 194 | }, 195 | "Off-Roading": {} 196 | }, 197 | "Hobbies & Interests": { 198 | "Collecting": { 199 | "Toy Collections": {}, 200 | "Stamp Collections": {}, 201 | "Memorabilia": {} 202 | }, 203 | "Board Games & Puzzles": { 204 | "Gameplay Tutorials": {}, 205 | "Game Reviews": {} 206 | }, 207 | "Outdoor Activities": { 208 | "Camping": {}, 209 | "Hiking": {}, 210 | "Fishing": {} 211 | }, 212 | "Arts & Crafts": { 213 | "Knitting": {}, 214 | "Pottery": {}, 215 | 
"Scrapbooking": {} 216 | }, 217 | "Listicles & Rankings": { 218 | "Top 10 Videos": {}, 219 | "Best of [Category]": {}, 220 | "Ranked Lists": {}, 221 | "Must-See [Topic]": {}, 222 | "Buyer’s Guides": {} 223 | }, 224 | "Miscellaneous": { 225 | "ASMR": {}, 226 | "Unboxing Videos": {}, 227 | "Reaction Videos": {}, 228 | "Pranks": {}, 229 | "Social Experiments": {} 230 | } 231 | } 232 | } 233 | -------------------------------------------------------------------------------- /contentselection/oracle.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.preprocessing import MinMaxScaler 4 | import json 5 | 6 | # 7 | # Given a pandas dataframe with a list of videos and the metadata extracted from YT-Commons, 8 | # this script creates a new dataframe with a list of videoids that the target the hours of video that we want to collect. 9 | # 10 | 11 | ### CONFIG ### 12 | input_pkl = 'path_to_your_current_videos_df.pkl' 13 | output_pkl = 'path_to_your_output_df.pkl' 14 | taxonomy_path = 'content_taxonomy.json' 15 | target_hours = 4500 16 | ### 17 | 18 | # Step 1: Preprocess the Data 19 | def preprocess_df(df): 20 | # Fill NaNs with 0 or suitable values 21 | df['comment_count'] = df['comment_count'].fillna(0) 22 | df['view_count'] = df['view_count'].fillna(0) 23 | df['like_count'] = df['like_count'].fillna(0) 24 | df['channel_follower_count'] = df['channel_follower_count'].fillna(0) 25 | df['duration_seconds'] = df['duration_seconds'].fillna(0) 26 | 27 | # Normalize numerical columns for fair weighting 28 | scaler = MinMaxScaler() 29 | df[['comment_count', 'view_count', 'like_count']] = scaler.fit_transform( 30 | df[['comment_count', 'view_count', 'like_count']] 31 | ) 32 | 33 | return df 34 | 35 | # Step 2: Compute User Activity Score 36 | def compute_user_activity(df, weights=(0.2, 0.5, 0.3)): 37 | # Weights: 0.2 for comments, 0.5 for views, 0.3 for likes 38 | df['user_activity_score'] = ( 39 | weights[0] * df['comment_count'] + 40 | weights[1] * df['view_count'] + 41 | weights[2] * df['like_count'] 42 | ) 43 | return df 44 | 45 | # Step 3: Map Inferred Categories to Higher Taxonomy Levels 46 | # Note: this was not used in the final version of the content selection algorithm but is useful data that we let in the dataset. 47 | def map_to_parent_categories(df, taxonomy): 48 | """ 49 | Maps each inferred category in the DataFrame to its top-level parent category 50 | in the hierarchical taxonomy. 51 | 52 | :param df: DataFrame containing video data with an 'inferred_category' column. 53 | :param taxonomy: A nested dictionary representing the hierarchical taxonomy. 54 | :return: DataFrame with an added 'parent_category' column representing the top-level parent category. 55 | """ 56 | 57 | # Helper function to find the top-level parent category 58 | def find_top_parent_category(leaf_name, taxonomy): 59 | """ 60 | Finds the top-level parent category of a given leaf in the hierarchical taxonomy. 61 | 62 | :param leaf_name: A string representing the leaf node to search for. 63 | :param taxonomy: A dictionary representing the full hierarchical taxonomy. 64 | :return: The top-level parent category of the given leaf if found, else None. 
65 | """ 66 | def recursive_search(taxonomy, leaf_name, current_top_category): 67 | for category, subcategories in taxonomy.items(): 68 | if category == leaf_name: 69 | # Found the leaf node; return the top-level category 70 | return current_top_category 71 | if isinstance(subcategories, dict): 72 | # Continue searching deeper 73 | found_category = recursive_search(subcategories, leaf_name, current_top_category) 74 | if found_category: 75 | return found_category 76 | return None 77 | 78 | # Start the search with top-level categories 79 | for top_category, subcategories in taxonomy.items(): 80 | result = recursive_search(subcategories, leaf_name, top_category) 81 | if result: 82 | return result 83 | 84 | return None 85 | 86 | # Map each inferred category to its top-level parent category 87 | df['parent_category'] = df['inferred_category'].apply(lambda x: find_top_parent_category(x, taxonomy)) 88 | 89 | return df 90 | 91 | 92 | # Step 4: Select Videos for Diversity and Total Duration 93 | def select_videos(df, target_hours=4500): 94 | target_seconds = target_hours * 3600 # Convert hours to seconds 95 | selected_videos = pd.DataFrame() 96 | 97 | # Calculate the total number of inferred categories 98 | inferred_categories = df['inferred_category'].unique() 99 | total_categories = len(inferred_categories) 100 | 101 | # Calculate the initial target seconds per inferred category 102 | target_seconds_per_category = target_seconds / total_categories 103 | 104 | # Shuffle rows to mix categories and channels 105 | df = df.sample(frac=1, random_state=42).reset_index(drop=True) 106 | 107 | # Initialize dictionary to keep track of selected durations per inferred category 108 | category_durations = {category: 0 for category in inferred_categories} 109 | 110 | # Define a progressive penalty for repeated channels 111 | channel_penalty_increment = 0.1 # Incremental penalty for each additional video from the same channel 112 | 113 | # Process each inferred category 114 | for inferred_category in inferred_categories: 115 | category_df = df[df['inferred_category'] == inferred_category] 116 | 117 | # Sort by user activity score and channel follower count in reverse order 118 | category_df = category_df.sort_values( 119 | by=['user_activity_score', 'channel_follower_count'], 120 | ascending=[False, True] 121 | ) 122 | 123 | current_duration = 0 124 | channel_counter = {} 125 | 126 | for _, row in category_df.iterrows(): 127 | if current_duration >= target_seconds_per_category: 128 | break 129 | 130 | channel = row['channel'] 131 | 132 | # Calculate the penalty based on the number of videos already selected from this channel 133 | penalty_factor = 1 - (channel_counter.get(channel, 0) * channel_penalty_increment) 134 | penalty_factor = max(penalty_factor, 0) # Ensure penalty factor doesn't go negative 135 | 136 | # Apply penalty by using a probability check 137 | if np.random.rand() < penalty_factor: 138 | selected_videos = pd.concat([selected_videos, pd.DataFrame([row])]) 139 | current_duration += row['duration_seconds'] 140 | category_durations[inferred_category] += row['duration_seconds'] 141 | channel_counter[channel] = channel_counter.get(channel, 0) + 1 142 | 143 | # Update target duration if some categories can't meet the target 144 | remaining_seconds = target_seconds - selected_videos['duration_seconds'].sum() 145 | remaining_categories = total_categories - len(selected_videos['inferred_category'].unique()) 146 | if remaining_categories > 0: 147 | target_seconds_per_category = remaining_seconds / 
remaining_categories 148 | 149 | # Adjust to match exactly the target duration or close 150 | selected_videos = selected_videos.sort_values(by='duration_seconds', ascending=True) 151 | 152 | final_selected = pd.DataFrame() 153 | total_duration = 0 154 | 155 | for _, row in selected_videos.iterrows(): 156 | if total_duration + row['duration_seconds'] <= target_seconds: 157 | final_selected = pd.concat([final_selected, pd.DataFrame([row])]) 158 | total_duration += row['duration_seconds'] 159 | 160 | return final_selected 161 | 162 | def main_algorithm(df, taxonomy_file, target_hours = 4500): 163 | df = preprocess_df(df) 164 | df = compute_user_activity(df) 165 | 166 | # Load taxonomy from JSON file 167 | with open(taxonomy_file, 'r') as file: 168 | taxonomy = json.load(file) 169 | 170 | # Map inferred categories to their parent categories 171 | df = map_to_parent_categories(df, taxonomy) 172 | 173 | # Select videos based on updated criteria 174 | selected_videos = select_videos(df, target_hours=target_hours) 175 | 176 | print(f"Total selected videos: {len(selected_videos)}") 177 | print(f"Total duration (seconds): {selected_videos['duration_seconds'].sum()}") 178 | 179 | return selected_videos 180 | 181 | # Run the algorithm 182 | df = pd.read_pickle(input_pkl) 183 | selected_videos_df = main_algorithm(df, taxonomy_path, target_hours=target_hours) 184 | selected_videos_df.to_pickle(output_pkl) -------------------------------------------------------------------------------- /dataset-creation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/fineVideo/b961b6ade22910d041aa75451afa94e454bca372/dataset-creation.png -------------------------------------------------------------------------------- /dynamicfilters/videodynamismfiltering/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Python image with necessary packages 2 | FROM python:3.9-slim 3 | 4 | # Install ffmpeg and other dependencies 5 | RUN apt-get update && \ 6 | apt-get install -y ffmpeg && \ 7 | apt-get clean && \ 8 | rm -rf /var/lib/apt/lists/* 9 | 10 | # Set the working directory 11 | WORKDIR /app 12 | 13 | # Copy the Python script into the container 14 | COPY check_static.py . 
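# Example usage (a minimal sketch with hypothetical image and bucket names, not part of the repo):
# check_static.py reads its configuration from the VIDEO_BUCKET, BUCKET_VIDEO_FOLDER_PATH and VIDEO_IDS
# environment variables (normally injected by AWS Batch), so a local test run could look like:
#   docker build -t videodynamism-filter .
#   docker run -e VIDEO_BUCKET=my-bucket -e BUCKET_VIDEO_FOLDER_PATH=videos \
#              -e VIDEO_IDS=videoid1,videoid2 videodynamism-filter
# AWS credentials for boto3 must also be available inside the container (e.g. an IAM role or mounted credentials).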
15 | 16 | # Install Python dependencies 17 | RUN pip install boto3 ffmpeg-python 18 | 19 | # Command to run the script 20 | CMD ["python", "check_static.py"] 21 | -------------------------------------------------------------------------------- /dynamicfilters/videodynamismfiltering/check_static.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import subprocess 3 | import os 4 | import math 5 | 6 | # Initialize the S3 client 7 | s3 = boto3.client('s3') 8 | 9 | def download_video_from_s3(bucket, path, video_id): 10 | """Download a video from S3 given a video ID.""" 11 | # Safely handle filenames with hyphens and special characters 12 | video_file = f"./{video_id}.mp4" 13 | s3_key = f"{path}/{video_id}.mp4" 14 | try: 15 | s3.download_file(bucket, s3_key, video_file) 16 | print(f"Downloaded {video_file} from s3://{bucket}/{s3_key}") 17 | return video_file 18 | except Exception as e: 19 | print(f"Error downloading {video_file}: {e}") 20 | return None 21 | 22 | def check_static_video(video_file, segment_duration=60, freeze_n=0.05, freeze_d=50, threshold=0.4): 23 | """Use ffmpeg freezedetect to check if a video has significant static content.""" 24 | 25 | # Get video duration using ffprobe 26 | try: 27 | result = subprocess.run( 28 | ["ffprobe", "-v", "error", "-show_entries", "format=duration", 29 | "-of", "default=noprint_wrappers=1:nokey=1", video_file], 30 | capture_output=True, text=True 31 | ) 32 | video_duration = float(result.stdout.strip()) 33 | except Exception as e: 34 | print(f"Error getting video duration for {video_file}: {e}") 35 | return None 36 | 37 | # Calculate the number of segments to analyze 38 | num_segments = math.ceil(video_duration / segment_duration) 39 | freeze_count = 0 40 | 41 | # Analyze video in segments 42 | for start_time in range(0, int(video_duration), segment_duration): 43 | try: 44 | command = [ 45 | "ffmpeg", "-hide_banner", "-ss", str(start_time), "-i", video_file, 46 | "-t", str(segment_duration), "-vf", f"freezedetect=n={freeze_n}:d={freeze_d}", "-an", "-f", "null", "-" 47 | ] 48 | result = subprocess.run(command, capture_output=True, text=True) 49 | 50 | # Check the stderr output for freeze detection 51 | if "freezedetect" in result.stderr: 52 | print(f"Static content detected in segment starting at {start_time} of {video_file}.") 53 | freeze_count += 1 54 | except Exception as e: 55 | print(f"Error processing segment starting at {start_time} of {video_file}: {e}") 56 | return None 57 | 58 | # Calculate the percentage of segments with freezes 59 | freeze_percentage = freeze_count / num_segments 60 | 61 | print(f"Freeze percentage for {video_file}: {freeze_percentage:.2%}") 62 | 63 | # Determine if the video is considered static based on threshold 64 | return freeze_percentage >= threshold 65 | 66 | def upload_result_to_s3(bucket, video_id, is_static): 67 | """Upload the result to S3 based on whether the video is static or dynamic.""" 68 | s3_key = f"{'static' if is_static else 'dynamic'}/{video_id}.txt" 69 | try: 70 | s3.put_object(Bucket=bucket, Key=s3_key, Body="") 71 | print(f"Uploaded result to s3://{bucket}/{s3_key}") 72 | except Exception as e: 73 | print(f"Error uploading result for {video_id}: {e}") 74 | 75 | def main(): 76 | # Environment variables set in AWS Batch 77 | bucket = os.environ.get("VIDEO_BUCKET") 78 | video_ids = os.environ.get("VIDEO_IDS").split(",") 79 | video_path = os.environ.get("BUCKET_VIDEO_FOLDER_PATH") 80 | 81 | for video_id in video_ids: 82 | # Download video from S3 
83 | video_file = download_video_from_s3(bucket, video_path, video_id) 84 | if not video_file: 85 | continue 86 | 87 | # Check if the video is static 88 | is_static = check_static_video(video_file) 89 | if is_static is None: 90 | continue 91 | 92 | # Upload result to S3 93 | upload_result_to_s3(bucket, video_id, is_static) 94 | 95 | # Clean up downloaded video file 96 | os.remove(video_file) 97 | 98 | if __name__ == "__main__": 99 | main() 100 | -------------------------------------------------------------------------------- /dynamicfilters/worddensityfiltering.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # 4 | # Given a pandas dataframe with a list of videos and the metadata extracted from YT-Commons, 5 | # this script creates the columns duration_seconds and word_density with the goal to study word_density across the dataset 6 | # Finally it drops all entries in the dataframe with word density < 0.5 7 | # 8 | 9 | ### CONFIG ### 10 | input_pkl = 'path_to_your_input_df.pkl' 11 | output_pkl = 'path_to_your_output_df.pkl' 12 | visualize = False # Toggle to true to inspect some results close to 1 and 0.5 word density values. 13 | ### 14 | 15 | 16 | 17 | df = pd.read_pickle(input_pkl) 18 | 19 | #Adding word_density and duration_seconds to the dataframe 20 | def duration_to_seconds(duration): 21 | if pd.isnull(duration): 22 | return 0 # or np.nan or another default 23 | parts = duration.split(':') 24 | parts = [int(p) for p in parts] 25 | if len(parts) == 3: # hh:mm:ss 26 | return parts[0] * 3600 + parts[1] * 60 + parts[2] 27 | elif len(parts) == 2: # mm:ss 28 | return parts[0] * 60 + parts[1] 29 | elif len(parts) == 1: # ss 30 | return parts[0] 31 | else: 32 | return 0 # or np.nan if format is unrecognized 33 | 34 | # Apply the conversion function to the 'duration_string' column 35 | df['duration_seconds'] = df['duration_string'].apply(duration_to_seconds) 36 | 37 | # Calculate word density 38 | # Word density is the number of words per second, so we divide word_count by duration_seconds 39 | df['word_density'] = df.apply(lambda row: row['word_count'] / row['duration_seconds'] 40 | if row['duration_seconds'] > 0 else 0, axis=1) 41 | 42 | 43 | 44 | if visualize: 45 | from tabulate import tabulate 46 | #Visualizing some results 47 | def get_samples_near_target(df, target, range_width=0.1, num_samples=3): 48 | """ 49 | Get samples from the DataFrame that have 'word_density' close to the target value. 50 | 51 | :param df: DataFrame to sample from. 52 | :param target: The target word density to find samples around. 53 | :param range_width: The width of the range around the target value. 54 | :param num_samples: Number of samples to return. 55 | :return: A DataFrame with samples close to the target density. 
56 | """ 57 | # Define the range around the target 58 | lower_bound = target - range_width 59 | upper_bound = target + range_width 60 | 61 | # Filter and sample 62 | samples = df[(df['word_density'] >= lower_bound) & (df['word_density'] <= upper_bound)].sample(n=num_samples, random_state=1) 63 | return samples 64 | 65 | close_to_1 = get_samples_near_target(df, 1, num_samples = 100)[['video_id', 'duration_string', 'title']] 66 | print(tabulate(close_to_1,headers='keys', tablefmt='pretty', showindex=False)) 67 | 68 | close_to_05 = get_samples_near_target(df, 0.5, num_samples = 100)[['video_id', 'duration_string', 'title']] 69 | print(tabulate(close_to_05,headers='keys', tablefmt='pretty', showindex=False)) 70 | 71 | 72 | # We cut at 0.5 73 | df = df.loc[df['word_density'] > 0.5] 74 | print(f"Total videos: {len(df)}") 75 | df.to_pickle(output_pkl) -------------------------------------------------------------------------------- /finealignment/video_alignment.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List 4 | from scenedetect import VideoManager, SceneManager 5 | from scenedetect.detectors import ContentDetector 6 | from scenedetect.frame_timecode import FrameTimecode 7 | import argparse 8 | import re 9 | import json 10 | import boto3 11 | from datetime import datetime 12 | 13 | # 14 | # Given an input list of videos, this script downloads them from S3 and aligns the metadata from those videos generated with video2annotation.py with the videos itself. 15 | # 16 | # The code is prepared to run as a standalone application: 17 | # The first parameter is size_chunk: it basically divide the list of videos in sublists of length size_chunk 18 | # The worker_number decides in which sublist of size size_chunk the current execution will be working on 19 | # --video-list is to specify the json file that contains a list of videoids as a JSON list. 
If that is not provided, it defaults to video_alignment_to_process.json 20 | # 21 | 22 | 23 | ### CONFIG ### 24 | bucket_name = '' 25 | video_folder_path = 'videos_minioracle/' 26 | json_folder_path = 'videos_minioracle_results/' 27 | output_folder_path = 'results_minioracle_aligned/' 28 | ### 29 | 30 | # AWS S3 Configuration - specify your personal profile 31 | session = boto3.Session() 32 | s3_client = session.client('s3') 33 | 34 | # Function to download video from S3 35 | def download_video_from_s3(video_key, local_path): 36 | try: 37 | s3_client.download_file(bucket_name, video_key, local_path) 38 | print(f"Downloaded {video_key} to {local_path}") 39 | return True 40 | except Exception as e: 41 | print(f"Failed to download {video_key} from S3: {e}") 42 | return False 43 | 44 | 45 | def handle_error(video_id: str, error_message: str, output_folder_path: str, worker_number: str): 46 | """Handle errors by creating an error file and updating the status report.""" 47 | error_data = { 48 | "error": error_message, 49 | "video_id": video_id, 50 | "worker_number": worker_number 51 | } 52 | error_file_path = os.path.join(output_folder_path, f"errors_{video_id}.json") 53 | with open(error_file_path, "w") as f: 54 | json.dump(error_data, f, indent=4) 55 | 56 | # Update status report for failure 57 | timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 58 | status_report = f"{timestamp} - {video_id} - failed - {error_message}\n" 59 | print(status_report) 60 | with open(f"status/status_alignment_{worker_number}.txt", "a") as f: 61 | f.write(status_report) 62 | 63 | 64 | def time_to_frametimecode(time_str: str, fps: float, scene_end_time: FrameTimecode = None, filename: str = "unknown_file", worker_number: str = None) -> str: 65 | """Convert mm:ss or ss time format to FrameTimecode, or handle special cases like 'end'.""" 66 | # Define special cases 67 | if time_str == "end": 68 | if scene_end_time is not None: 69 | return scene_end_time.get_timecode() 70 | else: 71 | raise ValueError("time_str is end and no replacement for scene_end_time provided") 72 | 73 | special_cases = ["", "n/a", "varies", "throughout scene", "throughout the scene", 74 | "end", "throughout", "not present", "not applicable"] 75 | if time_str.lower() in special_cases or re.match(r"scene\s\d+", time_str.lower()): 76 | return None 77 | 78 | match = re.match(r"(\d+)s$", time_str.lower()) 79 | if match: 80 | time_str = match.group(1) 81 | if 'around ' in time_str: 82 | time_str = time_str.split('around ')[0] 83 | if '~' in time_str: 84 | time_str = time_str.split('~')[0] 85 | if '+' in time_str: 86 | time_str = time_str.split('+')[0] 87 | if '-' in time_str: 88 | time_str = time_str.split("-")[0] 89 | if ' ' in time_str and ":" in time_str: 90 | time_str = time_str.split(" ")[0] 91 | if ":" in time_str: 92 | parts = time_str.split(":") 93 | if len(parts) == 3: 94 | hours, minutes, seconds = parts 95 | elif len(parts) == 2: 96 | hours = 0 97 | minutes, seconds = parts 98 | elif len(parts) == 1: 99 | hours = 0 100 | minutes = 0 101 | seconds = parts[0] 102 | else: 103 | raise ValueError(f"Invalid timestamp format: {time_str}") 104 | 105 | if '.' 
in seconds: 106 | seconds = seconds.split(".")[0] 107 | 108 | match = re.match(r"^\d+", seconds) 109 | if match: 110 | seconds = int(match.group()) 111 | else: 112 | raise ValueError(f"Invalid timestamp format: {time_str}") 113 | 114 | 115 | total_seconds = float(hours) * 3600 + float(minutes) * 60 + float(seconds) 116 | else: 117 | try: 118 | total_seconds = float(time_str) 119 | except ValueError: 120 | raise ValueError(f"Invalid timestamp format: {time_str}") 121 | return FrameTimecode(timecode=total_seconds, fps=fps).get_timecode() 122 | 123 | 124 | def adjust_scene_boundaries(video_path, initial_scenes, video_id, worker_number): 125 | """Adjust scene boundaries based on scene detection.""" 126 | # Initialize video manager and scene manager 127 | video_manager = VideoManager([video_path]) 128 | scene_manager = SceneManager() 129 | scene_manager.add_detector(ContentDetector(threshold=15.0)) # Adjust threshold for sensitivity 130 | 131 | # Start the video manager and obtain FPS 132 | video_manager.start() 133 | fps = video_manager.get_framerate() # Get FPS from VideoManager 134 | # print(f"Detected FPS: {fps}") 135 | 136 | # Get total frames using duration in seconds and fps 137 | duration_seconds = video_manager.get_duration()[0].get_seconds() 138 | total_frames = int(duration_seconds * fps) 139 | last_frame_timecode = FrameTimecode(timecode=total_frames, fps=fps).get_timecode().split(".")[0].split(":") 140 | last_frame_timecode = last_frame_timecode[1] + ":" + last_frame_timecode[2] 141 | 142 | adjusted_scenes = [] 143 | 144 | for idx, initial_scene in enumerate(initial_scenes): 145 | 146 | if idx == len(initial_scenes) - 1: 147 | #Hack to avoid issues with answers that signal the last timestamp as 'end' 148 | initial_scene['timestamps']['end_timestamp'] = last_frame_timecode 149 | # print(last_frame_timecode) 150 | 151 | start_timecode = time_to_frametimecode(initial_scene['timestamps']['start_timestamp'], fps, filename=video_id, worker_number = worker_number) 152 | end_timecode = time_to_frametimecode(initial_scene['timestamps']['end_timestamp'], fps, filename=video_id, worker_number = worker_number) 153 | 154 | # Ensure all FrameTimecode objects use the same fps 155 | start_frame_number = int(max(0, FrameTimecode(timecode=start_timecode, fps=fps).get_frames() - 2 * fps)) 156 | end_frame_number = int(min(total_frames, FrameTimecode(timecode=end_timecode, fps=fps).get_frames() + 2 * fps)) 157 | 158 | search_start = FrameTimecode(timecode=start_frame_number, fps=fps) 159 | search_end = FrameTimecode(timecode=end_frame_number, fps=fps) 160 | 161 | # Seek to the start frame for detection using FrameTimecode 162 | video_manager.seek(search_start) 163 | scene_manager.detect_scenes(frame_source=video_manager, end_time=search_end.get_seconds()) 164 | 165 | detected_scenes = scene_manager.get_scene_list() 166 | 167 | # Find closest detected boundaries, default to original timecodes if no match found 168 | adjusted_start_timecode = start_timecode 169 | adjusted_end_timecode = end_timecode 170 | 171 | if detected_scenes: 172 | closest_start = min(detected_scenes, key=lambda x: abs(x[0].get_frames() - FrameTimecode(timecode=start_timecode, fps=fps).get_frames()), default=None) 173 | closest_end = min(detected_scenes, key=lambda x: abs(x[1].get_frames() - FrameTimecode(timecode=end_timecode, fps=fps).get_frames()), default=None) 174 | 175 | if closest_start and abs(closest_start[0].get_frames() - FrameTimecode(timecode=start_timecode, fps=fps).get_frames()) < 2 * fps: 176 | 
adjusted_start_timecode = closest_start[0].get_timecode() 177 | distance = abs(closest_start[0].get_seconds() - FrameTimecode(timecode=start_timecode, fps=fps).get_seconds()) 178 | if distance > 2: 179 | print(f"\t adjusting start timestamp by {distance:.2f} seconds") 180 | print(f"\t\tFrom: {start_timecode} to {adjusted_start_timecode}" ) 181 | if distance >=5: 182 | raise ValueError(f"Large start timestamp adjustment ({distance:.2f} seconds) required for scene {idx+1}") 183 | 184 | if closest_end and abs(closest_end[1].get_frames() - FrameTimecode(timecode=end_timecode, fps=fps).get_frames()) < 2 * fps: 185 | distance = abs(closest_end[1].get_seconds() - FrameTimecode(timecode=end_timecode, fps=fps).get_seconds()) 186 | adjusted_end_timecode = closest_end[1].get_timecode() 187 | if distance > 2: 188 | print(f"\t adjusting end timestamp by {distance:.2f} seconds") 189 | print(f"\t\tFrom: {end_timecode} to {adjusted_end_timecode}" ) 190 | if distance >=5: 191 | raise ValueError(f"Large start timestamp adjustment ({distance:.2f} seconds) required for scene {idx+1}") 192 | 193 | # Update the JSON with FrameTimecode formatted as HH:MM:SS:FF 194 | initial_scene['timestamps']['start_timestamp'] = adjusted_start_timecode 195 | initial_scene['timestamps']['end_timestamp'] = adjusted_end_timecode 196 | 197 | adjusted_scenes.append(initial_scene) 198 | 199 | # Ensure continuity between scenes 200 | if idx > 0: 201 | previous_scene_end = FrameTimecode(timecode=adjusted_scenes[idx - 1]['timestamps']['end_timestamp'], fps=fps) 202 | current_scene_start = FrameTimecode(timecode=adjusted_start_timecode, fps=fps) 203 | 204 | # if current_scene_start.get_frames() <= previous_scene_end.get_frames(): 205 | # Set start of current scene to be exactly the frame after the end of the previous scene 206 | new_start_timecode = previous_scene_end.get_frames() + 1 207 | adjusted_scenes[idx]['timestamps']['start_timestamp'] = FrameTimecode(timecode=new_start_timecode, fps=fps).get_timecode() 208 | 209 | frame_adjustment = abs(current_scene_start.get_frames() - new_start_timecode) 210 | if frame_adjustment > 25: 211 | print(f"\t\tWARNING: adjusting a scene start by {frame_adjustment} frames") 212 | if frame_adjustment > 125: 213 | raise ValueError(f"Large frame adjustment ({frame_adjustment} frames) required for scene {idx+1}") 214 | 215 | 216 | video_manager.release() 217 | return fps, adjusted_scenes 218 | 219 | def update_timestamps_in_json(data: dict, fps: float, video_id: str, worker_number: str) -> dict: 220 | """Update all timestamp fields in the JSON data to FrameTimecode format and ensure they stay within scene boundaries.""" 221 | # Update timestamps in scenes 222 | for scene in data.get('scenes', []): 223 | scene_start = FrameTimecode(timecode=scene['timestamps']['start_timestamp'], fps=fps) 224 | scene_end = FrameTimecode(timecode=scene['timestamps']['end_timestamp'], fps=fps) 225 | 226 | def enforce_within_boundaries(timestamp, start, end): 227 | if timestamp is None: 228 | return None 229 | frame_timecode = FrameTimecode(timecode=timestamp, fps=fps) 230 | if frame_timecode.get_frames() < start.get_frames(): 231 | return start.get_timecode() 232 | elif frame_timecode.get_frames() > end.get_frames(): 233 | return end.get_timecode() 234 | else: 235 | return timestamp 236 | 237 | # Update activities timestamps 238 | for activity in scene.get('activities', []): 239 | if 'timestamp' in activity: 240 | if 'start_timestamp' in activity['timestamp']: 241 | activity['timestamp']['start_timestamp'] = 
enforce_within_boundaries( 242 | time_to_frametimecode(activity['timestamp']['start_timestamp'], fps, filename=video_id, scene_end_time=scene_end, worker_number = worker_number), scene_start, scene_end 243 | ) 244 | if 'end_timestamp' in activity['timestamp']: 245 | activity['timestamp']['end_timestamp'] = enforce_within_boundaries( 246 | time_to_frametimecode(activity['timestamp']['end_timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number), scene_start, scene_end 247 | ) 248 | 249 | # Update props timestamps 250 | for prop in scene.get('props', []): 251 | if 'timestamp' in prop: 252 | if 'start_timestamp' in prop['timestamp']: 253 | prop['timestamp']['start_timestamp'] = enforce_within_boundaries( 254 | time_to_frametimecode(prop['timestamp']['start_timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number), scene_start, scene_end 255 | ) 256 | if 'end_timestamp' in prop['timestamp']: 257 | prop['timestamp']['end_timestamp'] = enforce_within_boundaries( 258 | time_to_frametimecode(prop['timestamp']['end_timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number), scene_start, scene_end 259 | ) 260 | 261 | # Update video editing details timestamps 262 | for video_editing in scene.get('videoEditingDetails', []): 263 | if 'timestamps' in video_editing: 264 | if 'start_timestamp' in video_editing['timestamps']: 265 | video_editing['timestamps']['start_timestamp'] = enforce_within_boundaries( 266 | time_to_frametimecode(video_editing['timestamps']['start_timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number), scene_start, scene_end 267 | ) 268 | if 'end_timestamp' in video_editing['timestamps']: 269 | video_editing['timestamps']['end_timestamp'] = enforce_within_boundaries( 270 | time_to_frametimecode(video_editing['timestamps']['end_timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number), scene_start, scene_end 271 | ) 272 | 273 | # Update mood key moments timestamps 274 | for key_moment in scene.get('mood', {}).get('keyMoments', []): 275 | if 'timestamp' in key_moment: 276 | key_moment['timestamp'] = enforce_within_boundaries( 277 | time_to_frametimecode(key_moment['timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number), scene_start, scene_end 278 | ) 279 | 280 | # Update narrative progression timestamps 281 | for narrative in scene.get('narrativeProgression', []): 282 | if 'timestamp' in narrative: 283 | narrative['timestamp'] = enforce_within_boundaries( 284 | time_to_frametimecode(narrative['timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number), scene_start, scene_end 285 | ) 286 | 287 | # Update storylines climax timestamps 288 | if 'storylines' in data and 'climax' in data['storylines'] and 'timestamp' in data['storylines']['climax']: 289 | data['storylines']['climax']['timestamp'] = time_to_frametimecode(data['storylines']['climax']['timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number) 290 | 291 | # Update trimming suggestions timestamps 292 | for trimming in data.get('trimmingSuggestions', []): 293 | if 'timestamps' in trimming: 294 | if 'start_timestamp' in trimming['timestamps']: 295 | trimming['timestamps']['start_timestamp'] = enforce_within_boundaries( 296 | time_to_frametimecode(trimming['timestamps']['start_timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = 
worker_number), scene_start, scene_end 297 | ) 298 | if 'end_timestamp' in trimming['timestamps']: 299 | trimming['timestamps']['end_timestamp'] = enforce_within_boundaries( 300 | time_to_frametimecode(trimming['timestamps']['end_timestamp'], fps, filename=video_id, scene_end_time=scene_end,worker_number = worker_number), scene_start, scene_end 301 | ) 302 | 303 | return data 304 | 305 | def result_exists(video_filename,output_directory): 306 | video_id = os.path.splitext(video_filename)[0] 307 | result_file = os.path.join(output_directory, f"{video_id}.json") 308 | error_file = os.path.join(output_directory, f"errors_{video_id}.json") 309 | return os.path.exists(result_file) or os.path.exists(error_file) 310 | 311 | def process_single_video(video_id, worker_number): 312 | s3_folder_videos = 'videos/' 313 | video_key = f'{s3_folder_videos}/{video_id}.mp4' 314 | video_filename = f'{video_id}.mp4' 315 | video_local_path = os.path.join(video_folder_path, video_filename) 316 | if result_exists(video_filename,output_folder_path): 317 | print(f"Skipping {video_filename}, result already exists.") 318 | return 319 | 320 | # Download video from S3 321 | if not download_video_from_s3(video_key, video_local_path): 322 | # Handle download failure 323 | error_data = {"error": "File not found in S3"} 324 | error_file_path = os.path.join(output_folder_path, f"errors_{video_id}.json") 325 | with open(error_file_path, "w") as f: 326 | json.dump(error_data, f, indent=4) 327 | 328 | # Update status report for download failure 329 | timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 330 | status_report = f"{timestamp} - {video_id} - failed - File not found in S3\n" 331 | print(status_report) 332 | with open(f"status/status_alignment_{worker_number}.txt", "a") as f: 333 | f.write(status_report) 334 | 335 | return 336 | 337 | # Construct paths 338 | json_path = os.path.join(json_folder_path, f"{video_id}.json") 339 | json_result_path = os.path.join(output_folder_path, f"{video_id}.json") 340 | 341 | # Load JSON file 342 | with open(json_path, 'r') as json_file: 343 | video_data = json.load(json_file) 344 | 345 | try: 346 | # Adjust scene boundaries using PySceneDetect to determine FPS 347 | fps, adjusted_scenes = adjust_scene_boundaries(video_local_path, video_data['scenes'], video_id, str(worker_number)) 348 | 349 | # Update scenes in the original data 350 | video_data['scenes'] = adjusted_scenes 351 | video_data['fps'] = fps 352 | 353 | # Update all timestamps to FrameTimecode format 354 | video_data = update_timestamps_in_json(video_data, fps, video_id, str(worker_number)) 355 | 356 | # Write updated JSON back to file 357 | with open(json_result_path, 'w') as json_file: 358 | json.dump(video_data, json_file, indent=4) 359 | 360 | print(f"Processed video {video_id}.") 361 | 362 | # Prepare the status report 363 | timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 364 | status_report = f"{timestamp} - {video_id} - complete\n" 365 | print(status_report) 366 | 367 | # Append the status report to status.txt 368 | if worker_number is None: 369 | with open("status_alignment.txt", "a") as f: 370 | f.write(status_report) 371 | else: 372 | with open(f"status/status_alignment_{worker_number}.txt", "a") as f: 373 | f.write(status_report) 374 | 375 | except Exception as e: 376 | # Handle any errors in adjusting scenes or updating timestamps 377 | error_data = { 378 | "error": str(e), 379 | "video_id": video_id, 380 | "worker_number": worker_number 381 | } 382 | error_file_path = 
os.path.join(output_folder_path, f"errors_{video_id}.json") 383 | with open(error_file_path, "w") as f: 384 | json.dump(error_data, f, indent=4) 385 | 386 | # Update status report for failure 387 | timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 388 | status_report = f"{timestamp} - {video_id} - failed - Error during processing: {str(e)}\n" 389 | print(status_report) 390 | with open(f"status/status_alignment_{worker_number}.txt", "a") as f: 391 | f.write(status_report) 392 | 393 | finally: 394 | # Remove the video file after processing, even if an error occurred 395 | if os.path.exists(video_local_path): 396 | os.remove(video_local_path) 397 | print(f"Deleted local file {video_local_path} after processing.") 398 | 399 | 400 | def process_chunk(videos_to_process, size_chunk, worker_number): 401 | # Calculate start and end indices for this worker's chunk 402 | start_index = worker_number * size_chunk 403 | end_index = min(start_index + size_chunk, len(videos_to_process)) 404 | 405 | # Process videos in this worker's chunk 406 | for video_id in videos_to_process[start_index:end_index]: 407 | process_single_video(video_id, worker_number) 408 | 409 | if __name__ == "__main__": 410 | # Parse command-line arguments 411 | parser = argparse.ArgumentParser(description='Process videos in chunks.') 412 | parser.add_argument('size_chunk', type=int, help='Size of each chunk to process') 413 | parser.add_argument('worker_number', type=int, help='Worker number (zero-indexed)') 414 | parser.add_argument('--video_list', type=str, help='Optional video list file in JSON format') 415 | args = parser.parse_args() 416 | 417 | # Load the list of videos 418 | if args.video_list: 419 | with open(args.video_list, 'r') as f: 420 | videos_to_process = json.load(f) 421 | print(f"Using provided video list: {args.video_list}") 422 | else: 423 | with open('video_alignment_to_process.json', 'r') as f: 424 | videos_to_process = json.load(f) 425 | print("Using default video list: video_alignment_to_process.json") 426 | 427 | # Process the assigned chunk 428 | process_chunk(videos_to_process, args.size_chunk, args.worker_number) 429 | 430 | 431 | 432 | 433 | -------------------------------------------------------------------------------- /finevideo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/fineVideo/b961b6ade22910d041aa75451afa94e454bca372/finevideo.gif -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/fineVideo/b961b6ade22910d041aa75451afa94e454bca372/logo.png -------------------------------------------------------------------------------- /rawdataset/filter-yt-commons.py: -------------------------------------------------------------------------------- 1 | from huggingface_hub import snapshot_download 2 | import pandas as pd 3 | import pyarrow.parquet as pq 4 | import os 5 | 6 | # 7 | # This script downloads YTCommons dataset from Hugging Face and parses some relevant fields of each video to finally store them in a dataframe 8 | # Be careful - this script requires a decent amount of RAM to work. 
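# (Most of the memory is used by pq.read_table() loading each Parquet file in full before any filtering;
#  a lower-memory variant could pass a columns= list to pq.read_table() so only the needed fields are read.)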
9 | # 10 | 11 | 12 | ### CONFIG ### 13 | dataset_path = './Youtube-Commons/' 14 | output_pkl = 'en_ycommons.pkl' 15 | ### 16 | 17 | 18 | 19 | def read_filtered_parquet_files(folder_path, fields, filters=None): 20 | """ 21 | Reads specified fields from all Parquet files in a folder with filtering and combines them into a single DataFrame. 22 | 23 | Parameters: 24 | folder_path (str): The path to the folder containing Parquet files. 25 | fields (list): List of fields to read from the Parquet files. 26 | filters (list): List of tuples for filtering, e.g., [('column_name', '==', value)] 27 | 28 | Returns: 29 | pd.DataFrame: A DataFrame containing the specified fields from all filtered Parquet files. 30 | """ 31 | # List to store DataFrames 32 | dataframes = [] 33 | 34 | # Iterate over all files in the folder 35 | for file_name in os.listdir(folder_path): 36 | if file_name.endswith('.parquet'): 37 | file_path = os.path.join(folder_path, file_name) 38 | print(f"Processing file: {file_path}") 39 | 40 | # Read the entire Parquet file 41 | df = pq.read_table(file_path).to_pandas() 42 | 43 | # Apply filters if provided 44 | if filters: 45 | for column, operator, value in filters: 46 | if operator == '==': 47 | df = df[df[column] == value] 48 | elif operator == '>': 49 | df = df[df[column] > value] 50 | elif operator == '<': 51 | df = df[df[column] < value] 52 | # Add other operators as needed 53 | 54 | # Check if 'word_count' column exists and filter rows with word_count > 50 55 | if 'word_count' in df.columns: 56 | df = df[df['word_count'] > 50] 57 | 58 | # Handle 'source_language' and 'language_id_method' fields 59 | if 'source_language' not in df.columns and 'language_id_method' in df.columns: 60 | df['source_language'] = df['language_id_method'] 61 | elif 'source_language' in df.columns: 62 | pass # 'source_language' already exists, no action needed 63 | 64 | # Ensure 'source_language' is in the fields to select 65 | if 'source_language' not in fields: 66 | fields.append('source_language') 67 | 68 | # Select only the specified fields 69 | df = df[fields] 70 | dataframes.append(df) 71 | 72 | # Concatenate all DataFrames 73 | combined_df = pd.concat(dataframes, ignore_index=True) 74 | return combined_df 75 | 76 | 77 | fields = ['acodec', 'age_limit', 'categories', 'channel', 'channel_follower_count', 'channel_id', 'character_count', 'comment_count', 'date', 'description', 'duration_string', 'language', 'license', 'like_count', 'original_language', 'resolution', 'tags', 'text', 'title', 'transcription_language', 'upload_date', 'vcodec', 'video_id', 'video_link', 'view_count', 'word_count'] 78 | filters = [('original_language', '==', 'en'), ('transcription_language', '==', 'en')] 79 | 80 | folder = snapshot_download("PleIAs/YouTube-Commons", 81 | repo_type='dataset', 82 | local_dir=dataset_path) 83 | 84 | 85 | df = read_filtered_parquet_files(dataset_path, fields, filters=filters) 86 | 87 | print(df.head()) 88 | print(f"Total videos: {len(df)}") 89 | df.to_pickle(output_pkl) -------------------------------------------------------------------------------- /rawdataset/ytdlps3/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | ENV PYTHONDONTWRITEBYTECODE 1 4 | ENV PYTHONUNBUFFERED 1 5 | 6 | # Install required packages 7 | RUN apt-get update && apt-get install -y \ 8 | wget \ 9 | ffmpeg \ 10 | && apt-get clean 11 | 12 | # Install yt-dlp (a fork of youtube-dl with more features and better maintenance) 13 | RUN pip install yt-dlp 
boto3 14 | 15 | # Create a directory for the application 16 | WORKDIR /app 17 | 18 | # Copy the script into the Docker image 19 | COPY download_and_upload.py /app/download_and_upload.py 20 | 21 | # Set the entry point to the script 22 | ENTRYPOINT ["python", "/app/download_and_upload.py"] 23 | -------------------------------------------------------------------------------- /rawdataset/ytdlps3/download_and_upload.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import boto3 4 | from yt_dlp import YoutubeDL 5 | 6 | def download_youtube_video(video_id, output_path): 7 | ydl_opts = { 8 | 'format': 'best', 9 | 'writesubtitles': True, 10 | 'subtitleslangs': ['en'], 11 | 'subtitlesformat': 'vtt', 12 | 'writeinfojson': True, 13 | 'skip_download': False, 14 | 'outtmpl': os.path.join(output_path, f'{video_id}.%(ext)s'), 15 | } 16 | with YoutubeDL(ydl_opts) as ydl: 17 | info_dict = ydl.extract_info(video_id, download=True) 18 | 19 | # Get the correct subtitle file path from the info_dict 20 | subtitle_file_path = None 21 | subtitles = info_dict.get('subtitles') 22 | if subtitles and 'en' in subtitles: 23 | subtitle_data = subtitles['en'][0] # Get the first English subtitle entry 24 | subtitle_file_path = ydl.prepare_filename(info_dict).replace('.mp4', '.en.vtt') 25 | 26 | return info_dict, subtitle_file_path 27 | 28 | def upload_to_s3(local_file_path, s3_bucket, s3_key): 29 | s3_client = boto3.client('s3') 30 | s3_client.upload_file(local_file_path, s3_bucket, s3_key) 31 | 32 | def log_failure(video_id, error_message, s3_bucket, s3_path): 33 | error_file_path = f"/tmp/{video_id}.txt" 34 | with open(error_file_path, 'w') as f: 35 | f.write(error_message) 36 | 37 | # Upload the error file to S3 in the failed/ subfolder 38 | s3_client = boto3.client('s3') 39 | s3_client.upload_file(error_file_path, s3_bucket, f"failed/{video_id}.txt") 40 | 41 | def process_video(video_id, s3_bucket, s3_path): 42 | try: 43 | # Create a temporary directory to store downloaded files 44 | download_path = '/tmp/youtube_downloads' 45 | os.makedirs(download_path, exist_ok=True) 46 | 47 | # Download the video, subtitles (if available), and metadata 48 | info_dict, subtitle_file_path = download_youtube_video(video_id, download_path) 49 | 50 | # Define file paths 51 | video_file = os.path.join(download_path, f'{video_id}.mp4') 52 | metadata_file = os.path.join(download_path, f'{video_id}.info.json') 53 | 54 | # Upload each file to the specified S3 path if it exists 55 | if os.path.exists(video_file): 56 | upload_to_s3(video_file, s3_bucket, os.path.join(s3_path, f'{video_id}.mp4')) 57 | if os.path.exists(metadata_file): 58 | upload_to_s3(metadata_file, s3_bucket, os.path.join(s3_path, f'{video_id}.json')) 59 | if subtitle_file_path and os.path.exists(subtitle_file_path): 60 | upload_to_s3(subtitle_file_path, s3_bucket, os.path.join(s3_path, f'{video_id}.en.vtt')) 61 | 62 | # Cleanup 63 | for file_name in os.listdir(download_path): 64 | os.remove(os.path.join(download_path, file_name)) 65 | 66 | except Exception as e: 67 | error_message = str(e) 68 | log_failure(video_id, error_message, s3_bucket, s3_path) 69 | 70 | def main(video_ids, s3_bucket, s3_path): 71 | for video_id in video_ids: 72 | process_video(video_id, s3_bucket, s3_path) 73 | 74 | if __name__ == "__main__": 75 | if len(sys.argv) < 4: 76 | print("Usage: python download_and_upload.py <s3_bucket> <s3_path> <video_id> [<video_id> ...]") 77 | sys.exit(1) 78 | 79 | s3_bucket = sys.argv[1] 80 | s3_path = sys.argv[2] 81 | video_ids = sys.argv[3:]
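# Example invocation (hypothetical bucket, prefix and video ids -- adjust to your own setup):
#   python download_and_upload.py my-video-bucket videos VIDEO_ID_1 VIDEO_ID_2
# For each id this uploads <s3_path>/<video_id>.mp4, <s3_path>/<video_id>.json and, when available,
# <s3_path>/<video_id>.en.vtt; failures are recorded as failed/<video_id>.txt in the same bucket.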
82 | 83 | main(video_ids, s3_bucket, s3_path) 84 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ![Fine Video](logo.png) 2 | 3 | ## Introduction 4 | 5 | We recently released [FineVideo](https://huggingface.co/spaces/HuggingFaceFV/FineVideo-Explorer), a dataset with 43k+ videos (3.4k hours) annotated with rich descriptions, narrative details, scene splits and QA pairs. 6 | 7 | We could not be more excited about the community's response! If you have not seen FineVideo yet, take a look at it through the [dataset explorer page](https://huggingface.co/spaces/HuggingFaceFV/FineVideo-Explorer). 8 | 9 | ![FineVideo Explorer page](finevideo.gif) 10 | 11 | 12 | If you are interested in more technical details about the pipeline, we invite you to take a look at our [blog post](https://huggingface.co/). 13 | 14 | 15 | ## Content of the repository 16 | 17 | This repository contains the code that we used in FineVideo to gather videos and annotate them. These scripts cover all the steps of the pipeline shown below. 18 | ![Dataset creation pipeline](dataset-creation.png) 19 | 20 | The scripts are grouped in folders, and each folder represents one or more steps of the pipeline: 21 | 22 | ``` 23 | ├── rawdataset 24 | │ ├── filter-yt-commons.py 25 | │ └── ytdlps3 26 | │ ├── Dockerfile 27 | │ └── download_and_upload.py 28 | ├── dynamicfilters 29 | │ ├── videodynamismfiltering 30 | │ │ ├── Dockerfile 31 | │ │ └── check_static.py 32 | │ └── worddensityfiltering.py 33 | ├── videocategorization 34 | │ ├── content_taxonomy.json 35 | │ ├── create_prompts.py 36 | │ ├── launchTGI-Slurm.sh 37 | │ └── tgi_inference_client.py 38 | ├── contentselection 39 | │ ├── content_taxonomy.json 40 | │ └── oracle.py 41 | ├── contentannotation 42 | │ ├── gemini_prompt.txt 43 | │ └── video2annotation.py 44 | ├── finealignment 45 | └── video_alignment.py 46 | 47 | ``` 48 | 49 | Given the amount of content to scan and/or annotate, every part of the pipeline that needs to scale is either packaged as a Docker container that can be launched in a distributed way, or written to split its work queue into chunks so that multiple instances of the same script can run in parallel. 50 | 51 | For example: 52 | * video download `ytdlps3` and video dynamism filtering `videodynamismfiltering` are packaged as Docker containers. 53 | * video id gathering for the raw dataset `filter-yt-commons.py`, content selection `oracle.py` and word density filtering `worddensityfiltering.py` are scripts that process all the content at once. 54 | * content annotation `video2annotation.py`, video categorization `tgi_inference_client.py` & `create_prompts.py` and video-metadata alignment `video_alignment.py` are prepared to process chunks of a queue so that you can launch multiple instances of the same script (see the launcher sketch below).
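For instance, a minimal launcher sketch for the chunked scripts (hypothetical chunk size, worker count and video list file; adapt it to your own scheduler or infrastructure) could look like this:

```
# launch_workers.py - hypothetical helper, not part of the released pipeline.
# Starts several local instances of video2annotation.py, each processing its own chunk.
import subprocess

SIZE_CHUNK = 500    # videos per worker (hypothetical value)
NUM_WORKERS = 4     # number of parallel instances (hypothetical value)

processes = [
    subprocess.Popen([
        "python", "video2annotation.py",
        str(SIZE_CHUNK), str(worker_number),
        "--video_list", "videos_to_process.json",  # hypothetical JSON list of video ids
    ])
    for worker_number in range(NUM_WORKERS)
]

for p in processes:
    p.wait()
```

The same chunking pattern applies to `video_alignment.py`, while `tgi_inference_client.py` parallelizes through its block number argument instead.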
-------------------------------------------------------------------------------- /videocategorization/content_taxonomy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Entertainment": { 3 | "Comedy": { 4 | "Stand-up": {}, 5 | "Sketches": {}, 6 | "Parodies": {} 7 | }, 8 | "Music": { 9 | "Music Videos": {}, 10 | "Covers": {}, 11 | "Remixes": {}, 12 | "Lyric Videos": {} 13 | }, 14 | "Movies & Trailers": { 15 | "Film Trailers": {}, 16 | "Short Films": {}, 17 | "Movie Reviews": {} 18 | }, 19 | "Gaming": { 20 | "Let's Plays": {}, 21 | "Game Reviews": {}, 22 | "Walkthroughs": {}, 23 | "Game Commentary": {} 24 | }, 25 | "Vlogs": { 26 | "Daily Vlogs": {}, 27 | "Travel Vlogs": {}, 28 | "Storytime": {} 29 | }, 30 | "Livestreams": { 31 | "Gaming Livestreams": {}, 32 | "Q&A Sessions": {}, 33 | "Event Livestreams": {} 34 | } 35 | }, 36 | "Education": { 37 | "Tutorials": { 38 | "Software Tutorials": {}, 39 | "DIY & Crafts": {}, 40 | "Cooking Tutorials": {} 41 | }, 42 | "Lectures & Talks": { 43 | "Academic Lectures": {}, 44 | "TED Talks": {}, 45 | "Motivational Talks": {} 46 | }, 47 | "Science & Technology": { 48 | "Science Explainers": {}, 49 | "Tech Reviews": {}, 50 | "Engineering Projects": {} 51 | }, 52 | "Language Learning": { 53 | "Language Lessons": {}, 54 | "Pronunciation Guides": {} 55 | }, 56 | "History & Culture": { 57 | "Documentaries": {}, 58 | "Cultural Explainers": {}, 59 | "Historical Analysis": {} 60 | }, 61 | "Business & Finance": { 62 | "Entrepreneurship": {}, 63 | "Investment Guides": {}, 64 | "Marketing Strategies": {} 65 | } 66 | }, 67 | "Lifestyle": { 68 | "Health & Fitness": { 69 | "Workout Routines": {}, 70 | "Nutrition Guides": {}, 71 | "Mental Health Tips": {} 72 | }, 73 | "Fashion & Beauty": { 74 | "Makeup Tutorials": {}, 75 | "Fashion Hauls": {}, 76 | "Skincare Routines": {} 77 | }, 78 | "Travel": { 79 | "Destination Guides": {}, 80 | "Travel Tips": {}, 81 | "Travel Vlogs": {} 82 | }, 83 | "Food & Cooking": { 84 | "Recipe Videos": {}, 85 | "Cooking Shows": {}, 86 | "Food Reviews": {} 87 | }, 88 | "Home & Garden": { 89 | "Home Improvement": {}, 90 | "Gardening Tips": {}, 91 | "Interior Design": {} 92 | }, 93 | "Parenting & Family": { 94 | "Parenting Tips": {}, 95 | "Family Vlogs": {}, 96 | "Childcare Advice": {} 97 | } 98 | }, 99 | "News & Politics": { 100 | "News Reports": { 101 | "Breaking News": {}, 102 | "Political News": {}, 103 | "World News": {} 104 | }, 105 | "Opinion & Commentary": { 106 | "Political Commentary": {}, 107 | "Social Commentary": {}, 108 | "Editorials": {} 109 | }, 110 | "Interviews": { 111 | "Celebrity Interviews": {}, 112 | "Political Interviews": {}, 113 | "Expert Interviews": {} 114 | }, 115 | "Debates": { 116 | "Political Debates": {}, 117 | "Social Issue Debates": {} 118 | } 119 | }, 120 | "Sports": { 121 | "Highlights & Replays": { 122 | "Game Highlights": {}, 123 | "Match Replays": {} 124 | }, 125 | "Sports Commentary": { 126 | "Analysis Shows": {}, 127 | "Sports Talk Shows": {} 128 | }, 129 | "Athlete Profiles": { 130 | "Career Highlights": {}, 131 | "Documentary Profiles": {} 132 | }, 133 | "Fitness & Training": { 134 | "Athlete Workouts": {}, 135 | "Training Techniques": {} 136 | } 137 | }, 138 | "Art & Creativity": { 139 | "Visual Arts": { 140 | "Painting Tutorials": {}, 141 | "Drawing Tutorials": {}, 142 | "Art Exhibitions": {} 143 | }, 144 | "Photography & Film": { 145 | "Photography Tips": {}, 146 | "Cinematography": {}, 147 | "Short Films": {} 148 | }, 149 | "Crafts & DIY": { 150 | "Home 
Crafts": {}, 151 | "DIY Projects": {}, 152 | "Upcycling": {} 153 | }, 154 | "Writing & Literature": { 155 | "Writing Tips": {}, 156 | "Book Reviews": {}, 157 | "Poetry Readings": {} 158 | } 159 | }, 160 | "Science & Technology": { 161 | "Tech Reviews": { 162 | "Gadget Reviews": {}, 163 | "Software Reviews": {} 164 | }, 165 | "Science Explainers": { 166 | "Physics": {}, 167 | "Biology": {}, 168 | "Chemistry": {} 169 | }, 170 | "Space Exploration": { 171 | "Astronomy": {}, 172 | "Space Missions": {} 173 | }, 174 | "Engineering": { 175 | "Mechanical Engineering": {}, 176 | "Electrical Engineering": {} 177 | }, 178 | "Environmental Science": { 179 | "Climate Change": {}, 180 | "Conservation Efforts": {} 181 | }, 182 | "Artificial Intelligence": { 183 | "AI Concepts": {}, 184 | "Machine Learning Tutorials": {} 185 | } 186 | }, 187 | "Automotive": { 188 | "Car Reviews": {}, 189 | "Car Modifications": {}, 190 | "Driving Tutorials": {}, 191 | "Motorsports": { 192 | "Racing Highlights": {}, 193 | "Motorsport Commentary": {} 194 | }, 195 | "Off-Roading": {} 196 | }, 197 | "Hobbies & Interests": { 198 | "Collecting": { 199 | "Toy Collections": {}, 200 | "Stamp Collections": {}, 201 | "Memorabilia": {} 202 | }, 203 | "Board Games & Puzzles": { 204 | "Gameplay Tutorials": {}, 205 | "Game Reviews": {} 206 | }, 207 | "Outdoor Activities": { 208 | "Camping": {}, 209 | "Hiking": {}, 210 | "Fishing": {} 211 | }, 212 | "Arts & Crafts": { 213 | "Knitting": {}, 214 | "Pottery": {}, 215 | "Scrapbooking": {} 216 | }, 217 | "Listicles & Rankings": { 218 | "Top 10 Videos": {}, 219 | "Best of [Category]": {}, 220 | "Ranked Lists": {}, 221 | "Must-See [Topic]": {}, 222 | "Buyer’s Guides": {} 223 | }, 224 | "Miscellaneous": { 225 | "ASMR": {}, 226 | "Unboxing Videos": {}, 227 | "Reaction Videos": {}, 228 | "Pranks": {}, 229 | "Social Experiments": {} 230 | } 231 | } 232 | } 233 | -------------------------------------------------------------------------------- /videocategorization/create_prompts.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pandas as pd 4 | from concurrent.futures import ThreadPoolExecutor 5 | 6 | 7 | # 8 | # Given a pandas dataframe with a list of videos, this script will generate custom prompts for your videos and by default store 9 | # them in a subfolder 'prompts' 10 | # 11 | 12 | ### CONFIG ### 13 | df_path = 'current_videos.pkl' 14 | ### 15 | 16 | 17 | 18 | # prompt_template = """ 19 | # Given those categories: {leaves} 20 | # Classify a youtube video given its closed captioning and some metadata details. RETURN ONLY the selected category and nothing else! 21 | # Title: {title} 22 | # Description: {description} 23 | # Categories: {categories} 24 | # Tags: {tags} 25 | # Channel: {channel} 26 | # Closed Caption: {closed_caption} 27 | # """ 28 | prompt_template = """ 29 | Given those categories: {leaves} 30 | Classify a youtube video given its closed captioning and some metadata details. RETURN ONLY the selected category and nothing else! 
31 | Title: {title} 32 | Description: {description} 33 | Channel: {channel} 34 | Closed Caption: {closed_caption} 35 | """ 36 | 37 | def get_leaves(taxonomy): 38 | leaves = [] 39 | for key, value in taxonomy.items(): 40 | if isinstance(value, dict) and value: # If it's a non-empty dictionary 41 | leaves.extend(get_leaves(value)) 42 | else: # If it's an empty dictionary, consider it as a leaf 43 | if not value: # Check if the value is an empty dictionary 44 | leaves.append(key) 45 | return leaves 46 | 47 | def generate_prompt(row, leaves): 48 | return prompt_template.format( 49 | leaves=json.dumps(leaves, indent=2), 50 | title=row['title'], 51 | description=row['description'], 52 | # categories=row['categories'], 53 | tags=row['tags'], 54 | channel=row['channel'], 55 | closed_caption=row['text'][:5000] # Trim closed captions 56 | ) 57 | 58 | def save_prompts_to_file(prompts, output_file): 59 | """Save prompts to the output JSON file, overwriting it.""" 60 | with open(output_file, 'w', encoding='utf-8') as file: 61 | json.dump(prompts, file, indent=4, ensure_ascii=False) 62 | 63 | def process_row(row, leaves): 64 | video_id = row['video_id'] 65 | 66 | # Generate the prompt 67 | prompt = generate_prompt(row, leaves) 68 | return {"video_id": video_id, "prompt": prompt} 69 | 70 | def generate_prompts_and_save(df_path, output_dir='prompts', max_workers=None, chunksize=1000): 71 | # Ensure the output directory exists 72 | os.makedirs(output_dir, exist_ok=True) 73 | 74 | # Load the taxonomy content 75 | with open('content_taxonomy.json', 'r') as file: 76 | taxonomy_content = json.load(file) 77 | 78 | leaves = get_leaves(taxonomy_content) 79 | 80 | # Load the entire DataFrame first (ensure this fits in memory) 81 | df = pd.read_pickle(df_path) 82 | 83 | # Process in chunks 84 | chunk_index = 0 85 | for start in range(0, len(df), chunksize): 86 | chunk = df.iloc[start:start + chunksize] 87 | prompts = [] 88 | 89 | # Use ThreadPoolExecutor for file I/O-bound operations 90 | with ThreadPoolExecutor(max_workers=max_workers) as executor: 91 | results = executor.map( 92 | process_row, 93 | (row for _, row in chunk.iterrows()), 94 | [leaves] * len(chunk) 95 | ) 96 | 97 | # Collect results and filter out None results 98 | results = [result for result in results if result is not None] 99 | 100 | # Save results to file in chunks 101 | if results: 102 | chunk_file = os.path.join(output_dir, f'prompts_{chunk_index}.json') 103 | save_prompts_to_file(results, chunk_file) 104 | print(f"Saved chunk {chunk_index} to {chunk_file}") 105 | chunk_index += 1 106 | 107 | print("Completed processing.") 108 | 109 | # Specify the number of workers, for example, 8 110 | generate_prompts_and_save(df_path, max_workers=8) 111 | -------------------------------------------------------------------------------- /videocategorization/launchTGI-Slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=tgi-tests 3 | #SBATCH --partition hopper-prod 4 | #SBATCH --gpus=8 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --mem-per-cpu=11G 7 | #SBATCH -o slurm/logs/%x_%j.out 8 | #SBATCH --qos=high 9 | 10 | export HF_TOKEN=XXXXX 11 | export PORT=1456 12 | srun --container-image='ghcr.io#huggingface/text-generation-inference' \ 13 | --container-env=HUGGING_FACE_HUB_TOKEN,PORT \ 14 | --container-mounts="/scratch:/data" \ 15 | --container-workdir='/usr/src' \ 16 | --no-container-mount-home \ 17 | --qos normal \ 18 | --gpus=8 \ 19 | /usr/local/bin/text-generation-launcher
--model-id meta-llama/Meta-Llama-3.1-70B-Instruct -------------------------------------------------------------------------------- /videocategorization/tgi_inference_client.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import requests 4 | from tqdm import tqdm 5 | from transformers import AutoTokenizer 6 | import re 7 | import sys 8 | from math import ceil 9 | 10 | # 11 | # This script will run the defined prompts against one or more TGI services. 12 | # The prompts are stored in chunks in a folder called prompts/ 13 | 14 | # The script is called with 3 parameters: 15 | # python tgi_inference_client.py <server_address> <port> <block_number> 16 | # block_number is a number between 0 and 3 (both inclusive). Those blocks are 4 subdivisions of the prompts in prompts/ 17 | # and by specifying the block number we run inference on a different block, which allows us to parallelize inference. 18 | # 19 | 20 | 21 | 22 | 23 | # Ensure the output directory exists 24 | os.makedirs("processed", exist_ok=True) 25 | 26 | # Function to load prompts from a single JSON file 27 | def load_prompts_from_file(file_path): 28 | with open(file_path, "r", encoding="utf-8") as file: 29 | tasks = json.load(file) 30 | return tasks 31 | 32 | # Function to process a single file's tasks and save results 33 | def process_file(file_path, tokenizer, endpoint_url): 34 | # Load tasks from the current file 35 | tasks = load_prompts_from_file(file_path) 36 | results = [] 37 | 38 | # Headers for the HTTP request 39 | headers = { 40 | "Content-Type": "application/json", 41 | } 42 | 43 | # Process each task 44 | for task in tqdm(tasks, desc="Processing tasks"): 45 | video_id = task['video_id'] 46 | input_text = task['prompt'] 47 | input_text = input_text.replace("Given those categories:", "Given this taxonomy:") 48 | pattern = r"Categories: \[.*?\]\n?" 49 | input_text = re.sub(pattern, '', input_text) 50 | pattern = r"Tags: \[.*?\]\n?" 51 | input_text = re.sub(pattern, '', input_text) 52 | pattern = r"Description: \[.*?\]\n?"
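# (Together with the Categories and Tags patterns above, this strips bracketed metadata lines from the
#  prompt so that classification relies mainly on the title, channel and closed captions.)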
53 | input_text = re.sub(pattern, '', input_text) 54 | input_text = input_text + "RETURN A CATEGORY FROM THE TAXONOMY PROVIDED: " 55 | 56 | prompt_tokens = tokenizer.apply_chat_template( 57 | [ 58 | {"role": "user", "content": input_text}, 59 | ], 60 | tokenize=False, 61 | add_generation_prompt=True 62 | ) 63 | 64 | # Prepare the data for the request 65 | data = { 66 | "inputs": prompt_tokens, 67 | "parameters": { 68 | "max_new_tokens": 20, # Adjust as needed 69 | }, 70 | } 71 | 72 | # Make a synchronous request to the model endpoint 73 | response = requests.post(endpoint_url, headers=headers, json=data) 74 | if response.status_code == 200: 75 | response_data = response.json() 76 | completion = response_data.get('generated_text', '') 77 | else: 78 | completion = "Error: Unable to get response" 79 | 80 | # Append the result 81 | results.append({"video_id": video_id, "completion": completion}) 82 | 83 | # Save results to file after processing all tasks in the file 84 | output_filename = os.path.splitext(os.path.basename(file_path))[0] 85 | with open(f"processed/{output_filename}_results.json", "w", encoding="utf-8") as f: 86 | json.dump(results, f, ensure_ascii=False, indent=4) 87 | 88 | # Main function to process a subset of files 89 | def main(): 90 | # Get server address, port, and block number from command-line arguments 91 | if len(sys.argv) != 4: 92 | print("Usage: python tgi_inference_client.py <server_address> <port> <block_number>") 93 | sys.exit(1) 94 | 95 | server_address = sys.argv[1] 96 | port = sys.argv[2] 97 | block_number = int(sys.argv[3]) 98 | 99 | # Validate block number 100 | if block_number < 0 or block_number > 3: 101 | print("Error: block_number must be between 0 and 3.") 102 | sys.exit(1) 103 | 104 | # Construct endpoint URL 105 | endpoint_url = f"http://{server_address}:{port}/generate" 106 | 107 | # Initialize tokenizer 108 | tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-70B-Instruct") 109 | 110 | # List all JSON files in the prompts directory 111 | files = [f for f in os.listdir("prompts") if f.endswith(".json")] 112 | 113 | # Sort files to ensure consistent partitioning 114 | files.sort() 115 | 116 | # Divide files into 4 blocks 117 | total_files = len(files) 118 | block_size = ceil(total_files / 4) 119 | 120 | # Determine start and end indices for the current block 121 | start_index = block_number * block_size 122 | end_index = min(start_index + block_size, total_files) 123 | 124 | # Process only the files in the current block 125 | for i in range(start_index, end_index): 126 | file_path = os.path.join("prompts", files[i]) 127 | process_file(file_path, tokenizer, endpoint_url) 128 | 129 | # Run the main function 130 | if __name__ == "__main__": 131 | main() --------------------------------------------------------------------------------