221 | ```
222 |
223 | ### Arguments
224 |
225 | - **files** (required):
226 | One or more files to process.
227 |
228 | - **tool** (required):
229 | The tool to use. Available options:
230 | - `base` – to generate a base video.
231 | - `add_titles` – to add titles to the video.
232 |
233 | ### Example
234 | ```bash
235 | python main.py generator video1.mp4 base
236 | ```
237 | *This command uses the `base` tool on `video1.mp4` to generate a base video.*
238 |
239 | ---
240 |
241 | ## General Help
242 |
243 | To display the help information for the CLI tool or a specific subcommand, use the `--help` flag. For example:
244 | ```bash
245 | python main.py --help
246 | python main.py video_edit --help
247 | ```
248 |
249 | This will display all available options and arguments for that command.
250 |
251 |
252 | ## Project Structure
253 |
254 | - **config_loader.py:** Loads configuration from `config.json` and makes it available throughout the project.
255 | - **main.py:** The central entry point that defines and handles multiple subcommands for video processing.
256 | - **automatic_short_generator.py:** A script to generate short videos using predefined tools.
257 | - **get_data.py:** A utility to traverse directories and concatenate files.
258 | - **utils/**
259 | - **utils.py:** Contains helper functions (e.g., converting strings to booleans, audio extraction, video metadata extraction).
260 | - **operations/**
261 | - **save.py:** Functions to save edited or joined video clips.
262 | - **set_orientation.py:** Adjusts video orientation (vertical/horizontal).
263 | - **subtitles.py:** Adds subtitles to videos.
264 | - **shorts.py:** Generates base videos with effects (e.g., blurred background) and adds title clips.
265 | - **transcript.py:** Generates transcripts using the Whisper model.
266 | - **trim.py:** Implements silence detection and video trimming.
267 | - **translation.py:** Handles video translation and audio generation.
268 | - **denoise.py:** Applies denoising filters using deep learning models.
269 |
270 | ## Configuration
271 |
272 | The toolkit uses a JSON configuration file (`config.json`) to define parameters such as:
273 | - Subtitle and title clip settings (e.g., font, size, position).
274 | - Other customizable options for processing operations.
275 |
276 | Adjust these settings according to your needs before running any commands.
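
As a rough illustration (the key names and values below are assumptions, not the toolkit's actual schema — check `config_loader.py` and your existing `config.json` for the real structure), a minimal configuration file could be generated like this:
```python
import json

# Illustrative sketch only: these keys and values are assumptions,
# not the toolkit's actual schema.
example_config = {
    "subtitles": {"font": "Arial", "fontsize": 48, "position": "bottom"},
    "title": {"font": "Arial-Bold", "fontsize": 72, "position": "center"},
}

with open("config.json", "w", encoding="utf-8") as f:
    json.dump(example_config, f, ensure_ascii=False, indent=2)
```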
277 |
278 | ## Contributing
279 |
280 | Contributions are welcome! If you have suggestions or improvements, feel free to open an issue or submit a pull request.
281 |
282 | ## License
283 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
284 |
285 |
286 |
287 |
288 | Let's connect 😋
--------------------------------------------------------------------------------
/operations/avatar_video_generation.py:
--------------------------------------------------------------------------------
1 | """
2 | Module for video generation with an avatar from audio
3 | """
4 |
5 | import logging
6 | import os
7 | import json
8 | from pathlib import Path
9 | from typing import Any, Dict, List, Optional, Tuple
10 |
11 | from concurrent.futures import ThreadPoolExecutor, as_completed
12 |
13 | from dotenv import load_dotenv
14 | import numpy as np
15 |
16 | from openai import OpenAI
17 | from faster_whisper import WhisperModel
18 | from pydub import AudioSegment
19 | from moviepy.editor import VideoFileClip, AudioFileClip, CompositeVideoClip
20 |
21 | from utils import apply_shake, get_subclip_volume_segment
22 |
23 |
24 | CACHE_SUFFIX = "_segments.json"
25 | DEFAULT_FPS = 24 # fallback framerate if clip.fps is missing
26 |
27 | load_dotenv()
28 | OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4.1")  # OpenAI model IDs are lowercase
29 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
30 | OPENAI_API_BASE = os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1")
31 | WHISPER_MODEL_SIZE = os.getenv("WHISPER_MODEL_SIZE", "large-v3") # can be adjusted
32 |
33 | # Validate essential environment variables early
34 | if not OPENAI_API_KEY:
35 | raise RuntimeError("Missing OPENAI_API_KEY in environment variables.")
36 |
37 | # Initialize OpenAI client
38 | _client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_API_BASE)
39 |
40 | # Set up logging
41 | logging.basicConfig(
42 | level=logging.INFO,
43 | format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
44 | datefmt="%Y-%m-%d %H:%M:%S",
45 | )
46 | logger = logging.getLogger(__name__)
47 |
48 |
49 | class SegmentData:
50 | """
51 | Simple container for a single transcript segment's metadata.
52 | """
53 |
54 | def __init__(self, start: float, end: float, emotion: str, volume: float):
55 | self.start = start
56 | self.end = end
57 | self.emotion = emotion
58 | self.volume = volume
59 |
60 | def to_dict(self) -> Dict[str, Any]:
61 | """
62 | Transform SegmentData to Dict
63 | """
64 | return {
65 | "start": self.start,
66 | "end": self.end,
67 | "emotion": self.emotion,
68 | "volume": self.volume,
69 | }
70 |
71 | @staticmethod
72 | def from_dict(data: Dict[str, Any]) -> "SegmentData":
73 | """
74 | Transform dict to SegmentData
75 | """
76 | return SegmentData(
77 | start=data["start"],
78 | end=data["end"],
79 | emotion=data["emotion"],
80 | volume=data["volume"],
81 | )
82 |
83 |
84 | def build_emotion_system_prompt(emotion_keys: List[str]) -> str:
85 | """
86 | Construct the system prompt for ChatGPT to classify emotions.
87 | """
88 | labels = ", ".join(emotion_keys)
89 | return (
90 | "You are an emotion classifier. "
91 | "Given a short phrase in any language, reply with exactly one of the following labels: "
92 | f"{labels}. "
93 | "Respond with just the label, no extra text. Try to be expressive."
94 | )
95 |
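# For example, build_emotion_system_prompt(["happy", "sad", "angry"]) returns:
#   "You are an emotion classifier. Given a short phrase in any language, reply
#    with exactly one of the following labels: happy, sad, angry. Respond with
#    just the label, no extra text. Try to be expressive."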
96 |
97 | def classify_emotion(text: str, emotion_map: Dict[str, str]) -> str:
98 | """
99 | Use ChatGPT to classify the given text into one of the keys in emotion_map.
100 | If anything goes wrong or the returned label is unexpected, fallback to the first emotion key.
101 |
102 | Args:
103 | text (str): The text segment to classify.
104 | emotion_map (Dict[str, str]): Mapping from emotion label -> avatar path.
105 |
106 | Returns:
107 | str: One of the keys from emotion_map (lowercased match).
108 | """
109 | emotion_keys = list(emotion_map.keys())
110 | default_emotion = emotion_keys[0]
111 |
112 | prompt = build_emotion_system_prompt(emotion_keys)
113 | logger.debug("Emotion classification prompt: %s", prompt)
114 | logger.debug("User text for classification: %s", text)
115 |
116 | try:
117 | response = _client.chat.completions.create(
118 | model=OPENAI_MODEL,
119 | messages=[
120 | {"role": "system", "content": prompt},
121 | {"role": "user", "content": text},
122 | ],
123 | )
124 | raw_label = response.choices[0].message.content.strip().lower()
125 | logger.debug("Raw emotion label from GPT: %s", raw_label)
126 | # Attempt to match one of the known keys
127 | for key in emotion_keys:
128 | if key.lower() in raw_label:
129 | logger.info("Classified emotion '%s' for text segment.", key)
130 | return key
131 | # No direct match: fallback
132 | logger.warning(
133 | "Unexpected label '%s'. Falling back to default '%s'.",
134 | raw_label,
135 | default_emotion,
136 | )
137 | return default_emotion
138 |
139 | except Exception as e:
140 | logger.error(
141 | "Error calling ChatGPT for emotion classification: %s. Using default '%s'.",
142 | e,
143 | default_emotion,
144 | )
145 | return default_emotion
146 |
147 |
148 | def compute_segment_volume(audio: AudioSegment, start: float, end: float) -> float:
149 | """
150 | Compute the average loudness/volume of a subclip using pydub.
151 | Delegates to get_subclip_volume_segment helper.
152 |
153 | Args:
154 | audio (AudioSegment): Full audio loaded via pydub.
155 | start (float): Start time in seconds.
156 | end (float): End time in seconds.
157 |
158 | Returns:
159 | float: A volume metric (higher means louder).
160 | """
161 | duration = end - start
162 | try:
163 | volume_value = get_subclip_volume_segment(audio, start, duration)
164 | logger.debug(
165 | "Computed volume %.4f for segment [%.2f, %.2f].", volume_value, start, end
166 | )
167 | return volume_value
168 | except Exception as e:
169 | logger.error(
170 | "Error computing volume for segment [%.2f, %.2f]: %s. Defaulting to 0.0.",
171 | start,
172 | end,
173 | e,
174 | )
175 | return 0.0
176 |
177 |
178 | def process_transcript_segment(
179 | seg: Any, pydub_audio: AudioSegment, emotion_map: Dict[str, str]
180 | ) -> SegmentData:
181 | """
182 | Given a Whisper transcript segment (with .start, .end, .text),
183 | classify emotion and measure volume.
184 |
185 | Args:
186 | seg (Any): A segment object returned by Whisper, expected to have .start, .end, .text.
187 | pydub_audio (AudioSegment): The full audio loaded so we can measure volume.
188 | emotion_map (Dict[str, str]): Mapping of emotion label -> avatar file path.
189 |
190 | Returns:
191 | SegmentData: A container with start, end, chosen emotion, and volume.
192 | """
193 | start = seg.start
194 | end = seg.end
195 | text = seg.text.strip()
196 |
197 | logger.debug("Processing segment from %.2f to %.2f: '%s'.", start, end, text)
198 |
199 | # Classify the emotion using ChatGPT
200 | chosen_emotion = classify_emotion(text, emotion_map)
201 |
202 | # Measure volume for this segment
203 | volume = compute_segment_volume(pydub_audio, start, end)
204 |
205 | logger.info(
206 | "Segment [%.2f-%.2f] | Text: '%s' | Emotion: '%s' | Volume: %.4f",
207 | start,
208 | end,
209 | text,
210 | chosen_emotion,
211 | volume,
212 | )
213 |
214 | return SegmentData(start=start, end=end, emotion=chosen_emotion, volume=volume)
215 |
216 |
217 | def get_cache_path(audio_path: Path) -> Path:
218 | """
219 | Given an audio file path, return the corresponding JSON cache path.
220 | """
221 | return audio_path.with_name(audio_path.stem + CACHE_SUFFIX)
222 |
223 |
224 | def load_cached_segments(cache_path: Path) -> Optional[Tuple[List[SegmentData], float]]:
225 | """
226 | If the cache JSON exists, load and return the segments list and global average volume.
227 |
228 | Returns:
229 | Tuple[List[SegmentData], float] or None if cache is missing or invalid.
230 | """
231 | if not cache_path.exists():
232 | logger.info("No cache file found at '%s'. Will generate segments.", cache_path)
233 | return None
234 |
235 | try:
236 | with cache_path.open("r", encoding="utf-8") as f:
237 | data = json.load(f)
238 | raw_segments = data.get("segments", [])
239 | avg_volume = float(data.get("global_avg_volume", 0.0))
240 | segments = [SegmentData.from_dict(item) for item in raw_segments]
241 | logger.info(
242 | "Loaded %d segments and global_avg_volume=%.4f from cache.",
243 | len(segments),
244 | avg_volume,
245 | )
246 | return segments, avg_volume
247 | except Exception as e:
248 | logger.error(
249 | "Failed to load cache from '%s': %s. Ignoring cache.", cache_path, e
250 | )
251 | return None
252 |
253 |
254 | def save_cached_segments(
255 | cache_path: Path, segments: List[SegmentData], global_avg_volume: float
256 | ) -> None:
257 | """
258 | Save the list of segments (converted to dicts) and global_avg_volume to the JSON cache.
259 |
260 | Args:
261 | cache_path (Path): Where to write the cache file.
262 | segments (List[SegmentData]): The computed segments data.
263 | global_avg_volume (float): The average volume across segments.
264 | """
265 | try:
266 | cache_data = {
267 | "segments": [seg.to_dict() for seg in segments],
268 | "global_avg_volume": global_avg_volume,
269 | }
270 | with cache_path.open("w", encoding="utf-8") as f:
271 | json.dump(cache_data, f, ensure_ascii=False, indent=2)
272 | logger.info("Saved segments to cache at '%s'.", cache_path)
273 | except Exception as e:
274 | logger.error("Failed to save cache to '%s': %s", cache_path, e)
275 |
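# Illustrative cache layout written by save_cached_segments (emotion labels and
# numeric values below are made up; the keys mirror SegmentData.to_dict()):
# {
#   "segments": [
#     {"start": 0.0, "end": 2.4, "emotion": "happy", "volume": 0.0312},
#     {"start": 2.4, "end": 5.1, "emotion": "neutral", "volume": 0.0275}
#   ],
#   "global_avg_volume": 0.0294
# }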
276 |
277 | def transcribe_audio_whisper(audio_path: Path, model_size: str) -> List[Any]:
278 | """
279 | Use faster_whisper.WhisperModel to transcribe the audio file into a list of segments.
280 |
281 | Args:
282 | audio_path (Path): Path to the audio or video file.
283 | model_size (str): Model size for Whisper (e.g. "tiny", "base", "small", etc.).
284 |
285 | Returns:
286 | List[Any]: A list of transcript segment objects (each having .start, .end, .text).
287 | """
288 | logger.info("Loading Whisper model (size='%s') for transcription...", model_size)
289 | whisper_model = WhisperModel(model_size, num_workers=4, compute_type="int8")
290 |     try:
291 |         segments, _ = whisper_model.transcribe(str(audio_path), multilingual=True)
292 |         segments = list(segments)  # materialize the lazy generator before the model is freed
293 |         logger.info("Transcription complete. Obtained %d segments.", len(segments))
294 |         return segments
295 |     finally:
296 |         del whisper_model  # ensure WhisperModel resources are freed immediately
297 |
298 |
299 | def classify_and_measure_all(
300 | transcript_segments: List[Any],
301 | pydub_audio: AudioSegment,
302 | emotion_map: Dict[str, str],
303 | max_workers: Optional[int] = None,
304 | ) -> List[SegmentData]:
305 | """
306 | In parallel, classify emotion and measure volume for each Whisper transcript segment.
307 |
308 | Args:
309 | transcript_segments (List[Any]): List of Whisper transcript objects.
310 | pydub_audio (AudioSegment): Full audio for volume computation.
311 | emotion_map (Dict[str, str]): Mapping from emotion key -> avatar path.
312 | max_workers (Optional[int]): Number of threads for parallel execution.
313 |
314 | Returns:
315 | List[SegmentData]: Ordered list of computed SegmentData.
316 | """
317 |     logger.info("Starting parallel processing of transcript segments...")
318 | segments: List[SegmentData] = []
319 |
320 | with ThreadPoolExecutor(max_workers=max_workers) as executor:
321 | futures = {
322 | executor.submit(
323 | process_transcript_segment, seg, pydub_audio, emotion_map
324 | ): seg
325 | for seg in transcript_segments
326 | }
327 | for future in as_completed(futures):
328 | try:
329 | seg_data = future.result()
330 | segments.append(seg_data)
331 | except Exception as e:
332 | # If one segment fails, log the error but continue
333 | seg_obj = futures[future]
334 | logger.error(
335 | "Segment [%.2f-%.2f] processing failed: %s",
336 | seg_obj.start,
337 | seg_obj.end,
338 | e,
339 | )
340 |
341 | # Sort by start time, just in case
342 | segments.sort(key=lambda s: s.start)
343 | logger.info("Completed classification & volume measurement for all segments.")
344 | return segments
345 |
346 |
347 | def append_tail_segment_if_needed(
348 | segments: List[SegmentData],
349 | total_duration: float,
350 | default_emotion: str,
351 | global_avg_volume: float,
352 | ) -> List[SegmentData]:
353 | """
354 | If there is a gap at the end of the audio not covered by any segment,
355 | append a "tail" segment from the last segment end to total_duration,
356 | using default_emotion and the global average volume.
357 |
358 | Args:
359 | segments (List[SegmentData]): Current list of processed segments (sorted).
360 | total_duration (float): Total length of the audio in seconds.
361 | default_emotion (str): The fallback emotion key.
362 | global_avg_volume (float): Average volume across all existing segments.
363 |
364 | Returns:
365 | List[SegmentData]: The new list with an extra tail segment if needed.
366 | """
367 | if not segments:
368 | # No segments at all: create a single segment from 0 to total_duration
369 | logger.warning(
370 | "No transcript segments found. Generating single tail segment "
371 | "from 0 to %.2f with emotion '%s'.",
372 | total_duration,
373 | default_emotion,
374 | )
375 | return [
376 | SegmentData(
377 | start=0.0,
378 | end=total_duration,
379 | emotion=default_emotion,
380 | volume=global_avg_volume,
381 | )
382 | ]
383 |
384 | last_end = segments[-1].end
385 | if last_end < total_duration:
386 | logger.info(
387 | "Audio extends from %.2f to %.2f beyond last segment end. "
388 | "Adding tail segment with default emotion '%s'.",
389 | last_end,
390 | total_duration,
391 | default_emotion,
392 | )
393 | tail_segment = SegmentData(
394 | start=last_end,
395 | end=total_duration,
396 | emotion=default_emotion,
397 | volume=global_avg_volume,
398 | )
399 | return segments + [tail_segment]
400 |
401 | logger.debug("No tail segment needed; segments already cover full audio.")
402 | return segments
403 |
404 |
405 | def generate_segment_data(
406 | audio_path: Path,
407 | emotion_map: Dict[str, str],
408 | max_workers: Optional[int] = None,
409 | ) -> Tuple[List[SegmentData], float]:
410 | """
411 | Main orchestration function: generate (or load from cache) the list of SegmentData
412 | dictionaries, each containing start, end, emotion, and volume.
413 | Also return the global average volume.
414 |
415 | Steps:
416 | 1. Check if cache exists. If so, load and return cached data.
417 | 2. Otherwise:
418 | a. Load audio via pydub for volume measurement.
419 | b. Transcribe via WhisperModel.
420 | c. In parallel, process each segment to classify emotion and measure volume.
421 | d. Compute global average volume.
422 | e. Append a tail segment if total segment durations < full audio duration.
423 | f. Save everything to cache JSON and return.
424 |
425 | Args:
426 | audio_path (Path): Path to the audio file (or video file with audio).
427 | emotion_map (Dict[str, str]): Mapping from emotion key -> avatar path.
428 | max_workers (Optional[int]): Number of parallel threads.
429 |
430 | Returns:
431 | Tuple[List[SegmentData], float]: (List of SegmentData, global average volume).
432 | """
433 | cache_path = get_cache_path(audio_path)
434 | cached = load_cached_segments(cache_path)
435 | if cached:
436 | return cached # (segments, global_avg_volume)
437 |
438 | # 2.a. Load full audio via pydub for volume measurement
439 | logger.info("Loading full audio via pydub from '%s'...", audio_path)
440 | try:
441 | pydub_audio = AudioSegment.from_file(str(audio_path))
442 | except Exception as e:
443 | logger.error(
444 | "Failed to load audio with pydub: %s. Aborting segment generation.", e
445 | )
446 | return [], 0.0
447 |
448 | # 2.b. Transcribe via Whisper
449 | transcript_segments = transcribe_audio_whisper(audio_path, WHISPER_MODEL_SIZE)
450 |
451 | # 2.c. Parallel classification + volume
452 | segments = classify_and_measure_all(
453 | transcript_segments, pydub_audio, emotion_map, max_workers
454 | )
455 |
456 | # 2.d. Compute global average volume
457 | volumes = [seg.volume for seg in segments]
458 | global_avg_volume = float(np.mean(volumes)) if volumes else 0.0
459 | logger.info("Global average volume computed: %.4f", global_avg_volume)
460 |
461 | # 2.e. If the transcription times do not cover the entire audio, append a tail
462 | # First we need total audio duration; we can get it from pydub_audio.duration_seconds
463 | total_duration = pydub_audio.duration_seconds
464 | default_emotion = list(emotion_map.keys())[0]
465 | segments = append_tail_segment_if_needed(
466 | segments, total_duration, default_emotion, global_avg_volume
467 | )
468 |
469 | # 2.f. Save to cache
470 | save_cached_segments(cache_path, segments, global_avg_volume)
471 |
472 | return segments, global_avg_volume
473 |
474 |
475 | def load_avatar_clips(avatar_map: Dict[str, str]) -> Dict[str, VideoFileClip]:
476 | """
477 | Given a mapping from emotion key -> avatar file path, load each avatar as a
478 | VideoFileClip (without audio).
479 | If a path does not exist, log a warning and skip that key.
480 |
481 | Args:
482 | avatar_map (Dict[str, str]): Mapping of emotion key -> avatar file path.
483 |
484 | Returns:
485 | Dict[str, VideoFileClip]: Only keys whose path existed and loaded successfully.
486 | """
487 | loaded_clips: Dict[str, VideoFileClip] = {}
488 | for emotion, path_str in avatar_map.items():
489 | path_obj = Path(path_str)
490 | if not path_obj.exists():
491 | logger.warning(
492 | "Avatar file for emotion '%s' not found at '%s'. Skipping.",
493 | emotion,
494 | path_str,
495 | )
496 | continue
497 | try:
498 | clip = VideoFileClip(str(path_obj)).without_audio()
499 | loaded_clips[emotion] = clip
500 | logger.info(
501 | "Preloaded avatar clip for emotion '%s' from '%s'.", emotion, path_str
502 | )
503 | except Exception as e:
504 | logger.error(
505 | "Failed to load avatar '%s' at '%s': %s. Skipping.",
506 | emotion,
507 | path_str,
508 | e,
509 | )
510 | return loaded_clips
511 |
512 |
513 | def build_avatar_subclips(
514 | segments: List[SegmentData],
515 | default_clip: VideoFileClip,
516 | preloaded_clips: Dict[str, VideoFileClip],
517 | global_avg_volume: float,
518 | shake_factor: float,
519 | ) -> List[VideoFileClip]:
520 | """
521 | For each segment, create a looped (and shaken) avatar subclip at the correct timestamp.
522 | Also fill any gaps with the default avatar loop.
523 |
524 | Args:
525 | segments (List[SegmentData]): Sorted list of segment data.
526 | default_clip (VideoFileClip): The fallback avatar clip (first emotion).
527 | preloaded_clips (Dict[str, VideoFileClip]): Mapping of emotion key -> VideoFileClip.
528 | global_avg_volume (float): Average volume across all segments.
529 | shake_factor (float): Factor controlling shake intensity relative to volume.
530 |
531 | Returns:
532 | List[VideoFileClip]: All prepared subclips positioned in time.
533 | """
534 | subclips: List[VideoFileClip] = []
535 | prev_end = 0.0
536 |
537 | # Precompute default_fps and store it
538 | default_fps = getattr(default_clip, "fps", DEFAULT_FPS) or DEFAULT_FPS
539 |
540 | for seg in segments:
541 | start, end, emotion, volume = seg.start, seg.end, seg.emotion, seg.volume
542 | duration = end - start
543 |
544 | # 1) If there is a gap between prev_end and this segment's start, fill with default avatar
545 | if start > prev_end:
546 | gap_duration = start - prev_end
547 | logger.debug(
548 | "Filling gap [%.2f-%.2f] with default avatar clip.", prev_end, start
549 | )
550 | looped_default = (
551 | default_clip.loop(duration=gap_duration)
552 | .set_duration(gap_duration)
553 | .set_fps(default_fps)
554 | .set_start(prev_end)
555 | )
556 | subclips.append(looped_default)
557 |
558 | # 2) For this segment, pick the correct avatar (or fallback to default if missing)
559 | if emotion not in preloaded_clips:
560 | logger.warning(
561 | "No preloaded avatar for emotion '%s'. Using default instead.", emotion
562 | )
563 | base_clip = default_clip
564 | else:
565 | base_clip = preloaded_clips[emotion]
566 |
567 | # Precompute fps for this base clip
568 | base_fps = getattr(base_clip, "fps", DEFAULT_FPS) or DEFAULT_FPS
569 |
570 | # Loop the avatar clip to exactly match segment duration
571 | avatar_loop = (
572 | base_clip.loop(duration=duration).set_duration(duration).set_fps(base_fps)
573 | )
574 |
575 | # Compute shake intensity (0 if global_avg_volume is zero)
576 | if global_avg_volume > 0:
577 | intensity = (volume / global_avg_volume) * shake_factor
578 | else:
579 | intensity = 0.0
580 |
581 | logger.debug(
582 | "Applying shake to emotion '%s' clip. Volume=%.4f, GlobalAvg=%.4f, ShakeIntensity=%.4f",
583 | emotion,
584 | volume,
585 | global_avg_volume,
586 | intensity,
587 | )
588 | shaken_clip = (
589 | apply_shake(avatar_loop, intensity)
590 | .set_duration(duration)
591 | .set_fps(base_fps)
592 | .set_start(start)
593 | )
594 |
595 | subclips.append(shaken_clip)
596 | prev_end = end
597 |
598 |     # 3) Any leftover gap after the last segment (audio that extends beyond
599 |     #    segments[-1].end) is intentionally not filled here: this function does
600 |     #    not know the real full audio duration, only the segment boundaries.
601 |     #    The caller, create_avatar_video_from_audio, checks the audio track's
602 |     #    duration and appends a final default-avatar loop to cover that tail,
603 |     #    so no extra handling is needed at this point.
605 |
606 | return subclips
607 |
608 |
609 | def create_avatar_video_from_audio(
610 | audio_path_str: str,
611 | config: Dict[str, Any],
612 | max_workers: Optional[int] = None,
613 | ) -> None:
614 | """
615 | High-level function to generate the avatar video:
616 | 1. Load audio (video or audio file).
617 | 2. Generate or load segment data (transcription, emotion, volume).
618 | 3. Preload avatar clips.
619 | 4. Build a list of timed subclips (avatar loops and default gaps).
620 | 5. Composite all subclips and attach the original audio.
621 | 6. Export the final video as 'output_video.mp4'.
622 |
623 | Args:
624 | audio_path_str (str): Path to the input audio or video file.
625 | config (Dict[str, Any]): A configuration dictionary that must contain:
626 | - 'avatars': Dict[str, str] mapping emotion keys -> avatar file paths.
627 | - 'shake_factor': float representing maximum shake intensity scale.
628 | max_workers (Optional[int]): Number of threads to use for segment processing.
629 | """
630 | audio_path = Path(audio_path_str)
631 | logger.info("Starting avatar video generation for '%s'.", audio_path)
632 |
633 | # 1. Load the audio clip (attempt VideoFileClip first, else AudioFileClip)
634 | try:
635 | video_reader = VideoFileClip(str(audio_path))
636 | audio_clip = video_reader.audio
637 | logger.info(
638 | "Extracted audio from video '%s'. Duration=%.2f sec.",
639 | audio_path,
640 | audio_clip.duration,
641 | )
642 | except Exception:
643 | logger.info("Input is not a video or failed to extract. Loading as pure audio.")
644 | try:
645 | audio_clip = AudioFileClip(str(audio_path))
646 | logger.info(
647 | "Loaded audio-only file '%s'. Duration=%.2f sec.",
648 | audio_path,
649 | audio_clip.duration,
650 | )
651 | except Exception as e:
652 | logger.error("Failed to load '%s' as audio: %s. Aborting.", audio_path, e)
653 | return
654 |
655 | total_duration = audio_clip.duration
656 |
657 | # 2. Build segments (load from cache or generate new)
658 | emotion_map: Dict[str, str] = config.get("avatars", {})
659 | if not emotion_map:
660 | logger.error("No 'avatars' mapping provided in config. Cannot proceed.")
661 | return
662 |
663 | segments, global_avg_volume = generate_segment_data(
664 | audio_path, emotion_map, max_workers
665 | )
666 | if not segments:
667 | logger.error("No segments generated. Aborting video creation.")
668 | return
669 |
670 | # 3. Preload avatar clips
671 | preloaded_clips = load_avatar_clips(emotion_map)
672 | default_emotion = list(emotion_map.keys())[0]
673 | if default_emotion not in preloaded_clips:
674 | logger.error(
675 | "Default emotion '%s' avatar not preloaded. Aborting video creation.",
676 | default_emotion,
677 | )
678 | return
679 |
680 | default_clip = preloaded_clips[default_emotion]
681 | shake_factor = config.get("shake_factor", 0.1)
682 |
683 | # 4. Build all subclips
684 |     logger.info("Building avatar subclips for %d segments...", len(segments))
685 | subclips = build_avatar_subclips(
686 | segments, default_clip, preloaded_clips, global_avg_volume, shake_factor
687 | )
688 |
689 | # 4.a. Check if final tail clip needed (if last segment end < total_duration)
690 | last_end_time = segments[-1].end
691 | if last_end_time < total_duration:
692 | gap = total_duration - last_end_time
693 | default_fps = getattr(default_clip, "fps", DEFAULT_FPS) or DEFAULT_FPS
694 | logger.info(
695 | "Adding final default avatar loop to cover gap [%.2f-%.2f].",
696 | last_end_time,
697 | total_duration,
698 | )
699 | final_tail = (
700 | default_clip.loop(duration=gap)
701 | .set_duration(gap)
702 | .set_fps(default_fps)
703 | .set_start(last_end_time)
704 | )
705 | subclips.append(final_tail)
706 |
707 | # 5. Composite all subclips into one video, sized as the default clip
708 | width, height = default_clip.w, default_clip.h
709 | logger.info(
710 | "Compositing %d subclips into final video of size (%d x %d).",
711 | len(subclips),
712 | width,
713 | height,
714 | )
715 | final_video = CompositeVideoClip(subclips, size=(width, height))
716 | final_video = final_video.set_audio(audio_clip).set_duration(total_duration)
717 |
718 | # 6. Export the final video
719 | output_path = Path("output_video.mp4")
720 | try:
721 | logger.info("Writing final video to '%s'...", output_path)
722 | final_video.write_videofile(
723 | str(output_path),
724 | codec="libx264",
725 | audio_codec="aac",
726 | fps=DEFAULT_FPS,
727 | preset="medium",
728 | verbose=False,
729 | logger=None,
730 | )
731 | logger.info("Successfully saved avatar video as '%s'.", output_path)
732 | except Exception as e:
733 | logger.error("Failed to write final video '%s': %s", output_path, e)
734 | finally:
735 | # 7. Release resources: close all loaded clips
736 | logger.info("Releasing resources for avatar clips and final video.")
737 | for clip in preloaded_clips.values():
738 | try:
739 | clip.close()
740 | except Exception as e:
741 | logger.warning("Error closing avatar clip: %s", e)
742 | try:
743 | final_video.close()
744 | except Exception:
745 | pass
746 | try:
747 | audio_clip.close()
748 | except Exception:
749 | pass
750 | if "video_reader" in locals():
751 | try:
752 | video_reader.close()
753 | except Exception:
754 | pass
755 |
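# ---------------------------------------------------------------------------
# Example usage (illustrative sketch only; the input path, avatar filenames and
# worker count below are assumptions, not part of the module itself). In the
# real project the configuration is expected to come from config.json via the
# config loader, and this module is driven from main.py.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    example_config = {
        # Mapping: emotion label -> avatar video path. The first key doubles as
        # the default/fallback emotion.
        "avatars": {
            "neutral": "avatars/neutral.mp4",
            "happy": "avatars/happy.mp4",
            "sad": "avatars/sad.mp4",
        },
        # Scales how strongly the avatar shakes relative to segment loudness
        # (0.1 matches the fallback used in create_avatar_video_from_audio).
        "shake_factor": 0.1,
    }
    # Writes the result to 'output_video.mp4' in the current directory.
    create_avatar_video_from_audio("input_audio.mp3", example_config, max_workers=4)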
--------------------------------------------------------------------------------