├── .dockerignore ├── .env.sample ├── .gitignore ├── LICENSE ├── PromptClip_multimodal.ipynb ├── PromptClip_spoken.ipynb ├── PromptClip_visual.ipynb ├── README.md ├── llm_agent.py ├── requirements.txt └── video_prompter.py /.dockerignore: -------------------------------------------------------------------------------- 1 | venv 2 | .git -------------------------------------------------------------------------------- /.env.sample: -------------------------------------------------------------------------------- 1 | VIDEO_DB_API_KEY= 2 | OPENAI_API_KEY= 3 | ANTHROPIC_KEY= -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.log 3 | ngrok 4 | !lib/README.md 5 | .DS_Store 6 | google/ 7 | lib/ 8 | archive 9 | temp/ 10 | transcribe/ 11 | .test.ipynb 12 | .idea/ 13 | ideas/ 14 | .ipynb_checkpoints 15 | log/ 16 | model_data/ 17 | data/ 18 | nohup.out 19 | dump.rdb 20 | *.out 21 | *.zip 22 | .idea/* 23 | .env 24 | venv/ 25 | test.json 26 | __pycache__ 27 | __pycache__/* 28 | */__pycache__ 29 | zappa_settings.py 30 | .vscode -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) Ashutosh Trivedi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /PromptClip_multimodal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Overview\n", 8 | "This notebook walks your through the process of creating clips with LLM prompts using multimodal information of video. \n", 9 | "\n", 10 | "Pick a video, decide your prompt, generate a new clip ⚡️\n", 11 | "\n", 12 | "It's as simple as it sounds.\n", 13 | "\n", 14 | "If you want to go extra mile you can add Image Overlays or Audio overlays on these clips." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# But first, let's install the dependecies.\n", 24 | "!pip install -r requirements.txt" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "### Loading The Video\n", 32 | "\n", 33 | "Before proceeding, ensure access to [VideoDB](https://videodb.io) API key. If not, sign up for API access on the respective platforms.\n", 34 | "\n", 35 | "> Get your API key from [VideoDB Console](https://console.videodb.io). ( Free for first 50 uploads, **No credit card required** ) 🎉\n", 36 | "\n", 37 | "You can either source a new video from YouTube or select an existing one from your VideoDB collection.\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "import os\n", 47 | "import videodb\n", 48 | "\n", 49 | "from dotenv import load_dotenv\n", 50 | "\n", 51 | "\n", 52 | "# TODO: setup .env file\n", 53 | "load_dotenv()\n", 54 | "\n", 55 | "# Connect to VideoDB\n", 56 | "conn = videodb.connect()\n", 57 | "coll = conn.get_collection()\n", 58 | "\n", 59 | "# TODO: Add video_id if video already exists in the collection\n", 60 | "video_id = os.getenv(\"MULTIMODAL_DEMO_VIDEO_ID\")\n", 61 | "video_url = \"https://www.youtube.com/watch?v=NZGLHdcw2RM\"\n", 62 | "\n", 63 | "if not video_id:\n", 64 | " video = coll.upload(url=video_url)\n", 65 | "else:\n", 66 | " video = coll.get_video(video_id)\n", 67 | "\n", 68 | "print(f\"video_id: {video.id}, name: {video.name}\")\n", 69 | "video.play()" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### Indexing The Visual And Spoken Information\n", 77 | "\n", 78 | "In this example, we are using a cricket explainer video. We will index it based on spoken words and scenes to support complex queries that require multimodal information." 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "Below cell will retrieve the transcript if it's already indexed; otherwise, it will first index the content and then set the transcript." 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 7, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "try:\n", 95 | " transcript = video.get_transcript()\n", 96 | " transcript_text = video.get_transcript_text()\n", 97 | "except Exception:\n", 98 | " video.index_spoken_words()\n", 99 | " transcript = video.get_transcript()\n", 100 | " transcript_text = video.get_transcript_text()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Here, you can either provide the `scene_index_id` of the video, if available, or leave it blank to index the video for visual retrieval.\n", 108 | "\n", 109 | "To know more about scene indexing click [here](https://docs.videodb.io/scene-index-guide-80)." 
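For reference, `video.get_scene_index(scene_index_id)` returns a plain list of dictionaries, and the inspection and prompting cells below iterate over exactly these keys. A minimal sketch of that shape, with made-up placeholder values:

```python
# Illustrative shape of video.get_scene_index(scene_index_id) -- the values below are placeholders.
scenes_example = [
    {"start": 0.0, "end": 8.5, "description": "A bowler runs in and delivers the ball."},
    {"start": 8.5, "end": 15.2, "description": "An infographic introduces the three match formats."},
]
for scene in scenes_example:
    print(f"{scene['start']}-{scene['end']}: {scene['description']}")
```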
110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# Add scene_index_id here if already indexed.\n", 119 | "scene_index_id = os.getenv(\"MULTIMODAL_DEMO_SCENE_INDEX_ID\")\n", 120 | "\n", 121 | "if not scene_index_id:\n", 122 | " scene_index_id = video.index_scenes(\n", 123 | " prompt=\"Summarize the essence of the scene in one or two concise sentences.\"\n", 124 | " )\n", 125 | "scenes = video.get_scene_index(scene_index_id)\n", 126 | "print(f\"Video is indexed with scene_index_id {scene_index_id} and has {len(scenes)} scenes.\")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "### Inspecting The Indexed Scenes" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "for scene in scenes:\n", 143 | " print(f\"{scene['start']}-{scene['end']}: {scene['description']}\")" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "### Get Results From Your Prompt\n", 151 | "\n", 152 | "The `multimodal_promper` function in `video_prompter.py` takes the transcript and indexed scenes, combines them, chunks them, and then parallelly calls the LLM with user and system prompts to retrieve the desired matching scenes.\n", 153 | "\n", 154 | "To create a clip using the `multimodal_promper` function from a video, it's crucial to craft a specific prompt that will help identify the most relevant segments for your use case. This prompt should highlight the themes, activity, or combination of verbal or visual cues you're interested in. \n", 155 | "\n", 156 | "The `multimodal_promper` will return sentences which are visual description from matched chunks. Review these to ensure they align with your needs. You can then use these segments to create your clip, focusing on the content that's most relevant to your use case." 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "from video_prompter import multimodal_prompter\n", 166 | "\n", 167 | "user_prompt = \"find the scene explaining the cricket rules using infographics, remember rules with infographics.\"\n", 168 | "# Here, we are only interested in the section of the video where cricket rules are explained using infographics.\n", 169 | "# If we create a clip using only the spoken index data, we won't know where the infographics are.\n", 170 | "# If we create a clip using only the visual index data, we may include additional infographics that aren't actually about the rules but might appear to be rule explanations based on visual information.\n", 171 | "# By creating a clip using combined data, we achieve a much more precise intersection where the infographics are present, and the rules are being explained.\n", 172 | "result = multimodal_prompter(transcript, scenes, user_prompt)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### Generate The Clip\n", 180 | "To generate a clip, first we'll call `get_result_timestamps` from `video_prompter.py` it uses VideoDB's `keyword search` feature. We already leveraged the power of the LLM (Large Language Model) to identify relevant sentences. We'll use the search results to create a programmable video stream. 
Here's how you can approach this process:\n", 181 | "\n", 182 | "We have the descriptions in the `results` variable. We input these keywords into VideoDB's keyword search feature. This function will search through the indexed scenes of your videos to find matches.\n", 183 | "\n", 184 | "The search will return a SearchResult object, which contains detailed information about the found segments, including their timestamps, the text of the scene description.\n", 185 | "\n", 186 | "**Create a Programmable Video Stream with Timeline:** With the specific segments identified, you can now use `build_video_timeline` from `video_prompter.py` to get the Timeline to create a new programmable video stream. The Timeline tool allows you to stitch together video segments based on the timestamps provided in the SearchResult object. You can arrange, cut, or combine these segments to craft a fresh video stream that focuses on your topic of interest." 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "from videodb import play_stream\n", 196 | "from videodb.timeline import Timeline\n", 197 | "\n", 198 | "from video_prompter import get_result_timestamps, build_video_timeline\n", 199 | "\n", 200 | "timeline = Timeline(conn)\n", 201 | "result_timestamps = get_result_timestamps(video, result, scene_index_id=scene_index_id)\n", 202 | "timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n", 203 | "stream = timeline.generate_stream()\n", 204 | "play_stream(stream)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "### Bonus - With Single Modality For Comparison" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "#### Clip Using Only Spoken Information\n", 219 | "Since the transcript does not include information about infographics, not all infographics with rules are captured in the result.\n", 220 | "Segments explaining how many days each format is played are missing." 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 8, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "from video_prompter import text_prompter\n", 230 | "\n", 231 | "user_prompt = \"find the scene explaining the cricket rules using infographics, remember rules with infographics.\"\n", 232 | "text_result = text_prompter(transcript_text, user_prompt)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "timeline = Timeline(conn)\n", 242 | "result_timestamps = get_result_timestamps(video, text_result, index_type=\"spoken_word\")\n", 243 | "timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n", 244 | "stream = timeline.generate_stream()\n", 245 | "play_stream(stream)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "Since the model doesn't have information about the visual infographics, the segments toward the end, where the rules about overs are explained, are missing from the results." 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "#### Clip On Scene Information\n", 260 | "\n", 261 | "Here, segments that contain infographics but are not related to rules are also being captured in the results too." 
262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 10, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "from video_prompter import scene_prompter\n", 271 | "\n", 272 | "user_prompt = \"find the scene explaining the cricket rules using infographics, remember rules with infographics.\"\n", 273 | "scene_result = scene_prompter(scenes, user_prompt)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "timeline = Timeline(conn)\n", 283 | "result_timestamps = get_result_timestamps(video, scene_result, scene_index_id=scene_index_id)\n", 284 | "timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n", 285 | "stream = timeline.generate_stream()\n", 286 | "play_stream(stream)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "Since the model doesn't have information about the spoken content, false positive segments are being added—these segments aren't actually about rules but may appear to be based on visual information." 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "### Conclusion \n", 301 | "\n", 302 | "As shown above, in cases where both modalities are important to retrieve the desired clip, this notebook works best. If only one modality is sufficient, the respective functions can be used.\n", 303 | "\n", 304 | "We can't wait to see which approach works best for your videos—do share your results with us!\n", 305 | "\n", 306 | "* [Discord](https://colab.research.google.com/corgiredirector?site=https%3A%2F%2Fdiscord.gg%2Fpy9P639jGz)\n", 307 | "* [GitHub](https://github.com/video-db)\n", 308 | "* [Website](https://colab.research.google.com/corgiredirector?site=https%3A%2F%2Fvideodb.io)\n", 309 | "* [Email](ashu@videodb.io)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "### Playground For You To Experiment" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "video_url = \"\"\n", 326 | "video_id = \"\"\n", 327 | "\n", 328 | "if not video_id:\n", 329 | " video = coll.upload(url=video_url)\n", 330 | "else:\n", 331 | " video = coll.get_video(video_id)\n", 332 | "\n", 333 | "print(f\"video_id: {video.id}, name: {video.name}\")" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "try:\n", 343 | " transcript = video.get_transcript()\n", 344 | " transcript_text = video.get_transcript_text()\n", 345 | "except Exception:\n", 346 | " video.index_spoken_words()\n", 347 | " transcript = video.get_transcript()\n", 348 | " transcript_text = video.get_transcript_text()" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": {}, 355 | "outputs": [], 356 | "source": [ 357 | "scene_index_id = \"\"\n", 358 | "\n", 359 | "if not scene_index_id:\n", 360 | " scene_index_id = video.index_scenes(\n", 361 | " prompt=\"Summarize the essence of the scene in one or two concise sentences.\"\n", 362 | " )\n", 363 | "scenes = video.get_scene_index(scene_index_id)\n", 364 | "print(f\"Video is indexed with scene_index_id {scene_index_id} and has {len(scenes)} scenes.\")" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | 
"metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "from video_prompter import multimodal_prompter\n", 374 | "\n", 375 | "user_prompt = \"\"\n", 376 | "result = multimodal_prompter(transcript, scenes, user_prompt)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "from videodb import play_stream\n", 386 | "from videodb.timeline import Timeline\n", 387 | "\n", 388 | "from video_prompter import get_result_timestamps, build_video_timeline\n", 389 | "\n", 390 | "timeline = Timeline(conn)\n", 391 | "result_timestamps = get_result_timestamps(video, result, scene_index_id=scene_index_id)\n", 392 | "timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n", 393 | "stream = timeline.generate_stream()\n", 394 | "play_stream(stream)" 395 | ] 396 | } 397 | ], 398 | "metadata": { 399 | "kernelspec": { 400 | "display_name": "venv", 401 | "language": "python", 402 | "name": "python3" 403 | }, 404 | "language_info": { 405 | "codemirror_mode": { 406 | "name": "ipython", 407 | "version": 3 408 | }, 409 | "file_extension": ".py", 410 | "mimetype": "text/x-python", 411 | "name": "python", 412 | "nbconvert_exporter": "python", 413 | "pygments_lexer": "ipython3", 414 | "version": "3.12.4" 415 | } 416 | }, 417 | "nbformat": 4, 418 | "nbformat_minor": 2 419 | } 420 | -------------------------------------------------------------------------------- /PromptClip_spoken.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Overview\n", 8 | "This notebook walks your through the process of creating clips with LLM prompts using spoken content of the video. \n", 9 | "\n", 10 | "Pick a video, decide your prompt, generate a new clip ⚡️\n", 11 | "\n", 12 | "It's as simple as it sounds.\n", 13 | "\n", 14 | "If you want to go extra mile you can score and rank your results, add Image Overlays or Audio overlays on these clips." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "scrolled": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "# But first, let's install the dependecies.\n", 26 | "!pip install -r requirements.txt" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "### Choose the Video\n", 34 | "\n", 35 | "You can either use a fresh video from Youtube etc. or choose an exisitng one already uploaded on your VideoDB collection." 
36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "scrolled": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "import os\n", 47 | "import videodb\n", 48 | "\n", 49 | "from dotenv import load_dotenv\n", 50 | "\n", 51 | "\n", 52 | "# TODO: setup .env file\n", 53 | "load_dotenv()\n", 54 | "\n", 55 | "# Connect to VideoDB\n", 56 | "conn = videodb.connect()\n", 57 | "coll = conn.get_collection()\n", 58 | "\n", 59 | "# NOTE: Please set video_id or SPOKEN_DEMO_VIDEO_ID in .env if video already exists in the collection\n", 60 | "video_id = os.getenv(\"SPOKEN_DEMO_VIDEO_ID\")\n", 61 | "video_url = \"https://www.youtube.com/watch?v=HpUR7-Oe1ss\"\n", 62 | "\n", 63 | "if not video_id:\n", 64 | " video = coll.upload(url=video_url)\n", 65 | "else:\n", 66 | " video = coll.get_video(video_id)\n", 67 | "\n", 68 | "print(f\"video_id: {video.id}, name: {video.name}\")\n", 69 | "video.play()" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### Indexing Spoken Information" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "try:\n", 86 | " transcript_text = video.get_transcript_text()\n", 87 | "except Exception:\n", 88 | " video.index_spoken_words()\n", 89 | " transcript_text = video.get_transcript_text()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "### Run your Prompt\n", 97 | "\n", 98 | "To create a clip using the `text_prompter` function from a video, it's crucial to craft a specific prompt that will help identify the most relevant segments for your use case. This prompt should highlight the themes, topics, or specific phrases you're interested in. The function then analyzes the video's spoken content to find segments that match your criteria. \n", 99 | "\n", 100 | "Before you can use `text_prompter`, make sure the video's spoken content is indexed with the `video.index_spoken_words()` function. This prepares the video for analysis by making its spoken content searchable.\n", 101 | "\n", 102 | "The `text_prompter` will return sentences or segments from the video that match your prompt. Review these to ensure they align with your needs. You can then use these segments to create your clip, focusing on the content that's most relevant to your use case." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "scrolled": true 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "from video_prompter import text_prompter\n", 114 | "\n", 115 | "# Choose a prompt to create create clip. \n", 116 | "user_prompt = \"find sentences where a deal is discussed\"\n", 117 | "result = text_prompter(transcript_text, user_prompt)\n", 118 | "print(f\"Found {len(result)} segments in the video.\")" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "### Generate the Clip\n", 126 | "\n", 127 | "To generate a clip, we'll use **VideoDB**'s `keyword search` feature. We already leveraged the power of the LLM (Large Language Model) to identify relevant sentences. We'll use the search results to create a `programmable video stream`. Here's how you can approach this process:\n", 128 | "\n", 129 | "We have the keywords in the `results` variable. Input these keywords into VideoDB's keyword search feature. 
This function will search through the indexed spoken content of your videos to find matches. \n", 130 | "\n", 131 | "The search will return a SearchResult object, which contains detailed information about the found segments, including their timestamps, the text of the spoken content, and possibly other metadata.\n", 132 | "\n", 133 | "**Create a Programmable Video Stream with Timeline**: With the specific segments identified, you can now use the Timeline to create a new programmable video stream. The Timeline tool allows you to stitch together video segments based on the timestamps provided in the SearchResult object. You can arrange, cut, or combine these segments to craft a fresh video stream that focuses on your topic of interest." 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "from videodb import play_stream\n", 143 | "from videodb.timeline import Timeline\n", 144 | "from video_prompter import get_result_timestamps, build_video_timeline\n", 145 | "\n", 146 | "timeline = Timeline(conn)\n", 147 | "result_timestamps = get_result_timestamps(video, result, index_type=\"spoken_word\")\n", 148 | "timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n", 149 | "stream = timeline.generate_stream()\n", 150 | "print(stream)\n", 151 | "play_stream(stream)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "### Modify the Timeline\n", 159 | "\n", 160 | "The programmable stream part of VideoDB allows you to not just watch the original clip but also modify and personalize the stream. Here we can add up the logo on each clip easily. You can read more about it here - https://docs.videodb.io/version-0-0-3-timeline-and-assets-44" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "# upload Image\n", 170 | "from videodb import MediaType\n", 171 | "\n", 172 | "image_id = os.getenv(\"SPOKEN_DEMO_IMAGE_ID\")\n", 173 | "if not image_id:\n", 174 | " image = conn.upload(url=\"https://www.freepnglogos.com/uploads/logo-ig-png/logo-ig-instagram-new-logo-vector-download-13.png\", media_type=MediaType.image)\n", 175 | " image_id = image.id" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 11, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "from videodb.asset import VideoAsset, AudioAsset, ImageAsset\n", 185 | "\n", 186 | "image_asset = ImageAsset(\n", 187 | " asset_id=image_id,\n", 188 | " width=40,\n", 189 | " height=40,\n", 190 | " x=20,\n", 191 | " y=10,\n", 192 | " duration=7\n", 193 | ")" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "timeline.add_overlay(0, image_asset)\n", 203 | "stream = timeline.generate_stream()\n", 204 | "play_stream(stream)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "### Bonus : Ranking using LLM\n", 212 | "If you want to choose only a few top results and wodering how to do it, have LLM to rank your results and create a score that you can use to decide the order of segments. You can modify the ranking prompt creativiely to drive the outcome of it. 
We would love to see what you create 🙌🏼" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 14, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "from llm_agent import LLM\n", 222 | "import re\n", 223 | "import json\n", 224 | "from math import floor\n", 225 | "\n", 226 | "def ranking_prompt_llm(text, prompt):\n", 227 | " ranking_prompt = \"\"\"Given the text provided below and a specific User Prompt, evaluate the relevance of the text\n", 228 | " in relation to the user's prompt. Please assign a relevance score ranging from 0 to 10, where 0 indicates no relevance \n", 229 | " and 10 signifies perfect alignment with the user's request.\n", 230 | " The score quality also increases when the text is a complete senetence, making it perfect for a video clip result\"\"\"\n", 231 | "\n", 232 | " # pass the data\n", 233 | " ranking_prompt += f\"\"\"\n", 234 | " text: {text}\n", 235 | " User Prompt: {prompt}\n", 236 | " \"\"\"\n", 237 | "\n", 238 | " # Add instructions to always return JSON at the end of processing.\n", 239 | " ranking_prompt += \"\"\"\n", 240 | " Ensure the final output strictly adheres to the JSON format specified, without including additional text or explanations. \n", 241 | " Use the following structure for your response:\n", 242 | " {\n", 243 | " \"score\": \n", 244 | " }\n", 245 | " \"\"\"\n", 246 | " try:\n", 247 | " response = LLM().chat(message=ranking_prompt)\n", 248 | " print(response)\n", 249 | " output = response[\"choices\"][0][\"message\"][\"content\"]\n", 250 | " res = json.loads(output)\n", 251 | " score = res.get('score')\n", 252 | " return score\n", 253 | " except Exception as e:\n", 254 | " return 0 \n", 255 | "\n", 256 | "def rank_results(res, prompt, score_percentage=0.40):\n", 257 | " \"\"\"\n", 258 | " rank and give score to each result\n", 259 | " \"\"\"\n", 260 | " res_score = []\n", 261 | " for text in res:\n", 262 | " res_score.append((text, ranking_prompt_llm(text,prompt)))\n", 263 | " \n", 264 | " res_score_sorted = sorted(res_score, key=lambda x: x[1], reverse=True)\n", 265 | " return res_score_sorted[0: floor(len(res_score_sorted)*score_percentage)]" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": { 272 | "scrolled": true 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "ranked_results = rank_results(result, user_prompt)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "ranked_results" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 17, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "# Search for matching video segments and watch the resulting stream\n", 295 | "from videodb import SearchType\n", 296 | "from videodb.timeline import Timeline, VideoAsset, AudioAsset\n", 297 | "\n", 298 | "timeline = Timeline(conn)\n", 299 | "for sentences, score in ranked_results:\n", 300 | " search_res = video.search(sentences, search_type=SearchType.keyword)\n", 301 | " matched_segments = search_res.get_shots()\n", 302 | " \n", 303 | " # No exact match found\n", 304 | " if len(matched_segments) == 0:\n", 305 | " continue\n", 306 | "\n", 307 | " # Get the first matched video segment\n", 308 | " video_shot = matched_segments[0]\n", 309 | "\n", 310 | " # Create a new Video Asset and add it to a timeline.\n", 311 | " timeline.add_inline(VideoAsset(asset_id=video.id, start=video_shot.start, end=video_shot.end))" 312 | ] 313 | 
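The same ranked supercut can also be assembled with the helper functions used earlier in this notebook. The sketch below assumes the `ranked_results` produced above and simply strips the scores before passing the sentences back to `get_result_timestamps`; the `max_duration` value is an illustrative assumption.

```python
# Sketch: rebuild the ranked supercut with the repo's helper functions.
from video_prompter import get_result_timestamps, build_video_timeline

top_sentences = [text for text, score in ranked_results]  # drop the LLM scores
timeline = Timeline(conn)
result_timestamps = get_result_timestamps(video, top_sentences, index_type="spoken_word")
# Optionally cap the total clip length (assumed to be expressed in seconds).
timeline, duration = build_video_timeline(video, result_timestamps, timeline, max_duration=60)
play_stream(timeline.generate_stream())
```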
}, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "stream = timeline.generate_stream()\n", 321 | "play_stream(stream)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "### Add some sound effects to it 🎶\n", 329 | "\n", 330 | "Not just this we can jazz it up with audio overlays and create another stream with audio overlays." 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "# Add music overlay, this can be laughter soundtrack\n", 340 | "audio_id = os.getenv(\"SPOKEN_DEMO_AUDIO_ID\")\n", 341 | "if not audio_id:\n", 342 | " audio = conn.upload(url=\"https://www.youtube.com/watch?v=q3VVxbJa61Q\", media_type=MediaType.audio)\n", 343 | " audio_id = audio.id\n", 344 | " print(f\"Uploaded audio with id {audio_id}\")" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 21, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "# 1 sec background audio \n", 354 | "background = AudioAsset(asset_id=audio_id, start=3, end=4, disable_other_tracks=True)" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "timeline = Timeline(conn)\n", 364 | "dur_so_far = 0\n", 365 | "for clip_sentences, score in ranked_results:\n", 366 | " try:\n", 367 | " search_res = video.search(clip_sentences, search_type=SearchType.keyword)\n", 368 | " matched_segments = search_res.get_shots()\n", 369 | " \n", 370 | " # No exact match found\n", 371 | " if len(matched_segments) == 0:\n", 372 | " continue\n", 373 | " \n", 374 | " #video segment\n", 375 | " video_shot = matched_segments[0]\n", 376 | " \n", 377 | " # Create a new Video Asset and add it to a timeline.\n", 378 | " timeline.add_inline(VideoAsset(asset_id=video.id, start=video_shot.start, end=video_shot.end))\n", 379 | " chunk_dur = (video_shot.end - video_shot.start)\n", 380 | " dur_so_far += chunk_dur \n", 381 | " if chunk_dur < 2:\n", 382 | " print(\"Skipping since chunk duration is less then the overlay audio.\")\n", 383 | " continue\n", 384 | " timeline.add_overlay(dur_so_far-2, background)\n", 385 | " except Exception as e:\n", 386 | " print(f\"Error: skipping the segment {str(e)}\")" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": { 393 | "scrolled": true 394 | }, 395 | "outputs": [], 396 | "source": [ 397 | "# add music overlay in the last 2 sec of each supercut.\n", 398 | "stream = timeline.generate_stream()\n", 399 | "play_stream(stream)" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "If you have any questions or feedback. 
Feel free to reach out to us 🙌🏼\n", 407 | "\n", 408 | "* [Discord](https://colab.research.google.com/corgiredirector?site=https%3A%2F%2Fdiscord.gg%2Fpy9P639jGz)\n", 409 | "* [GitHub](https://github.com/video-db)\n", 410 | "* [Website](https://colab.research.google.com/corgiredirector?site=https%3A%2F%2Fvideodb.io)\n", 411 | "* [Email](ashu@videodb.io)" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "### Playground For You To Experiment" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "video_url = \"\"\n", 428 | "video_id = \"\"\n", 429 | "\n", 430 | "if not video_id:\n", 431 | " video = coll.upload(url=video_url)\n", 432 | "else:\n", 433 | " video = coll.get_video(video_id)\n", 434 | "\n", 435 | "print(f\"video_id: {video.id}, name: {video.name}\")" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "try:\n", 445 | " transcript = video.get_transcript()\n", 446 | " transcript_text = video.get_transcript_text()\n", 447 | "except Exception:\n", 448 | " video.index_spoken_words()\n", 449 | " transcript = video.get_transcript()\n", 450 | " transcript_text = video.get_transcript_text()" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "from video_prompter import text_prompter\n", 460 | "\n", 461 | "user_prompt = \"\"\n", 462 | "result = text_prompter(transcript_text, user_prompt)\n", 463 | "print(f\"Found {len(result)} segments in the video.\")" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "from videodb import play_stream\n", 473 | "from videodb.timeline import Timeline\n", 474 | "from video_prompter import get_result_timestamps, build_video_timeline\n", 475 | "\n", 476 | "timeline = Timeline(conn)\n", 477 | "result_timestamps = get_result_timestamps(video, result, index_type=\"spoken_word\")\n", 478 | "timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n", 479 | "stream = timeline.generate_stream()\n", 480 | "print(stream)\n", 481 | "play_stream(stream)" 482 | ] 483 | } 484 | ], 485 | "metadata": { 486 | "kernelspec": { 487 | "display_name": "Python 3 (ipykernel)", 488 | "language": "python", 489 | "name": "python3" 490 | }, 491 | "language_info": { 492 | "codemirror_mode": { 493 | "name": "ipython", 494 | "version": 3 495 | }, 496 | "file_extension": ".py", 497 | "mimetype": "text/x-python", 498 | "name": "python", 499 | "nbconvert_exporter": "python", 500 | "pygments_lexer": "ipython3", 501 | "version": "3.12.4" 502 | } 503 | }, 504 | "nbformat": 4, 505 | "nbformat_minor": 4 506 | } 507 | -------------------------------------------------------------------------------- /PromptClip_visual.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Overview\n", 8 | "This notebook walks you through the process of creating clips with LLM prompts using visual information.\n", 9 | "\n", 10 | "The sample video provided in the walkthrough is an episode of Mr. Bean, which relies heavily on visual comedy rather than spoken words. 
Therefore, visual information is the most effective way to create the clip.\n", 11 | "\n", 12 | "We will create two clips:\n", 13 | "\n", 14 | "1. The famous Mr. Bean copying in an exam meme.\n", 15 | "2. A compilation of all the car gags from the episode.\n", 16 | "\n", 17 | "\n", 18 | "\n", 19 | "
\n", 20 | "\n", 21 | "After the walkthrough, you can pick your own video, decide on your prompt, and generate a new clip ⚡️\n", 22 | "\n", 23 | "It's as simple as it sounds.\n", 24 | "\n", 25 | "If you want to go extra mile you can add Image Overlays or Audio overlays on these clips.\n", 26 | "\n" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# But first, let's install the dependecies.\n", 36 | "!pip install -r requirements.txt" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Loading The Video\n", 44 | "\n", 45 | "Before proceeding, ensure access to [VideoDB](https://videodb.io) API key. If not, sign up for API access on the respective platforms.\n", 46 | "\n", 47 | "> Get your API key from [VideoDB Console](https://console.videodb.io). ( Free for first 50 uploads, **No credit card required** ) 🎉\n", 48 | "\n", 49 | "You can either source a new video from YouTube or select an existing one from your VideoDB collection.\n" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "import os\n", 59 | "import videodb\n", 60 | "\n", 61 | "from dotenv import load_dotenv\n", 62 | "\n", 63 | "\n", 64 | "# TODO: setup .env file\n", 65 | "load_dotenv()\n", 66 | "\n", 67 | "# Connect to VideoDB\n", 68 | "conn = videodb.connect()\n", 69 | "coll = conn.get_collection()\n", 70 | "\n", 71 | "# NOTE: Add VISUAL_DEMO_VIDEO_ID in .env if video is already present in your collection\n", 72 | "video_id = os.getenv(\"VISUAL_DEMO_VIDEO_ID\")\n", 73 | "video_url = \"https://www.youtube.com/watch?v=7Im2I6STbms\"\n", 74 | "\n", 75 | "if not video_id:\n", 76 | " video = coll.upload(url=video_url)\n", 77 | "else:\n", 78 | " video = coll.get_video(video_id)\n", 79 | "\n", 80 | "print(f\"video_id: {video.id}, name: {video.name}\")\n", 81 | "video.play()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "### Indexing The Visual Information\n", 89 | "\n", 90 | "Here, you can either set the `scene_index_id` in cell or `VISUAL_DEMO_SCENE_INDEX_ID` in the .env, if already indexed, or leave it blank to index the video for visual retrieval.\n", 91 | "\n", 92 | "To know more about scene indexing click [here](https://docs.videodb.io/scene-index-guide-80).\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "#NOTE: Add VISUAL_DEMO_SCENE_INDEX_ID in the .env if already indexed.\n", 102 | "scene_index_id = os.getenv(\"VISUAL_DEMO_SCENE_INDEX_ID\")\n", 103 | "\n", 104 | "if not scene_index_id:\n", 105 | " scene_index_id = video.index_scenes(\n", 106 | " extraction_config={\n", 107 | " \"threshold\": 20, \n", 108 | " \"frame_count\": 3\n", 109 | " },\n", 110 | " prompt=\"Summarize the essence of the scene in one or two concise sentences without focusing on individual images.\"\n", 111 | " )\n", 112 | "scenes = video.get_scene_index(scene_index_id)\n", 113 | "print(f\"Video is indexed with scene_index_id {scene_index_id} with {len(scenes)} scenes.\")" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### Inspecting The Indexed Scenes" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "for scene in scenes:\n", 130 | " 
print(f\"{scene['start']}-{scene['end']}: {scene['description']}\")" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "### Get Results From Your Prompt\n", 138 | "\n", 139 | "The `scene_prompter` function in `video_prompter.py` takes the indexed scenes, chunks them, and then parallelly calls the LLM with user and system prompts to retrieve the desired matching scenes.\n", 140 | "\n", 141 | "To create a clip using the `scene_prompter` function from a video, it's crucial to craft a specific prompt that will help identify the most relevant segments for your use case. This prompt should highlight the themes, activity, or specific visual cues you're interested in. \n", 142 | "\n", 143 | "The `scene_prompter` will return sentences or segments from the video that match your prompt. Review these to ensure they align with your needs. You can then use these segments to create your clip, focusing on the content that's most relevant to your use case." 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "from video_prompter import scene_prompter\n", 153 | "\n", 154 | "# This prompt is for finding the iconic copying in examination scene of Mr. Bean\n", 155 | "user_prompt = \"find the moment where mr.bean is attempting to cheat peeking over at the answer sheet of man beside him, find it with high accuracy.\"\n", 156 | "result = scene_prompter(scenes, user_prompt)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "### Generate The Clip\n", 164 | "To generate a clip, first we'll call `get_result_timestamps` from `video_prompter.py` it uses VideoDB's `keyword search` feature. We already leveraged the power of the LLM (Large Language Model) to identify relevant sentences. We'll use the search results to create a programmable video stream. Here's how you can approach this process:\n", 165 | "\n", 166 | "We have the descriptions in the `results` variable. We input these keywords into VideoDB's keyword search feature. This function will search through the indexed scenes of your videos to find matches.\n", 167 | "\n", 168 | "The search will return a SearchResult object, which contains detailed information about the found segments, including their timestamps, the text of the scene description.\n", 169 | "\n", 170 | "**Create a Programmable Video Stream with Timeline**: With the specific segments identified, you can now use `build_video_timeline` from `video_prompter.py` to get the Timeline to create a new programmable video stream. The Timeline tool allows you to stitch together video segments based on the timestamps provided in the SearchResult object. You can arrange, cut, or combine these segments to craft a fresh video stream that focuses on your topic of interest." 
171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "from videodb import play_stream\n", 180 | "from videodb.timeline import Timeline\n", 181 | "from video_prompter import get_result_timestamps, build_video_timeline\n", 182 | "\n", 183 | "timeline = Timeline(conn)\n", 184 | "\n", 185 | "result_timestamps = get_result_timestamps(video, result, scene_index_id=scene_index_id)\n", 186 | "print(f\"We have got {len(result_timestamps)} segments matching with the user prompt.\")\n", 187 | "# Since we are only interested in one segment for the meme, we've hardcoded the timestamp to filter it out.\n", 188 | "# In an actual scenario, you can inspect all the segments and select the ones you're interested in.\n", 189 | "# Alternatively, you can skip the filtering if you want a clip of all the segments involving cheating.\n", 190 | "meme_start_time = 370.4\n", 191 | "meme_result = [next((item for item in result_timestamps if item[0] == meme_start_time), None)]\n", 192 | "if meme_result:\n", 193 | " print(\"Selecting the segment with the meme.\")\n", 194 | "else:\n", 195 | " print(f\"Segment with start {meme_start_time} not found.\")\n", 196 | " print(\"Taking the first segment instead, please inspect the user_prompt and result_timestamps if you are intrested only in the meme.\")\n", 197 | " meme_result = result_timestamps[:1]\n", 198 | "result_timestamps = meme_result\n", 199 | "timeline, duration = build_video_timeline(video, meme_result, timeline)\n", 200 | "stream = timeline.generate_stream()\n", 201 | "play_stream(stream)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "### Stream With Text Overlay\n", 209 | "\n", 210 | "You can add custom text to the meme for further personalization.\n", 211 | "\n", 212 | "For more customization options, refer to the [TextAsset Styling Guide](https://docs.videodb.io/guide-textasset-75)." 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "from videodb.asset import TextStyle\n", 222 | "from videodb.timeline import TextAsset\n", 223 | "\n", 224 | "timeline = Timeline(conn)\n", 225 | "timeline, duration = build_video_timeline(video, result_timestamps, timeline, top_n=1)\n", 226 | "left = TextAsset(\n", 227 | " text=\"XXXX\",\n", 228 | " duration=duration,\n", 229 | " style=TextStyle(\n", 230 | " x=190,\n", 231 | " y=15,\n", 232 | " font = \"Inter\",\n", 233 | " fontsize = 25,\n", 234 | " fontcolor = \"#002869\",\n", 235 | " )\n", 236 | ")\n", 237 | "right = TextAsset(\n", 238 | " text=\"YYYY\",\n", 239 | " duration=duration,\n", 240 | " style=TextStyle(\n", 241 | " x=420,\n", 242 | " y=15,\n", 243 | " font = \"Inter\",\n", 244 | " fontsize = 25, \n", 245 | " fontcolor = \"#00692c\",\n", 246 | " )\n", 247 | ")\n", 248 | "timeline.add_overlay(0, left)\n", 249 | "timeline.add_overlay(0, right)\n", 250 | "stream = timeline.generate_stream()\n", 251 | "play_stream(stream)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "### Stream With Image Overlay \n", 259 | "\n", 260 | "You can also add image overlays if needed. 
For more details, refer to the [Dynamic Video Stream Guide](https://docs.videodb.io/dynamic-video-stream-guide-44)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "from videodb.asset import TextStyle, ImageAsset\n", 270 | "from videodb import MediaType\n", 271 | "\n", 272 | "timeline = Timeline(conn)\n", 273 | "timeline, duration = build_video_timeline(video, result_timestamps, timeline, top_n=1)\n", 274 | "\n", 275 | "image1_id = os.getenv(\"VISUAL_DEMO_IMAGE_1\")\n", 276 | "if not image1_id:\n", 277 | " image1_url = \"https://upload.wikimedia.org/wikipedia/sco/thumb/d/d1/Ferrari-Logo.svg/344px-Ferrari-Logo.svg.png\"\n", 278 | " image1 = coll.upload(url=image1_url, media_type=MediaType.image)\n", 279 | " image1_id = image1.id\n", 280 | " print(f\"image1_id: {image1_id}\")\n", 281 | "\n", 282 | "image2_id = os.getenv(\"VISUAL_DEMO_IMAGE_2\")\n", 283 | "if not image2_id:\n", 284 | " image2_url = \"https://upload.wikimedia.org/wikipedia/en/thumb/6/66/McLaren_Racing_logo.svg/512px-McLaren_Racing_logo.svg.png\"\n", 285 | " image2 = coll.upload(url=image2_url, media_type=MediaType.image)\n", 286 | " image2_id = image2.id\n", 287 | " print(f\"image2_id: {image2_id}\")\n", 288 | "\n", 289 | "left = ImageAsset(\n", 290 | " asset_id=image1_id,\n", 291 | " duration=duration,\n", 292 | " width=70,\n", 293 | " height=124,\n", 294 | " x=150,\n", 295 | " y=200,\n", 296 | ")\n", 297 | "right = ImageAsset(\n", 298 | " asset_id=image2_id,\n", 299 | " duration=duration,\n", 300 | " width=128,\n", 301 | " height=40,\n", 302 | " x=400,\n", 303 | " y=240\n", 304 | ")\n", 305 | "\n", 306 | "timeline.add_overlay(0, left)\n", 307 | "timeline.add_overlay(0, right)\n", 308 | "stream = timeline.generate_stream()\n", 309 | "play_stream(stream)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "### Another Prompt With Simple Compilation" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "user_prompt = \"find all the car gags with high accuracy\"\n", 326 | "result = scene_prompter(scenes, user_prompt)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "timeline = Timeline(conn)\n", 336 | "result_timestamps = get_result_timestamps(video, result, scene_index_id=scene_index_id)\n", 337 | "timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n", 338 | "stream = timeline.generate_stream()\n", 339 | "play_stream(stream)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "If you have any questions or feedback. 
Feel free to reach out to us 🙌🏼\n", 347 | "\n", 348 | "* [Discord](https://colab.research.google.com/corgiredirector?site=https%3A%2F%2Fdiscord.gg%2Fpy9P639jGz)\n", 349 | "* [GitHub](https://github.com/video-db)\n", 350 | "* [Website](https://colab.research.google.com/corgiredirector?site=https%3A%2F%2Fvideodb.io)\n", 351 | "* [Email](ashu@videodb.io)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "### Playground For You To Experiment" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "video_url = \"\"\n", 368 | "video_id = \"\"\n", 369 | "\n", 370 | "if not video_id:\n", 371 | " video = coll.upload(url=video_url)\n", 372 | "else:\n", 373 | " video = coll.get_video(video_id)\n", 374 | "\n", 375 | "print(f\"video_id: {video.id}, name: {video.name}\")" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "scene_index_id = \"\"\n", 385 | "\n", 386 | "if not scene_index_id:\n", 387 | " scene_index_id = video.index_scenes(\n", 388 | " extraction_config={\n", 389 | " \"threshold\": 20, \n", 390 | " \"frame_count\": 3\n", 391 | " },\n", 392 | " prompt=\"Summarize the essence of the scene in one or two concise sentences without focusing on individual images.\"\n", 393 | " )\n", 394 | "scenes = video.get_scene_index(scene_index_id)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "for scene in scenes:\n", 404 | " print(f\"{scene['start']}-{scene['end']}: {scene['description']}\")" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "from video_prompter import scene_prompter\n", 414 | "\n", 415 | "user_prompt = \"\"\n", 416 | "result = scene_prompter(scenes, user_prompt)" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "timeline = Timeline(conn)\n", 426 | "result_timestamps = get_result_timestamps(video, result, scene_index_id=scene_index_id)\n", 427 | "timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n", 428 | "stream = timeline.generate_stream()\n", 429 | "play_stream(stream)" 430 | ] 431 | } 432 | ], 433 | "metadata": { 434 | "kernelspec": { 435 | "display_name": "venv", 436 | "language": "python", 437 | "name": "python3" 438 | }, 439 | "language_info": { 440 | "codemirror_mode": { 441 | "name": "ipython", 442 | "version": 3 443 | }, 444 | "file_extension": ".py", 445 | "mimetype": "text/x-python", 446 | "name": "python", 447 | "nbconvert_exporter": "python", 448 | "pygments_lexer": "ipython3", 449 | "version": "3.12.4" 450 | } 451 | }, 452 | "nbformat": 4, 453 | "nbformat_minor": 2 454 | } 455 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 6 | [![PyPI version][pypi-shield]][pypi-url] 7 | [![Stargazers][stars-shield]][stars-url] 8 | [![Issues][issues-shield]][issues-url] 9 | [![Website][website-shield]][website-url] 10 | 11 | 12 | 13 |
14 | 15 | 16 | Logo 17 | 18 | 19 | PromptClip ✄ 20 | 21 | 22 | Create video clips using LLM prompts 23 | 24 | 🐞Report a Bug 25 | · 26 | 💡Suggest a Feature 27 |
28 | 29 | 30 | 31 | # PromptClip: Create Instant Video Clips with LLM Prompts 🍭 32 | 33 | ## What does it do? 🤔 34 | 35 | It allows any developer to: 36 | 37 | * 📚 Upload a video from any source (Local Files, Youtube, S3 etc.) 38 | * 🔍 Prompt that video in natural language with queries like `Show funny moments in the video` 39 | or `find the moments useful for social media trailer` 40 | * 🎛️ Use any LLM of your choice like OpenAI, Claude or Gemeni Pro. 41 | * 🌟 Instantly watch the clip of those moments. 42 | * 🛠️ Finetune the clip or supercut by ranking results, managing length of the clip etc. 43 | * 🎸 Add music or image overlay to the clip. 44 | 45 | --- 46 | ## Results 🎉 47 | 48 | Checkout these prompts and the results 👉🏼 49 | 50 | | Original Video | Prompt | Link | 51 | |--------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| 52 | | [Shark Tank](https://www.youtube.com/watch?v=HpUR7-Oe1ss) | Find every moment where a deal was offered | [watch](https://console.videodb.io/player?url=https://stream.videodb.io/v3/published/manifests/f46f8de8-725a-410f-a83b-3b5e73a6b29a.m3u8) | 53 | | [Useful Gadgets](https://www.youtube.com/watch?v=bGmXrMW9ucU) | Show me moments in the video where the host discusses or reveals the pricing of the gadgets | [watch](https://console.videodb.io/player?url=https://stream.videodb.io/v3/published/manifests/b13d64f2-3294-4e75-bf68-ed7ccae7006d.m3u8) | 54 | | [Sponsorship Details of Huberman Podcast](https://www.youtube.com/watch?v=LYYyQcAJZfk) | Find details about every sponsor | [watch](https://console.videodb.io/player?url=https://stream.videodb.io/v3/published/manifests/13c5d0de-8fcd-435a-8e22-a7338f5465f0.m3u8) | 55 | | [Highlights from Masterchef Episode](https://www.youtube.com/watch?v=4JVzznqOF0k) | Show me the feedback from every judge | [watch](https://console.videodb.io/player?url=https://stream.videodb.io/v3/published/manifests/109cb830-9a5a-4c89-a3fa-f4be92f2c3db.m3u8) | 56 | | [Important Topics/Advice from a Lecture](https://www.youtube.com/watch?v=HAnw168huqA) | Find sentences where anxiety is discussed | [watch](https://console.videodb.io/player?url=https://stream.videodb.io/v3/published/manifests/f9faefd3-bb66-4756-a49b-dc30bdcf4617.m3u8) | 57 | | [Tech Review Video](https://www.youtube.com/watch?v=dtp6b76pMak) | Find sentences where host explains about the battery | [watch](https://console.videodb.io/player?url=https://stream.videodb.io/v3/published/manifests/e6ce8133-ed7d-4dad-a070-ead778a0d2d3.m3u8) | 58 | | [Travel Video](https://www.youtube.com/watch?v=sV1Z2LXtHqc) | What are some popular tourist destinations in Sri Lanka | [watch](https://console.videodb.io/player?url=https://stream.videodb.io/v3/published/manifests/36aba200-e4e6-40c0-aa45-8eafdf844fe0.m3u8) | 59 | 60 | ## How do I use it? 🛠️ 61 | 62 | - **Get your API key:** Sign up on [VideoDB console](https://console.videodb.io) (Free for the first 50 uploads, no 63 | credit card required). 🆓 64 | - **Set `VIDEO_DB_API_KEY`:** Enter your key in the `env` file. 65 | - **Set `OPENAI_API_KEY` or `ANTHROPIC_KEY`:** Add your LLM API Key in the `env` file. 66 | - **Install dependencies:** Run `pip install -r requirements.txt` in your terminal. 
67 | - **Run locally:** Run the notebook `PromptClip_text.ipynb`, `PromptClip_visual.ipynb` and `PromptClip_multimodal.ipynb` to experiment with your prompts and ranking of results. 68 | 69 | --- 70 | 71 | 72 | ## Roadmap 🛣️ 73 | 74 | 1. Add support for music generation models to jazzup the cuts. 75 | 2. Integrate with other projects like Pika Labs and Midjourney. 76 | 77 | --- 78 | 79 | 80 | ## Contributing 🤝 81 | 82 | Your contributions make the open-source community an incredible place for learning, inspiration, and creativity. We 83 | welcome and appreciate your input! Here's how you can contribute: 84 | 85 | - Open issues to share your use cases. 86 | - Participate in brainstorming solutions for our roadmap. 87 | - Suggest improvements to the codebase. 88 | 89 | ### Contribution Steps 90 | 91 | 1. Fork the Project 🍴 92 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`) 93 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`) 94 | 4. Push to the Branch (`git push origin feature/AmazingFeature`) 95 | 5. Open a Pull Request 📬 96 | 97 | --- 98 | 99 | 100 | 101 | 102 | [pypi-shield]: https://img.shields.io/pypi/v/videodb?style=for-the-badge 103 | 104 | [pypi-url]: https://pypi.org/project/videodb/ 105 | 106 | [python-shield]:https://img.shields.io/pypi/pyversions/videodb?style=for-the-badge 107 | 108 | [stars-shield]: https://img.shields.io/github/stars/video-db/promptClip.svg?style=for-the-badge 109 | 110 | [stars-url]: https://github.com/video-db/promptClip/stargazers 111 | 112 | [issues-shield]: https://img.shields.io/github/issues/video-db/videodb-python.svg?style=for-the-badge 113 | 114 | [issues-url]: https://github.com/video-db/promptClip/issues 115 | 116 | [website-shield]: https://img.shields.io/website?url=https%3A%2F%2Fvideodb.io%2F&style=for-the-badge&label=videodb.io 117 | 118 | [website-url]: https://videodb.io/ 119 | 120 | -------------------------------------------------------------------------------- /llm_agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extend this codebase to add any LLM 3 | """ 4 | 5 | import json 6 | import os 7 | 8 | import requests 9 | from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT 10 | import google.generativeai as genai 11 | from dotenv import load_dotenv 12 | 13 | load_dotenv() 14 | OPENAI_KEY = os.getenv("OPENAI_API_KEY") 15 | CLAUDE_KEY = os.getenv("ANTHROPIC_KEY") 16 | GEMINI_KEY = os.getenv("GEMINI_API_KEY") 17 | 18 | 19 | class LLMType: 20 | OPENAI = "openAI" 21 | CLAUDE = "claude" 22 | GEMINI = "gemini" 23 | 24 | 25 | class Models: 26 | GPT3 = "gpt-3.5-turbo-16k" 27 | GPT4 = "gpt-4" 28 | GPT4o = "gpt-4o" 29 | GPT4o_new = "gpt-4o-2024-08-06" 30 | CLAUDE_INSTANT = "claude-instant-1.1" 31 | CLAUDE2 = "claude-2" 32 | GEMINI_1_5_FLASH = "gemini-1.5-flash" 33 | GEMINI_1_5_PRO = "gemini-1.5-pro" 34 | OA_MODELS_WITH_RESPONSE_TYPE_SUPPORT = [GPT4o, GPT4o_new] 35 | 36 | 37 | class LLM: 38 | def __init__(self, llm_type=LLMType.OPENAI, model=Models.GPT4): 39 | self.type = llm_type 40 | self.model = model 41 | self.openai_key = os.getenv("OPENAI_API_KEY") 42 | self.claude_key = os.getenv("ANTHROPIC_KEY") 43 | self.gemini_key = os.getenv("GEMINI_KEY") 44 | 45 | def chat(self, message, functions=None): 46 | if self.type == LLMType.OPENAI: 47 | message = [self._to_gpt_msg(message)] 48 | return self._call_openai(message, functions) 49 | elif self.type == LLMType.CLAUDE: 50 | return self._call_claude(message) 51 | elif self.type == LLMType.GEMINI: 52 | return 
self._call_gemini(message) 53 | else: 54 | raise ValueError("Unsupported LLM type.") 55 | 56 | def _to_gpt_msg(self, data): 57 | """ 58 | convert data to message for LLM 59 | :param data: 60 | :return: 61 | """ 62 | context_msg = "" 63 | context_msg += str(data) 64 | 65 | return {"role": "system", "content": context_msg} 66 | 67 | def _call_openai(self, message, functions=None): 68 | url = "https://api.openai.com/v1/chat/completions" 69 | # print(f'call openAI with message {message}') 70 | headers = { 71 | "Content-Type": "application/json", 72 | "Authorization": f"Bearer {self.openai_key}", 73 | } 74 | data = { 75 | "model": self.model, 76 | "messages": message, 77 | "temperature": 0.6, 78 | } 79 | if self.model in Models.OA_MODELS_WITH_RESPONSE_TYPE_SUPPORT: 80 | data["response_format"] = {"type": "json_object"} 81 | if functions: 82 | data.update( 83 | { 84 | "functions": functions, 85 | "function_call": "auto", 86 | } 87 | ) 88 | 89 | response = requests.post(url, headers=headers, data=json.dumps(data)) 90 | try: 91 | return response.json() 92 | except json.JSONDecodeError: 93 | return {"error": "Failed to decode JSON response."} 94 | 95 | def _call_claude(self, message): 96 | anthropic = Anthropic(api_key=self.claude_key) 97 | prompt = f"{HUMAN_PROMPT} {message} {AI_PROMPT}" 98 | try: 99 | completion = anthropic.completions.create( 100 | model=self.model, 101 | max_tokens_to_sample=80000, 102 | prompt=prompt, 103 | ) 104 | return {"response": completion.completion} 105 | except ( 106 | Exception 107 | ) as e: # Consider a more specific exception based on the Anthropic SDK 108 | return {"error": str(e)} 109 | 110 | def _call_gemini(self, message): 111 | genai.configure(api_key=GEMINI_KEY) 112 | model = genai.GenerativeModel(self.model) 113 | try: 114 | response = model.generate_content(message) 115 | response_text = response.text.replace("```json", "").replace("```", "") 116 | response_json = json.loads(response_text) 117 | return response_json.get("sentences") 118 | except Exception as e: 119 | return {"error": str(e)} 120 | 121 | def get_word_limit(self): 122 | if self.type == LLMType.CLAUDE: 123 | return 10000 124 | return 2000 125 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | annotated-types==0.6.0 2 | anthropic==0.15.0 3 | anyio==4.2.0 4 | backoff==2.2.1 5 | certifi==2024.2.2 6 | charset-normalizer==3.3.2 7 | distro==1.9.0 8 | exceptiongroup==1.2.0 9 | filelock==3.13.1 10 | fsspec==2024.2.0 11 | google-generativeai==0.5.4 12 | h11==0.14.0 13 | httpcore==1.0.2 14 | httpx==0.26.0 15 | huggingface-hub==0.20.3 16 | idna==3.6 17 | packaging==23.2 18 | pydantic==2.6.1 19 | pydantic_core==2.16.2 20 | python-dotenv==1.0.1 21 | PyYAML==6.0.1 22 | requests==2.31.0 23 | sniffio==1.3.0 24 | tokenizers==0.15.1 25 | tqdm==4.66.1 26 | typing_extensions==4.9.0 27 | urllib3==2.2.0 28 | videodb==0.2.3 29 | -------------------------------------------------------------------------------- /video_prompter.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import concurrent.futures 4 | 5 | from llm_agent import LLM, LLMType 6 | from videodb import connect 7 | from videodb import SearchType, IndexType 8 | from videodb.timeline import VideoAsset 9 | 10 | 11 | from dotenv import load_dotenv 12 | 13 | load_dotenv() 14 | 15 | 16 | def get_connection(): 17 | """ 18 | Get connection and load the env. 
19 | :return: 20 | """ 21 | conn = connect() 22 | return conn 23 | 24 | 25 | def get_video(id): 26 | """ 27 | Get video object 28 | :param id: 29 | :return: 30 | """ 31 | conn = get_connection() 32 | all_videos = conn.get_collection().get_videos() 33 | video = next(vid for vid in all_videos if vid.id == id) 34 | return video 35 | 36 | 37 | def chunk_docs(docs, chunk_size): 38 | """ 39 | chunk docs to fit into context of your LLM 40 | :param docs: 41 | :param chunk_size: 42 | :return: 43 | """ 44 | for i in range(0, len(docs), chunk_size): 45 | yield docs[i : i + chunk_size] # Yield the current chunk 46 | 47 | 48 | def get_result_timestamps( 49 | video, 50 | result, 51 | index_type="scene", 52 | scene_index_id=None, 53 | sort="time", 54 | run_concurrent=True, 55 | ): 56 | """ 57 | This function takes the result from scene_prompter and performs a keyword search on the video. 58 | By default, the function sorts the results by time. 59 | It returns a list of (start, end, description) for the matched segments. 60 | """ 61 | result_timestamps = [] 62 | 63 | def search_description(description): 64 | # keyword search on each result description 65 | if index_type == "scene": 66 | search_res = video.search( 67 | description, 68 | index_type=IndexType.scene, 69 | search_type=SearchType.keyword, 70 | scene_index_id=scene_index_id, 71 | ) 72 | else: 73 | search_res = video.search( 74 | description, 75 | index_type=IndexType.spoken_word, 76 | search_type=SearchType.keyword, 77 | ) 78 | matched_segments = search_res.get_shots() 79 | if len(matched_segments) == 0: 80 | return None # No match found 81 | 82 | video_shot = matched_segments[0] 83 | return (video_shot.start, video_shot.end, video_shot.text) 84 | 85 | if run_concurrent: 86 | with concurrent.futures.ThreadPoolExecutor() as executor: 87 | future_to_desc = { 88 | executor.submit(search_description, desc): desc for desc in result 89 | } 90 | for future in concurrent.futures.as_completed(future_to_desc): 91 | res = future.result() 92 | if res: 93 | result_timestamps.append(res) 94 | else: 95 | for description in result: 96 | res = search_description(description) 97 | if res: 98 | result_timestamps.append(res) 99 | 100 | # Sorting the results if needed 101 | if sort == "time": 102 | result_timestamps.sort(key=lambda x: x[0]) 103 | 104 | return result_timestamps 105 | 106 | 107 | # Creating and returning timeline of given result timestamps 108 | def build_video_timeline( 109 | video, result_timestamps, timeline, top_n=None, max_duration=None, debug=False 110 | ): 111 | """ 112 | This function takes the matched segments list (result_timestamps) and creates a VideoDB Timeline based on the given conditions. 113 | The user can specify top_n to select the top n results. 114 | Additionally, the user can set max_duration to stop adding results to the Timeline if the total duration exceeds this limit. 
115 | """ 116 | duration = 0 117 | if top_n: 118 | existing_count = len(result_timestamps) 119 | result_timestamps = result_timestamps[:top_n] 120 | if debug: 121 | print(f"Picked top {top_n} from {existing_count}") 122 | for result_timestamp in result_timestamps: 123 | start = float(result_timestamp[0]) 124 | end = float(result_timestamp[1]) 125 | description = result_timestamp[2] 126 | if debug: 127 | print(start, end, description) 128 | duration += end - start 129 | if max_duration and duration > max_duration: 130 | duration -= end - start 131 | break 132 | timeline.add_inline(VideoAsset(asset_id=video.id, start=start, end=end)) 133 | return timeline, duration 134 | 135 | 136 | def filter_transcript(transcript, start, end): 137 | result = [] 138 | for entry in transcript: 139 | if float(entry["end"]) > start and float(entry["start"]) < end: 140 | result.append(entry) 141 | return result 142 | 143 | 144 | def get_multimodal_docs(transcript, scenes, club_on="scene"): 145 | # TODO: Implement club on transcript 146 | docs = [] 147 | if club_on == "scene": 148 | for scene in scenes: 149 | spoken_result = filter_transcript( 150 | transcript, float(scene["start"]), float(scene["end"]) 151 | ) 152 | spoken_text = " ".join( 153 | entry["text"] for entry in spoken_result if entry["text"] != "-" 154 | ) 155 | data = { 156 | "visual": scene["description"], 157 | "spoken": spoken_text, 158 | "start": scene["start"], 159 | "end": scene["end"], 160 | } 161 | docs.append(data) 162 | return docs 163 | 164 | 165 | def send_msg_openai(chunk_prompt, llm=LLM()): 166 | response = llm.chat(message=chunk_prompt) 167 | output = json.loads(response["choices"][0]["message"]["content"]) 168 | sentences = output.get("sentences") 169 | return sentences 170 | 171 | 172 | def send_msg_claude(chunk_prompt, llm): 173 | response = llm.chat(message=chunk_prompt) 174 | # TODO: add claude response parser 175 | return response 176 | 177 | 178 | def send_msg_gemini(chunk_prompt, llm): 179 | response = llm.chat(message=chunk_prompt) 180 | # TODO: add gemini response parser 181 | return response 182 | 183 | 184 | def text_prompter(transcript_text, prompt, llm=None): 185 | chunk_size = 10000 186 | # sentence tokenizer 187 | chunks = chunk_docs(transcript_text, chunk_size=chunk_size) 188 | # print(f"Length of the sentence chunk are {len(chunks)}") 189 | 190 | if llm is None: 191 | llm = LLM() 192 | 193 | # 400 sentence at a time 194 | if llm.type == LLMType.OPENAI: 195 | llm_caller_fn = send_msg_openai 196 | elif llm.type == LLMType.GEMINI: 197 | llm_caller_fn = send_msg_gemini 198 | else: 199 | # claude for now 200 | llm_caller_fn = send_msg_claude 201 | 202 | matches = [] 203 | prompts = [] 204 | i = 0 205 | for chunk in chunks: 206 | chunk_prompt = """ 207 | You are a video editor who uses AI. Given a user prompt and the transcript of a video, analyze the text to identify sentences in the transcript relevant to the user prompt for making clips. 208 | - **Instructions**: 209 | - Evaluate the sentences for relevance to the specified user prompt. 210 | - Make sure that sentences start and end properly and meaningfully complete the discussion or topic. Choose the ones with the greatest relevance and length. 211 | - We'll use these sentences to make video clips later, so optimize for a great viewing experience for people watching the resulting clips. 212 | - If the matched sentences are not too far apart, merge them into one sentence. 213 | - Strictly make each result a minimum of 20 words long. 
If the match is smaller, adjust the boundaries and add more context around the sentences. 214 | 215 | - **Output Format**: Return a JSON object with a list of strings named 'sentences' that contains the output sentences; make sure they are exact substrings. 216 | - **User Prompts**: User prompts may include requests like 'find funny moments' or 'find moments for social media'. Interpret these prompts by 217 | identifying keywords or themes in the transcript that match the intent of the prompt. 218 | """ 219 | 220 | # pass the data 221 | chunk_prompt += f""" 222 | Transcript: {chunk} 223 | User Prompt: {prompt} 224 | """ 225 | 226 | # Add instructions to always return JSON at the end of processing. 227 | chunk_prompt += """ 228 | Ensure the final output strictly adheres to the JSON format specified without including additional text or explanations. \ 229 | If there is no match, return an empty list without additional text. Use the following structure for your response: 230 | { 231 | "sentences": [ 232 | {}, 233 | ... 234 | ] 235 | } 236 | """ 237 | prompts.append(chunk_prompt) 238 | i += 1 239 | 240 | # make a parallel call to all chunks with prompts 241 | with concurrent.futures.ThreadPoolExecutor() as executor: 242 | future_to_index = { 243 | executor.submit(llm_caller_fn, prompt, llm): prompt for prompt in prompts 244 | } 245 | for future in concurrent.futures.as_completed(future_to_index): 246 | try: 247 | matches.extend(future.result()) 248 | except Exception as e: 249 | print(f"Chunk failed to work with LLM {str(e)}") 250 | return matches 251 | 252 | 253 | def scene_prompter(transcript_text, prompt, llm=None, run_concurrent=True): 254 | chunk_size = 100 255 | chunks = chunk_docs(transcript_text, chunk_size=chunk_size) 256 | 257 | llm_caller_fn = send_msg_openai 258 | if llm is None: 259 | llm = LLM() 260 | 261 | # TODO: llm should have caller function 262 | # 400 sentence at a time 263 | if llm.type == LLMType.OPENAI: 264 | llm_caller_fn = send_msg_openai 265 | else: 266 | # claude for now 267 | llm_caller_fn = send_msg_claude 268 | 269 | matches = [] 270 | prompts = [] 271 | i = 0 272 | 273 | for chunk in chunks: 274 | descriptions = [scene["description"] for scene in chunk] 275 | chunk_prompt = """ 276 | You are a video editor who uses AI. Given a user prompt and AI-generated scene descriptions of a video, analyze the descriptions to identify segments relevant to the user prompt for creating clips. 277 | 278 | - **Instructions**: 279 | - Evaluate the scene descriptions for relevance to the specified user prompt. 280 | - Choose descriptions with the highest relevance and the most comprehensive content. 281 | - Optimize for engaging viewing experiences, considering visual appeal and narrative coherence. 282 | 283 | - User Prompts: Interpret prompts like 'find exciting moments' or 'identify key plot points' by matching keywords or themes in the scene descriptions to the intent of the prompt. 284 | """ 285 | 286 | chunk_prompt += f""" 287 | Descriptions: {json.dumps(descriptions)} 288 | User Prompt: {prompt} 289 | """ 290 | 291 | chunk_prompt += """ 292 | **Output Format**: Return a JSON object with a list of strings named 'sentences' that contains the matched descriptions. Ensure the final output 293 | strictly adheres to the JSON format specified without including additional text or explanations. \ 294 | If there is no match, return an empty list without additional text. 
Use the following structure for your response: 295 | {"sentences": []} 296 | """ 297 | prompts.append(chunk_prompt) 298 | i += 1 299 | 300 | if run_concurrent: 301 | with concurrent.futures.ThreadPoolExecutor() as executor: 302 | future_to_index = { 303 | executor.submit(llm_caller_fn, prompt, llm): prompt 304 | for prompt in prompts 305 | } 306 | for future in concurrent.futures.as_completed(future_to_index): 307 | try: 308 | matches.extend(future.result()) 309 | except Exception as e: 310 | print(f"Chunk failed to work with LLM {str(e)}") 311 | else: 312 | for prompt in prompts: 313 | try: 314 | res = llm_caller_fn(prompt, llm) 315 | matches.extend(res) 316 | except Exception as e: 317 | print(f"Chunk failed to work with LLM {str(e)}") 318 | return matches 319 | 320 | 321 | def multimodal_prompter(transcript, scene_index, prompt, llm=None, run_concurrent=True): 322 | docs = get_multimodal_docs(transcript, scene_index) 323 | chunk_size = 80 324 | chunks = chunk_docs(docs, chunk_size=chunk_size) 325 | 326 | if llm is None: 327 | llm = LLM() 328 | 329 | if llm.type == LLMType.OPENAI: 330 | llm_caller_fn = send_msg_openai 331 | else: 332 | llm_caller_fn = send_msg_claude 333 | 334 | matches = [] 335 | prompts = [] 336 | i = 0 337 | for chunk in chunks: 338 | chunk_prompt = f""" 339 | You are given visual and spoken information for each second of the video, along with a transcript of what's being spoken and its timestamps. 340 | Your task is to evaluate the data for relevance to the specified user prompt. 341 | Correlate visual and spoken content to find the relevant video segment. 342 | 343 | Multimodal Data: 344 | video: {chunk} 345 | User Prompt: {prompt} 346 | 347 | 348 | """ 349 | chunk_prompt += """ 350 | **Output Format**: Return a JSON object with a list of strings named 'sentences' that contains the matched visual descriptions. 351 | Each sentence must come from the visual section of the input. 352 | Ensure the final output strictly adheres to the JSON format specified without including additional text or explanations. 353 | If there is no match, return an empty list without additional text. Use the following structure for your response: 354 | {"sentences": []} 355 | """ 356 | prompts.append(chunk_prompt) 357 | i += 1 358 | 359 | if run_concurrent: 360 | with concurrent.futures.ThreadPoolExecutor() as executor: 361 | future_to_index = { 362 | executor.submit(llm_caller_fn, prompt, llm): prompt 363 | for prompt in prompts 364 | } 365 | for future in concurrent.futures.as_completed(future_to_index): 366 | try: 367 | matches.extend(future.result()) 368 | except Exception as e: 369 | print(f"Chunk failed to work with LLM {str(e)}") 370 | else: 371 | for prompt in prompts: 372 | try: 373 | res = llm_caller_fn(prompt, llm) 374 | matches.extend(res) 375 | except Exception as e: 376 | import traceback 377 | 378 | traceback.print_exc() 379 | print(f"Chunk failed to work with LLM {str(e)}") 380 | return matches 381 | --------------------------------------------------------------------------------
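
A minimal usage sketch (not a file from the repository): the helpers in `video_prompter.py` are normally driven from the notebooks, but the snippet below shows how `text_prompter`, `get_result_timestamps`, and `build_video_timeline` could plausibly be wired together for a spoken-word clip. It assumes a populated `.env` (`VIDEO_DB_API_KEY`, `OPENAI_API_KEY`), a video whose spoken words have already been indexed, and that `Timeline` and `generate_stream` behave as in the VideoDB SDK; the video id and user prompt are placeholders.

```python
from videodb.timeline import Timeline  # assumed SDK import, alongside VideoAsset used above

from video_prompter import (
    build_video_timeline,
    get_connection,
    get_result_timestamps,
    get_video,
    text_prompter,
)

conn = get_connection()
video = get_video("YOUR_VIDEO_ID")  # placeholder: an uploaded, spoken-word-indexed video

# Ask the LLM which transcript sentences match the creative prompt.
transcript_text = video.get_transcript_text()
sentences = text_prompter(transcript_text, "find the funniest moments")

# Map each returned sentence back to (start, end, text) segments via keyword search.
segments = get_result_timestamps(video, sentences, index_type="spoken_word")

# Stitch the matched segments into a timeline, capped at roughly 60 seconds.
timeline = Timeline(conn)
timeline, duration = build_video_timeline(video, segments, timeline, max_duration=60)
print(f"Built a ~{duration:.0f}s clip:", timeline.generate_stream())
```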