├── .gitignore ├── .pre-commit-config.yaml ├── Makefile ├── README.md ├── assets ├── example-slide.jpeg └── financial-presentations │ ├── sigifirstquarter2024inve001.jpg │ ├── sigifirstquarter2024inve002.jpg │ ├── sigifirstquarter2024inve003.jpg │ ├── sigifirstquarter2024inve004.jpg │ ├── sigifirstquarter2024inve005.jpg │ ├── sigifirstquarter2024inve006.jpg │ ├── sigifirstquarter2024inve007.jpg │ ├── sigifirstquarter2024inve008.jpg │ ├── sigifirstquarter2024inve009.jpg │ ├── sigifirstquarter2024inve010.jpg │ ├── sigifirstquarter2024inve011.jpg │ ├── sigifirstquarter2024inve012.jpg │ ├── sigifirstquarter2024inve013.jpg │ ├── sigifirstquarter2024inve014.jpg │ ├── sigifirstquarter2024inve015.jpg │ ├── sigifirstquarter2024inve016.jpg │ ├── sigifirstquarter2024inve017.jpg │ ├── sigifirstquarter2024inve018.jpg │ ├── sigifirstquarter2024inve019.jpg │ ├── sigifirstquarter2024inve020.jpg │ ├── sigifirstquarter2024inve021.jpg │ ├── sigifirstquarter2024inve022.jpg │ ├── sigifirstquarter2024inve023.jpg │ ├── sigifirstquarter2024inve024.jpg │ ├── sigifirstquarter2024inve025.jpg │ ├── sigifirstquarter2024inve026.jpg │ ├── sigifirstquarter2024inve027.jpg │ ├── sigifirstquarter2024inve028.jpg │ ├── sigifirstquarter2024inve029.jpg │ └── sigifirstquarter2024inve030.jpg └── notebooks ├── 00_quickstart.ipynb ├── 01_schema_showcase.ipynb ├── 02_case_study_drivers_license.ipynb ├── 03_case_study_tv_news.ipynb ├── 04_visual_grounding.ipynb ├── 05_case_study_image_catalogue.ipynb ├── 06_fashion_images_hybrid_search.ipynb ├── 07_generate_schema.ipynb ├── 08_document_markdown.ipynb ├── advanced_finetuning_video_inference.ipynb └── advanced_video_transcription.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .env.dev 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
163 | #.idea/ 164 | /helpers 165 | *.env.* 166 | .DS_Store 167 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autofix_prs: true 3 | autoupdate_commit_msg: '[pre-commit.ci] pre-commit autofix suggestions' 4 | 5 | repos: 6 | - repo: https://github.com/charliermarsh/ruff-pre-commit 7 | rev: 'v0.0.262' 8 | hooks: 9 | - id: ruff 10 | args: ['--fix', '--exit-non-zero-on-fix'] 11 | 12 | - repo: https://github.com/psf/black 13 | rev: 22.3.0 14 | hooks: 15 | - id: black 16 | exclude: examples|^tests/test_data$ 17 | args: [--config=./pyproject.toml] 18 | 19 | - repo: https://github.com/pre-commit/pre-commit-hooks 20 | rev: v3.1.0 21 | hooks: 22 | - id: check-ast 23 | - id: check-docstring-first 24 | - id: check-merge-conflict 25 | - id: debug-statements 26 | - id: detect-private-key 27 | - id: end-of-file-fixer 28 | exclude: ^vlm_tools/tests/|^examples 29 | - id: pretty-format-json 30 | exclude: ^vlm_tools/tests/|^docs|^examples 31 | - id: trailing-whitespace 32 | exclude: ^examples 33 | - id: check-added-large-files 34 | args: ['--maxkb=100'] 35 | exclude: ^vlm_tools/tests|^examples|^assets 36 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: lint 2 | 3 | lint: 4 | pre-commit run --all-files # Uses pyproject.toml -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |

3 | VLM Run Logo
4 |

5 |

VLM Run Cookbook

6 |

7 | Website | Platform | Hub | Docs | Blog | Discord 8 |

9 |

10 | Discord 11 | Twitter Follow 12 |

13 |
14 |
15 | 16 | Welcome to **[VLM Run](https://vlm.run) Cookbook**, a comprehensive collection of examples and notebooks demonstrating the power of structured visual understanding using the [VLM Run Platform](https://app.vlm.run). This repository hosts practical examples and tutorials for extracting structured data from images, videos, and documents using Vision Language Models (VLMs). 17 | 18 | 19 | ### 💡 Why Use This Cookbook? 20 | --- 21 | 22 | - 📚 **Practical Examples**: A comprehensive collection of Colab notebooks demonstrating real-world applications of VLM Run. 23 | - 🔋 **Ready-to-Use**: Each example comes with complete code and documentation, making it easy to adapt for your use case. 24 | - 🎯 **Domain-Specific**: Examples cover various domains from financial documents to TV news analysis. 25 | 26 | ### 📖 Cookbook Notebooks 27 | --- 28 | 29 | Our collection of Colab notebooks demonstrates various use cases and integrations: 30 | | Name | Type | Colab | Last Updated | 31 | |:---|:---|:---:|:---:| 32 | | [API Quickstart](./notebooks/00_quickstart.ipynb) | | [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vlm-run/vlmrun-cookbook/blob/main/notebooks/00_quickstart.ipynb) | 02-08-2025 | 33 | | [Schema Showcase](./notebooks/01_schema_showcase.ipynb) | feature | [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vlm-run/vlmrun-cookbook/blob/main/notebooks/01_schema_showcase.ipynb) | 02-08-2025 | 34 | | [Visual Grounding](./notebooks/04_visual_grounding.ipynb) | feature | [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vlm-run/vlmrun-cookbook/blob/main/notebooks/04_visual_grounding.ipynb) | 02-18-2025 | 35 | | [Long-form Video Transcription](./notebooks/advanced_video_transcription.ipynb) | feature | [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vlm-run/vlmrun-cookbook/blob/main/notebooks/advanced_video_transcription.ipynb) | 03-13-2025 | 36 | | [Video Inference (Fine-Tuning)](./notebooks/advanced_finetuning_video_inference.ipynb) | feature | [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vlm-run/vlmrun-cookbook/blob/main/notebooks/advanced_finetuning_video_inference.ipynb) | 02-18-2025 | 37 | | [US Drivers License](./notebooks/02_case_study_drivers_license.ipynb) | application | [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vlm-run/vlmrun-cookbook/blob/main/notebooks/02_case_study_drivers_license.ipynb) | 02-08-2025 | 38 | | [Parsing Financial Presentations](https://colab.research.google.com/drive/15_iRDucKj2I33p3m5X3ULdXby_DHWgjS) | application | [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/15_iRDucKj2I33p3m5X3ULdXby_DHWgjS) | 02-04-2025 | 39 | | [TV News Analysis](./notebooks/03_case_study_tv_news.ipynb) | application | [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vlm-run/vlmrun-cookbook/blob/main/notebooks/03_case_study_tv_news.ipynb) | 02-15-2025 | 40 | | [Fashion Product Catalog](./notebooks/05_case_study_image_catalogue.ipynb) | application | 
[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vlm-run/vlmrun-cookbook/blob/main/notebooks/05_case_study_image_catalogue.ipynb) | 02-20-2025 | 41 | | [Fashion Images Hybrid Search](./notebooks/06_fashion_images_hybrid_search.ipynb) | application | [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vlm-run/vlmrun-cookbook/blob/main/notebooks/06_fashion_images_hybrid_search.ipynb) | 02-21-2025 | 42 | | [Generate Custom Schema](./notebooks/07_generate_schema.ipynb) | feature | [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vlm-run/vlmrun-cookbook/blob/main/notebooks/07_generate_schema.ipynb) | 03-13-2025 | 43 | | [Document Markdown Extraction](./notebooks/08_document_markdown.ipynb) | feature | [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vlm-run/vlmrun-cookbook/blob/main/notebooks/08_document_markdown.ipynb) | 06-02-2025 | 44 | 45 | 46 | 47 | 48 | ### 🔗 Quick Links 49 | --- 50 | 51 | * 💬 Send us an email at [support@vlm.run](mailto:support@vlm.run) or join our [Discord](https://discord.gg/4jgyECY4rq) for help 52 | * 📣 Follow us on [Twitter](https://twitter.com/vlmrun) and [LinkedIn](https://www.linkedin.com/company/vlm-run) to keep up-to-date on our products 53 | * 📚 Check out our [Documentation](https://docs.vlm.run/) for detailed guides and API reference 54 | -------------------------------------------------------------------------------- /assets/example-slide.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/example-slide.jpeg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve001.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve002.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve003.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve003.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve004.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve005.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve005.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve006.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve006.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve007.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve007.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve008.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve008.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve009.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve009.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve010.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve010.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve011.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve011.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve012.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve012.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve013.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve013.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve014.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve014.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve015.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve015.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve016.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve016.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve017.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve017.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve018.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve018.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve019.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve019.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve020.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve020.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve021.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve021.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve022.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve022.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve023.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve023.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve024.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve024.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve025.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve025.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve026.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve026.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve027.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve027.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve028.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve028.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve029.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve029.jpg -------------------------------------------------------------------------------- /assets/financial-presentations/sigifirstquarter2024inve030.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlm-run/vlmrun-cookbook/14e80f003862e59fd63bb78ed39453488736fdff/assets/financial-presentations/sigifirstquarter2024inve030.jpg -------------------------------------------------------------------------------- /notebooks/05_case_study_image_catalogue.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "92259041-2e36-463b-9db9-faba98c463dd", 6 | "metadata": {}, 7 | "source": [ 8 | "
\n", 9 | "

\n", 10 | " \"VLM
\n", 11 | "

\n", 12 | "

Website | API Docs | Blog | Discord\n", 13 | "

\n", 14 | "

\n", 15 | "\"Discord\"\n", 16 | "\"Twitter\n", 17 | "

\n", 18 | "
\n", 19 | "\n", 20 | "Welcome to **[VLM Run Cookbooks](https://github.com/vlm-run/vlmrun-cookbook)**, a comprehensive collection of examples and notebooks demonstrating the power of structured visual understanding using the [VLM Run Platform](https://app.vlm.run). " 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "97793ab4-6fd5-4f9e-ab5c-b1800f8b176d", 26 | "metadata": {}, 27 | "source": [ 28 | "## Case Study: Fashion Product Catalogue Analysis\n", 29 | "\n", 30 | "This notebook demonstrates how to use VLM Run to analyze fashion product images and extract structured information including:\n", 31 | "- Detailed product descriptions\n", 32 | "- Product categories\n", 33 | "- Target gender\n", 34 | "- Seasonal classification\n", 35 | "\n", 36 | "We'll use a sample dataset from the Fashion Product Images dataset to showcase VLM Run's retail product analysis capabilities." 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "86b18b40-c859-4183-bcc0-74c6971ae45a", 42 | "metadata": {}, 43 | "source": [ 44 | "### Environment Setup\n", 45 | "\n", 46 | "To get started, install the VLM Run Python SDK and sign-up for an API key on the [VLM Run App](https://app.vlm.run).\n", 47 | "- Store the VLM Run API key under the `VLMRUN_API_KEY` environment variable." 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "b1989a6d-c94d-4243-af69-9c7b0736e058", 53 | "metadata": {}, 54 | "source": [ 55 | "## Prerequisites\n", 56 | "\n", 57 | "* Python 3.9+\n", 58 | "* VLM Run API key (get one at [app.vlm.run](https://app.vlm.run))" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "id": "96425ef8-ee2c-4d03-adae-b88645f25e89", 64 | "metadata": {}, 65 | "source": [ 66 | "## Setup\n", 67 | "\n", 68 | "First, let's install the required packages:" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 1, 74 | "id": "450da6bf-c72d-4366-9d65-9fa5fd744224", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "! pip install vlmrun --upgrade --quiet\n", 79 | "! pip install vlmrun-hub --upgrade --quiet" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 5, 85 | "id": "11314850-6fb9-422c-a015-9675cd35630c", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "! pip install datasets --quiet" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 2, 95 | "id": "5a7b057d-569e-421b-99d3-8fd18b9938d1", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "import os\n", 100 | "import getpass\n", 101 | "\n", 102 | "VLMRUN_BASE_URL = os.getenv(\"VLMRUN_BASE_URL\", \"https://api.vlm.run/v1\")\n", 103 | "VLMRUN_API_KEY = os.getenv(\"VLMRUN_API_KEY\", None)\n", 104 | "if VLMRUN_API_KEY is None:\n", 105 | " VLMRUN_API_KEY = getpass.getpass()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "id": "9a7510c8-d302-4079-b97e-2b1b0d1730e3", 111 | "metadata": {}, 112 | "source": [ 113 | "Let's initialize the VLM Run Client" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 3, 119 | "id": "72a99ce4-2588-4f14-8983-566a7f6b3ec6", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "from vlmrun.client import VLMRun\n", 124 | "\n", 125 | "vlm_client = VLMRun(base_url=VLMRUN_BASE_URL, api_key=VLMRUN_API_KEY)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "id": "257f8706", 131 | "metadata": {}, 132 | "source": [ 133 | "### Load the dataset\n", 134 | "\n", 135 | "Let's load the first 1% of the fashion dataset, and visualize the dataset." 
136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 19, 141 | "id": "5c63baa5-99ca-48b9-b8b9-73f51d1beee3", 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "Loading fashion dataset...\n", 149 | "Loaded 10 images successfully\n" 150 | ] 151 | }, 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "Dataset({\n", 156 | "    features: ['id', 'gender', 'masterCategory', 'subCategory', 'articleType', 'baseColour', 'season', 'year', 'usage', 'productDisplayName', 'image'],\n", 157 | "    num_rows: 10\n", 158 | "})" 159 | ] 160 | }, 161 | "execution_count": 19, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "from datasets import load_dataset\n", 168 | "\n", 169 | "\n", 170 | "print(\"Loading fashion dataset...\")\n", 171 | "ds = load_dataset(\"ashraq/fashion-product-images-small\", split=\"train[:10]\")\n", 172 | "print(f\"Loaded {len(ds)} images successfully\")\n", 173 | "ds" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "efb7d11d-58bc-4675-ae1e-879d37ae5fa8", 179 | "metadata": {}, 180 | "source": [ 181 | "Let's define a utility function for visualization" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 33, 187 | "id": "b0b57a57", 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/html": [ 193 | "<div>
\n", 194 | "\n", 207 | "\n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | "
imageproductDisplayNamegendermasterCategoryseason
0<PIL.JpegImagePlugin.JpegImageFile image mode=...Turtle Check Men Navy Blue ShirtMenApparelFall
1<PIL.JpegImagePlugin.JpegImageFile image mode=...Peter England Men Party Blue JeansMenApparelSummer
2<PIL.Image.Image image mode=L size=60x80 at 0x...Titan Women Silver WatchWomenAccessoriesWinter
3<PIL.JpegImagePlugin.JpegImageFile image mode=...Manchester United Men Solid Black Track PantsMenApparelFall
4<PIL.Image.Image image mode=RGB size=60x80 at ...Puma Men Grey T-shirtMenApparelSummer
5<PIL.JpegImagePlugin.JpegImageFile image mode=...Inkfruit Mens Chain Reaction T-shirtMenApparelSummer
6<PIL.JpegImagePlugin.JpegImageFile image mode=...Fabindia Men Striped Green ShirtMenApparelSummer
7<PIL.Image.Image image mode=RGB size=60x80 at ...Jealous 21 Women Purple ShirtWomenApparelSummer
8<PIL.Image.Image image mode=RGB size=60x80 at ...Puma Men Pack of 3 SocksMenAccessoriesSummer
9<PIL.Image.Image image mode=RGB size=60x80 at ...Skagen Men Black WatchMenAccessoriesWinter
\n", 301 | "
" 302 | ], 303 | "text/plain": [ 304 | " image \\\n", 305 | "0 JSON with VLM Run\n", 350 | "\n", 351 | "\n", 352 | "Now, let's call the VLM Run API to get the predictions for the dataframe using the `vlm_client.image.generate` method. In this example, we'll use the `retail.product-catalog` domain to get the predictions." 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 38, 358 | "id": "4b53facc", 359 | "metadata": {}, 360 | "outputs": [ 361 | { 362 | "data": { 363 | "text/html": [ 364 | "
\n", 365 | "\n", 378 | "\n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | "
imageproductDisplayNamegendermasterCategoryseasonresponse
0<PIL.JpegImagePlugin.JpegImageFile image mode=...Turtle Check Men Navy Blue ShirtMenApparelFalldescription='A young man wearing a checkered l...
1<PIL.JpegImagePlugin.JpegImageFile image mode=...Peter England Men Party Blue JeansMenApparelSummerdescription='A pair of blue jeans' category='c...
2<PIL.Image.Image image mode=L size=60x80 at 0x...Titan Women Silver WatchWomenAccessoriesWinterdescription='A stylish silver wristwatch with ...
3<PIL.JpegImagePlugin.JpegImageFile image mode=...Manchester United Men Solid Black Track PantsMenApparelFalldescription='Black athletic jogging pants with...
4<PIL.Image.Image image mode=RGB size=60x80 at ...Puma Men Grey T-shirtMenApparelSummerdescription=\"Men's grey short-sleeve polo shir...
5<PIL.JpegImagePlugin.JpegImageFile image mode=...Inkfruit Mens Chain Reaction T-shirtMenApparelSummerdescription=\"Men's Grey T-shirt with print des...
6<PIL.JpegImagePlugin.JpegImageFile image mode=...Fabindia Men Striped Green ShirtMenApparelSummerdescription='Green long-sleeve shirt' category...
7<PIL.Image.Image image mode=RGB size=60x80 at ...Jealous 21 Women Purple ShirtWomenApparelSummerdescription=\"Stylish maroon women's short slee...
8<PIL.Image.Image image mode=RGB size=60x80 at ...Puma Men Pack of 3 SocksMenAccessoriesSummerdescription='Puma black socks with white strip...
9<PIL.Image.Image image mode=RGB size=60x80 at ...Skagen Men Black WatchMenAccessoriesWinterdescription='Black wristwatch with a leather s...
\n", 483 | "
" 484 | ], 485 | "text/plain": [ 486 | " image \\\n", 487 | "0 \n", 559 | "\n", 572 | "\n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | "
imagedescriptioncategoryseasongender
0<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=60x80 at 0x1479A4E80>A young man wearing a checkered long-sleeve shirt. The shirt features a blend of blue and white colors.apparelSeason.springGender.men
1<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=60x80 at 0x1479A75B0>A pair of blue jeansclothingSeason.fallGender.men
2<PIL.Image.Image image mode=L size=60x80 at 0x1479BA080>A stylish silver wristwatch with a minimalist design, featuring a round dial and a silver bracelet strap.accessoriesSeason.fallGender.women
3<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=60x80 at 0x1479A5300>Black athletic jogging pants with a logo on the left thigh, worn by a person in a red shirt.pantsSeason.fallGender.men
4<PIL.Image.Image image mode=RGB size=60x80 at 0x1479B9C00>Men's grey short-sleeve polo shirtshirtSeason.summerGender.men
5<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=60x80 at 0x1479A5AB0>Men's Grey T-shirt with print designT-shirtSeason.summerGender.men
6<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=60x80 at 0x1479A7010>Green long-sleeve shirtShirtsSeason.fallGender.men
7<PIL.Image.Image image mode=RGB size=60x80 at 0x150C78340>Stylish maroon women's short sleeve blousetopwearSeason.springGender.women
8<PIL.Image.Image image mode=RGB size=60x80 at 0x150C78640>Puma black socks with white stripessocksSeason.fallGender.men
9<PIL.Image.Image image mode=RGB size=60x80 at 0x150C78880>Black wristwatch with a leather strap and minimalist design.watchesSeason.fallGender.men
\n", 666 | "" 667 | ], 668 | "text/plain": [ 669 | " image \\\n", 670 | "0 \n", 671 | "1 \n", 672 | "2 \n", 673 | "3 \n", 674 | "4 \n", 675 | "5 \n", 676 | "6 \n", 677 | "7 \n", 678 | "8 \n", 679 | "9 \n", 680 | "\n", 681 | " description \\\n", 682 | "0 A young man wearing a checkered long-sleeve shirt. The shirt features a blend of blue and white colors. \n", 683 | "1 A pair of blue jeans \n", 684 | "2 A stylish silver wristwatch with a minimalist design, featuring a round dial and a silver bracelet strap. \n", 685 | "3 Black athletic jogging pants with a logo on the left thigh, worn by a person in a red shirt. \n", 686 | "4 Men's grey short-sleeve polo shirt \n", 687 | "5 Men's Grey T-shirt with print design \n", 688 | "6 Green long-sleeve shirt \n", 689 | "7 Stylish maroon women's short sleeve blouse \n", 690 | "8 Puma black socks with white stripes \n", 691 | "9 Black wristwatch with a leather strap and minimalist design. \n", 692 | "\n", 693 | " category season gender \n", 694 | "0 apparel Season.spring Gender.men \n", 695 | "1 clothing Season.fall Gender.men \n", 696 | "2 accessories Season.fall Gender.women \n", 697 | "3 pants Season.fall Gender.men \n", 698 | "4 shirt Season.summer Gender.men \n", 699 | "5 T-shirt Season.summer Gender.men \n", 700 | "6 Shirts Season.fall Gender.men \n", 701 | "7 topwear Season.spring Gender.women \n", 702 | "8 socks Season.fall Gender.men \n", 703 | "9 watches Season.fall Gender.men " 704 | ] 705 | }, 706 | "execution_count": 61, 707 | "metadata": {}, 708 | "output_type": "execute_result" 709 | } 710 | ], 711 | "source": [ 712 | "# Let's explode the response column into a dataframe and display the predictions dataframe\n", 713 | "pd.set_option(\"display.max_colwidth\", None)\n", 714 | "df_response_json = df_response[\"response\"].apply(lambda x: x.model_dump() if x is not None else None)\n", 715 | "df_response_json = pd.concat([df_response[[\"image\"]], pd.json_normalize(df_response_json)], axis=1)\n", 716 | "df_response_json\n" 717 | ] 718 | }, 719 | { 720 | "cell_type": "markdown", 721 | "id": "182be362", 722 | "metadata": {}, 723 | "source": [ 724 | "Now, let's define a few utilities for rendering the dataframe with images / tags." 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": 59, 730 | "id": "b67cd1ea", 731 | "metadata": {}, 732 | "outputs": [], 733 | "source": [ 734 | "from IPython.display import HTML, display\n", 735 | "from vlmrun.common.image import encode_image\n", 736 | "\n", 737 | "style = \"\"\"\n", 738 | "\n", 748 | "\"\"\"\n", 749 | "\n", 750 | "formatters = {\n", 751 | " \"image\": lambda x: f\"\",\n", 752 | " \"category\": lambda x: f\"{x}\",\n", 753 | " \"season\": lambda x: f\"{x}\",\n", 754 | " \"gender\": lambda x: f\"{x}\",\n", 755 | "}" 756 | ] 757 | }, 758 | { 759 | "cell_type": "markdown", 760 | "id": "352131c2", 761 | "metadata": {}, 762 | "source": [ 763 | "Display the dataframe with images and VLM Run predictions." 
764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": 62, 769 | "id": "7f59c302-7a55-48f6-b788-06e17ade0e89", 770 | "metadata": {}, 771 | "outputs": [ 772 | { 773 | "data": { 774 | "text/html": [ 775 | "\n", 776 | "\n", 786 | "\n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | "
imagedescriptioncategoryseasongender
0A young man wearing a checkered long-sleeve shirt. The shirt features a blend of blue and white colors.apparelSeason.springGender.men
1A pair of blue jeansclothingSeason.fallGender.men
2A stylish silver wristwatch with a minimalist design, featuring a round dial and a silver bracelet strap.accessoriesSeason.fallGender.women
3Black athletic jogging pants with a logo on the left thigh, worn by a person in a red shirt.pantsSeason.fallGender.men
4Men's grey short-sleeve polo shirtshirtSeason.summerGender.men
5Men's Grey T-shirt with print designT-shirtSeason.summerGender.men
6Green long-sleeve shirtShirtsSeason.fallGender.men
7Stylish maroon women's short sleeve blousetopwearSeason.springGender.women
8Puma black socks with white stripessocksSeason.fallGender.men
9Black wristwatch with a leather strap and minimalist design.watchesSeason.fallGender.men
" 880 | ], 881 | "text/plain": [ 882 | "" 883 | ] 884 | }, 885 | "metadata": {}, 886 | "output_type": "display_data" 887 | } 888 | ], 889 | "source": [ 890 | "display(HTML(style + df_response_json.to_html(formatters=formatters, escape=False)))" 891 | ] 892 | }, 893 | { 894 | "cell_type": "markdown", 895 | "id": "6be95cbf-6273-47d4-b242-26a78473549e", 896 | "metadata": {}, 897 | "source": [ 898 | "## Additional Resources\n", 899 | "- [VLM Run Documentation](https://docs.vlm.run)\n", 900 | "- [API Reference](https://docs.vlm.run/)\n", 901 | "- [More Examples](https://github.com/vlm-run/vlmrun-cookbook)\n", 902 | "- [Fashion Dataset](https://huggingface.co/datasets/ashraq/fashion-product-images-small)" 903 | ] 904 | } 905 | ], 906 | "metadata": { 907 | "kernelspec": { 908 | "display_name": "vlm-cookbook", 909 | "language": "python", 910 | "name": "python3" 911 | }, 912 | "language_info": { 913 | "codemirror_mode": { 914 | "name": "ipython", 915 | "version": 3 916 | }, 917 | "file_extension": ".py", 918 | "mimetype": "text/x-python", 919 | "name": "python", 920 | "nbconvert_exporter": "python", 921 | "pygments_lexer": "ipython3", 922 | "version": "3.10.16" 923 | } 924 | }, 925 | "nbformat": 4, 926 | "nbformat_minor": 5 927 | } 928 | -------------------------------------------------------------------------------- /notebooks/advanced_video_transcription.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "ngFBpdKlwtbq" 7 | }, 8 | "source": [ 9 | "
\n", 10 | "

\n", 11 | " \"VLM
\n", 12 | "

\n", 13 | "

Website | API Docs | Blog | Discord\n", 14 | "

\n", 15 | "

\n", 16 | "\"Discord\"\n", 17 | "\"Twitter\n", 18 | "

\n", 19 | "
\n", 20 | "\n", 21 | "Welcome to **[VLM Run Cookbooks](https://github.com/vlm-run/vlmrun-cookbook)**, a comprehensive collection of examples and notebooks demonstrating the power of structured visual understanding using the [VLM Run Platform](https://app.vlm.run). " 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Breaking the 8000 Token Barrier: Long-form visual transcription with VLM Run\n", 29 | "\n", 30 | "VLM Run is pioneering an API designed for video understanding at scale, capable of processing long-form content such as keynotes or films in a single request without partitioning. This capability extends beyond the typical 8192 output token limit found in many APIs, allowing for comprehensive visual transcription that includes detailed descriptions of both audio and visual elements in the video.\n", 31 | "\n", 32 | "This notebook demonstrates how to extract both audio transcripts and visual scene descriptions from video content using VLM Run's advanced video transcription capabilities." 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "id": "Rk9qtMAqwtbr" 39 | }, 40 | "source": [ 41 | "### Environment Setup\n", 42 | "\n", 43 | "To get started, install the VLM Run Python SDK and sign-up for an API key on the [VLM Run App](https://app.vlm.run).\n", 44 | "- Store the VLM Run API key under the `VLM_RUN_API_KEY` environment variable." 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "### Prerequisites\n", 52 | "\n", 53 | "* Python 3.9+\n", 54 | "* VLM Run API key (get one at [app.vlm.run](https://app.vlm.run))" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Setup\n", 62 | "\n", 63 | "First, let's install the required packages:" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 1, 69 | "metadata": { 70 | "id": "FK_MJ9H9wtbr" 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "! pip install \"vlmrun[all]\" --quiet\n", 75 | "! 
pip install yt-dlp --quiet" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": { 81 | "id": "E7DD4PlPwtbr" 82 | }, 83 | "source": [ 84 | "## Configure VLM Run" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 2, 90 | "metadata": { 91 | "id": "-0QUC2Jfwtbr", 92 | "outputId": "7fd0fb12-32d9-452e-dbec-eb4d1a9c8d4e" 93 | }, 94 | "outputs": [ 95 | { 96 | "name": "stdin", 97 | "output_type": "stream", 98 | "text": [ 99 | " ········\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "import os\n", 105 | "import getpass\n", 106 | "\n", 107 | "VLMRUN_BASE_URL = os.getenv(\"VLMRUN_BASE_URL\", \"https://api.vlm.run/v1\")\n", 108 | "VLMRUN_API_KEY = os.getenv(\"VLMRUN_API_KEY\", None)\n", 109 | "if VLMRUN_API_KEY is None:\n", 110 | " VLMRUN_API_KEY = getpass.getpass()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 3, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "from vlmrun.client import VLMRun\n", 120 | "\n", 121 | "client = VLMRun(base_url=VLMRUN_BASE_URL, api_key=VLMRUN_API_KEY)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "### Download sample YouTube video\n", 129 | "\n", 130 | "For this example, we're going to be using a sample YouTube video.\n" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 4, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "[youtube] Extracting URL: https://www.youtube.com/watch?v=KxjPgGLVJSg\n", 143 | "[youtube] KxjPgGLVJSg: Downloading webpage\n", 144 | "[youtube] KxjPgGLVJSg: Downloading tv client config\n", 145 | "[youtube] KxjPgGLVJSg: Downloading player 74e4bb46\n", 146 | "[youtube] KxjPgGLVJSg: Downloading tv player API JSON\n", 147 | "[youtube] KxjPgGLVJSg: Downloading ios player API JSON\n", 148 | "[youtube] KxjPgGLVJSg: Downloading m3u8 information\n", 149 | "[info] KxjPgGLVJSg: Downloading 1 format(s): 398+140\n", 150 | "[download] /Users/kaushikbokka/.vlmrun/tmp/KxjPgGLVJSg.mp4 has already been downloaded\n", 151 | "Downloaded video [path=KxjPgGLVJSg.mp4, size=24.85 MB]\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "# Download sample youtube video for transcription purposes\n", 157 | "import yt_dlp\n", 158 | "from vlmrun.constants import VLMRUN_TMP_DIR\n", 159 | "\n", 160 | "URL = \"https://www.youtube.com/watch?v=KxjPgGLVJSg\"\n", 161 | "\n", 162 | "height = 720\n", 163 | "options = {\n", 164 | " \"outtmpl\": str(VLMRUN_TMP_DIR / \"%(id)s.%(ext)s\"),\n", 165 | " \"format\": f\"bestvideo[height<={height}][ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best\",\n", 166 | " \"keepvideo\": True,\n", 167 | "}\n", 168 | "with yt_dlp.YoutubeDL(options) as ydl:\n", 169 | " info = ydl.extract_info(URL, download=True)\n", 170 | " path = VLMRUN_TMP_DIR / f\"{info['id']}.mp4\"\n", 171 | "print(f\"Downloaded video [path={path.name}, size={path.stat().st_size / 1024 / 1024:.2f} MB]\")" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "### Visualize the video" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 5, 184 | "metadata": { 185 | "id": "P72SChjDwtbs", 186 | "outputId": "f34b4a99-1b91-4469-d320-74c808cd41db" 187 | }, 188 | "outputs": [ 189 | { 190 | "name": "stderr", 191 | "output_type": "stream", 192 | "text": [ 193 | "/Users/kaushikbokka/apps/vlm-run/vlmrun-cookbook/venv/lib/python3.9/site-packages/IPython/core/display.py:431: 
UserWarning: Consider using IPython.display.IFrame instead\n", 194 | "  warnings.warn(\"Consider using IPython.display.IFrame instead\")\n" 195 | ] 196 | }, 197 | { 198 | "data": { 199 | "text/html": [ 200 | "<iframe width=\"640\" height=\"360\" src=\"https://www.youtube.com/embed/KxjPgGLVJSg\" frameborder=\"0\" allowfullscreen></iframe>" 201 | ], 202 | "text/plain": [ 203 | "<IPython.core.display.HTML object>" 204 | ] 205 | }, 206 | "metadata": {}, 207 | "output_type": "display_data" 208 | } 209 | ], 210 | "source": [ 211 | "from IPython.display import HTML, display\n", 212 | "\n", 213 | "_, yt_id = URL.split(\"?v=\")\n", 214 | "IFRAME_STR = f'<iframe width=\"640\" height=\"360\" src=\"https://www.youtube.com/embed/{yt_id}\" frameborder=\"0\" allowfullscreen></iframe>'\n", 215 | "\n", 216 | "display(HTML(IFRAME_STR))" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "### Generate structured data from a long-form video\n", 224 | "\n", 225 | "Let's take this 8-minute long video and generate audio and visual transcripts. We take both the audio and video transcripts and segment them into ~20s scenes." 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 11, 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 | "name": "stderr", 235 | "output_type": "stream", 236 | "text": [ 237 | "\u001b[32m2025-03-13 08:45:59.568\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mvlmrun.client.predictions\u001b[0m:\u001b[36m_handle_file_or_url\u001b[0m:\u001b[36m317\u001b[0m - \u001b[34m\u001b[1mUploading file [path=/Users/kaushikbokka/.vlmrun/tmp/KxjPgGLVJSg.mp4, size=24.85 MB] to VLM Run\u001b[0m\n", 238 | "\u001b[32m2025-03-13 08:45:59.571\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mvlmrun.client.files\u001b[0m:\u001b[36mget_cached_file\u001b[0m:\u001b[36m56\u001b[0m - \u001b[34m\u001b[1mComputing md5 hash for file [file=/Users/kaushikbokka/.vlmrun/tmp/KxjPgGLVJSg.mp4]\u001b[0m\n", 239 | "\u001b[32m2025-03-13 08:45:59.634\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mvlmrun.client.files\u001b[0m:\u001b[36mget_cached_file\u001b[0m:\u001b[36m62\u001b[0m - \u001b[34m\u001b[1mComputed md5 hash for file [file=/Users/kaushikbokka/.vlmrun/tmp/KxjPgGLVJSg.mp4, hash=8e8ee35999cc6b6a45a6ed3f9dfac24a]\u001b[0m\n", 240 | "\u001b[32m2025-03-13 08:45:59.635\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mvlmrun.client.files\u001b[0m:\u001b[36mget_cached_file\u001b[0m:\u001b[36m65\u001b[0m - \u001b[34m\u001b[1mChecking if file exists in the database [file=/Users/kaushikbokka/.vlmrun/tmp/KxjPgGLVJSg.mp4, hash=8e8ee35999cc6b6a45a6ed3f9dfac24a]\u001b[0m\n", 241 | "\u001b[32m2025-03-13 08:46:01.647\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mvlmrun.client.predictions\u001b[0m:\u001b[36m_handle_file_or_url\u001b[0m:\u001b[36m323\u001b[0m - \u001b[34m\u001b[1mUploaded file [file_id=38e1bb96-98a7-45a4-b831-a7c1a3fe7fca, name=KxjPgGLVJSg.mp4]\u001b[0m\n" 242 | ] 243 | }, 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "{\n", 249 | "  \"id\": \"25936135-1f6a-4f1c-b22c-6fca51880ec1\",\n", 250 | "  \"created_at\": \"2025-03-13T03:16:02.401101\",\n", 251 | "  \"completed_at\": null,\n", 252 | "  \"response\": null,\n", 253 | "  \"status\": \"pending\",\n", 254 | "  \"usage\": {\n", 255 | "    \"elements_processed\": null,\n", 256 | "    \"element_type\": null,\n", 257 | "    \"credits_used\": null\n", 258 | "  }\n", 259 | "}\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "from vlmrun.client.types import GenerationConfig\n", 265 | "\n", 266 | "# Generate structured data from the video\n", 267 | "response = client.video.generate(\n", 268 | "    domain=\"video.transcription\",\n", 269 | "    file=path,\n", 270 | "    batch=True,\n", 271 | "    
config=GenerationConfig(detail=\"hi\"),\n", 272 | ")\n", 273 | "print(response.model_dump_json(indent=2))" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 12, 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "name": "stderr", 283 | "output_type": "stream", 284 | "text": [ 285 | "Waiting for prediction to complete: 5%|▊ | 48/1000 [04:28<1:28:51, 5.60s/it]\n" 286 | ] 287 | } 288 | ], 289 | "source": [ 290 | "from vlmrun.client.types import PredictionResponse\n", 291 | "\n", 292 | "# Wait for the prediction to complete\n", 293 | "response: PredictionResponse = client.predictions.wait(id=response.id, timeout=1000, sleep=5)\n", 294 | "assert isinstance(response, PredictionResponse)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "### Analyzing the Transcription Results\n", 302 | "\n", 303 | "The transcription result contains rich structured data with both audio and visual information for each segment. Let's explore different ways to visualize and work with this data:\n", 304 | "\n", 305 | "#### 1. Understanding the Response Structure\n", 306 | "\n", 307 | "The response contains:\n", 308 | "- `segments`: List of video segments with audio and visual transcriptions\n", 309 | "- `metadata`: Overall video information (language, content, topics, duration)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 13, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/html": [ 320 | "
\n", 321 | "\n", 334 | "\n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | "
segmentsmetadata.descriptionmetadata.topicsmetadata.duration
0[{'start_time': 0.0, 'end_time': 25.8, 'audio': {'content': ' Like the only ...NoneNone488.56
\n", 354 | "
" 355 | ], 356 | "text/plain": [ 357 | " segments \\\n", 358 | "0 [{'start_time': 0.0, 'end_time': 25.8, 'audio': {'content': ' Like the only ... \n", 359 | "\n", 360 | " metadata.description metadata.topics metadata.duration \n", 361 | "0 None None 488.56 " 362 | ] 363 | }, 364 | "execution_count": 13, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "import pandas as pd\n", 371 | "pd.set_option('display.max_colwidth', 80)\n", 372 | "\n", 373 | "# Print the high-level video transcription\n", 374 | "df = pd.json_normalize(response.response)\n", 375 | "df.head()" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "#### 2. Exploring Segment Details\n", 383 | "\n", 384 | "Each segment contains:\n", 385 | "- `start_time` and `end_time`: Temporal boundaries in seconds\n", 386 | "- `audio.content`: Text transcription of spoken content\n", 387 | "- `video.content`: Description of visual elements in the scene" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 15, 393 | "metadata": {}, 394 | "outputs": [ 395 | { 396 | "data": { 397 | "text/html": [ 398 | "\n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 
| " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | "
start_timeend_timeaudio.contentvideo.contentpreview
00.0025.80Like the only way to find these opportunities to learn about them is to find weirdos on the internet that are also into this thing. Yes. And they're figuring it out too. And you can kind of compare notes. Yes. And this is how new industries are created. Literally. By weirdos on the internet. Like literally. Literally. This is Dalton, plus Michael, and today we're going to talk about why AI is going to create more successful founders in the world.Two men are sitting at a table in a brightly lit room with large windows in the background. The man on the left is wearing a light gray button-up shirt and has curly hair. He is gesturing with his hands as he speaks. The man on the right is wearing a blue button-up shirt and glasses. He is smiling and listening attentively. They appear to be engaged in a conversation.
125.8051.71It's interesting, as we've gotten older, we kind of see a new set of tools come into the market and then an explosion in the number of founders who can now create value. And we've seen this before, right? Like, what was the first time you saw this? I certainly noticed when the internet was new, people that knew how to build websites were suddenly able to make lots of money fromA man in a blue shirt is seated at a table, engaged in a conversation with another person whose back is facing the camera. The man in the blue shirt is speaking and gesturing with his hands, while the other person listens attentively. The setting appears to be an indoor office or meeting room with a window in the background. The video includes text overlays that read \"AI Will Create More Successful Founders\" and \"Founder Explosion.\"
251.7171.89the skill. And it was like really basic stuff. High school kids were making tons of money. Yep. I remember people that could just figure out how to sell stuff on eBay, where you would go buy something cheap but then listed on eBay and arbitrage. Yep. Basically, you would see people that kind of understood the new tooling that came out and would like do a hustle and make ungodly amounts of money.Two men are engaged in a conversation at a table. The man on the left, wearing a light gray button-up shirt, is gesturing with his hands as he speaks. The man on the right, dressed in a blue jacket over a black shirt, listens attentively with his arms crossed. The background features a large window with a view of a cityscape, suggesting an urban setting. The conversation appears to be casual and focused, possibly discussing business or personal matters.
372.0192.67Yeah. And it was just because they understood the new tools. And I already wasn't even a hustle. Like it was a good business. Like it was, they saw that tools enabled new businesses. You know, we saw this, you know, tail end of the open source world where like we could build all of Justin TV with free software. Yep.A man with curly hair is speaking animatedly to another man who is sitting and listening attentively. The speaker gestures with his hands as he talks, emphasizing his points. The listener remains seated, occasionally nodding and responding to the speaker. The setting appears to be an office or meeting room with a window in the background.
492.67112.85And then we were there in the beginning of cloud compute where we didn't have to rack servers anymore. Any kid could sign up for an Amazon account, put a couple bucks down, and get access to a server. And so what's interesting is that we might, I think we feel pretty good about saying this, we might be onA man with a bald head and a gray beard is sitting at a table, wearing a blue shirt over a black t-shirt. He is engaged in a conversation with another person whose back is facing the camera. The man is speaking and gesturing with his hands, occasionally clapping them together. The setting appears to be an indoor environment, possibly an office or a meeting room, with a neutral background.
5112.85135.69the cusp of the next one of these. And that means there are maybe a whole bunch of new opportunities for successful businesses to be created. Yeah, starting now. Yeah, I mean, here's another metaphor. When the iPhone came out, who would have thought that Flappy Bird would have been created? And I think I read that that guy made like 20 million in cash.Two men are engaged in a conversation at a table. The man on the left, wearing a light gray shirt, is speaking animatedly, gesturing with his hands as he talks. The man on the right, dressed in a blue jacket over a black shirt, listens attentively, occasionally nodding and responding. The setting appears to be an office or conference room with large windows in the background, allowing natural light to fill the space.
6135.87159.75Boom. In like two months and then shut it down. And so if you watch, okay, iPhone, Steve Jobs on stage, some guy in Southeast Asia building Flappy Bird. That's like wild. Never would have guessed. And so, again, to be very direct, what we're arguing is that when brand new technologies come out that are powerful, the people that are on the cusp of understanding them and that quicklyTwo men are sitting at a table in a modern office setting with large windows in the background. The man on the left is wearing a light gray button-up shirt and has curly hair. He is gesturing with his hands as he speaks, occasionally raising them to emphasize points. The man on the right is wearing a blue jacket and glasses, with a beard. He is listening attentively, nodding his head and smiling. The conversation appears to be casual and friendly.
7159.75180.77build businesses or build useful things using those tools have a very unique view of creating businesses and wealth. And again, to be on the nose for AI, it seems like you can do things that would require way more headcount than you would otherwise. Yes. And so, you know, we're not even saying we know the ideas.A man with curly hair is speaking animatedly to another man who is seated and listening attentively. The speaker uses expressive hand gestures as he explains something, while the listener remains still, occasionally nodding his head. The setting appears to be an office or a casual meeting room with large windows in the background, allowing natural light to fill the space.
8180.97201.72No. We're just saying if you're watching this and you're interested in being a founder or maybe not working at a company. Yeah. And you just pay attention to every new thing that comes out and try to find these opportunities or, I don't't know arbitrage is the right word, but no, just you know, new opportunities. New opportunities using these cutting edge toolsTwo men are sitting at a table in an office setting. The man on the left is wearing a light gray shirt and has curly hair. He is gesturing with his hands as he speaks. The man on the right is wearing a blue jacket over a black shirt and has a beard. He is listening attentively with his hands clasped together. The background features large windows with a view of a body of water, and the room has a modern, minimalist design.
9201.72221.80and you're on the bleeding edge, you're not competing with anyone. No. It's green field. I think what's cool is any time one of these technologies shifts happens, the cost of starting a business, some set of businesses, reduces by up to like 10x. Yep. And so suddenly, businesses that either wouldn't have made senseTwo men are sitting at a table in a modern office setting. The man on the left is wearing a light gray button-up shirt and has curly hair. He is gesturing with his hands as he speaks. The man on the right is wearing a blue jacket over a black shirt and has a beard. He is listening attentively and occasionally responds with hand gestures. The background features large windows with a view of the outside, and the room has a clean, minimalist design.
10221.80244.96or certainly a normal person couldn't just stand up and do, right? Like can you imagine just, oh, it's pre online selling in eBay. All you have to do is rent a storefront and run a store, right? Like that's cheap, right? Like, absolutely not. Or like pre-Ari-NB. Like, all you have to do is just like buy a house and set up your own air bed and breakfastTwo men are engaged in a conversation at a table. The man on the left, wearing a light gray shirt, listens attentively while the man on the right, dressed in a blue jacket over a black shirt, gestures with his hands as he speaks. The background features a large window with a view of a body of water, suggesting an office or conference room setting. The conversation appears to be focused on business-related topics, as indicated by the text on the right side of the screen.
11244.96265.28bed and breakfast thing or even by hotel yeah that's crazy crazy. Whereas like Airbnb can rent a room. And think about it. Your own place. If you saw Airbnb early and you just decided to be a host and be like, oh, I should like do this as a business. You could do it pretty well. You could do it pretty well. Yeah. When Shopify was a brand new thing, like all of these platforms exactly the people that were the first to recognize that these wereA man in a blue shirt is seated at a table, gesturing with his hands as he speaks to another man who is also seated at the table. The man in the blue shirt appears to be explaining something, using hand movements to emphasize his points. The other man listens attentively, occasionally responding with gestures of his own. The setting is a simple, well-lit room with large windows in the background, allowing natural light to fill the space.
12265.84286.22gave them leverage yes those entrepreneurial-minded people did really well. Yes. And so I think what's so cool is that what we're saying is like if you're ambitious and you're paying attention, you might not ever need to work at a big company. You might not ever need to have a boss.Two men are engaged in a conversation at a table. The man on the left, wearing a light gray shirt, is gesturing with his hands as he speaks, indicating an animated discussion. The man on the right, dressed in a blue jacket, listens attentively, occasionally responding with his own gestures. The background features a window with blinds, suggesting an indoor setting, possibly an office or meeting room. The overall atmosphere appears to be professional and focused.
13286.22307.96Like you can be in control of your own destiny. And these moments don't happen every week. No. Like, we wish we did. It would be the investor. But like, when they do, the people who move. I mean, this is a very specific example. Yeah. But whatnot, the online live shopping thing, I still talk to the founders a lot.Two men are sitting at a table in a modern office setting. The man on the left is wearing a light gray button-up shirt and has short, curly hair. He appears to be listening attentively to the man on the right, who is wearing a blue jacket over a black shirt and glasses. The man on the right is gesturing with his hands as he speaks, indicating an engaging conversation. The background features large windows with a view of a cityscape, suggesting an urban environment.
14308.22328.00And they have, I think, like, high school-aged kids selling stuff on their... Making real money, right? And making just, again, I don't even want to say the numbers. Yeah. But they figured out the format. They understand how to use whatnot. They built a user base there. Yeah. And they're basically... They're making enough money to set themselves up for their entire life.A man with curly hair is speaking animatedly to another man who is sitting across from him. The man with curly hair is gesturing with his hands as he talks, indicating that he is explaining something or making a point. The other man is listening attentively, nodding his head slightly. The background shows a window with blinds partially open, allowing some natural light into the room.
15328.12351.76Yeah. By just seeing this new platform, figuring it out, and then making a bet on it. Yes. I mean, this happened with Twitch. Happens with Twitch. Whole new industry, basically. Yeah. No, I think that what's cool is that we're also talking about every scale, right? We're talking about things that can be venture backed, maybe billion dollar companies one day. But we're also talking about things that can justTwo men are sitting at a table in a modern office setting. The man on the left is wearing a light gray shirt and has curly hair. He is gesturing with his hands as he speaks. The man on the right is wearing a blue jacket over a black shirt and has a bald head with a beard. He is listening attentively and occasionally responds with hand gestures. The background features large windows with a view of a cityscape, and the room has a minimalist design with white walls and a light-colored floor.
16351.76372.66set you up so that you can pay your rent and live a good life. Yep. The opportunities are across the entire spectrum. And I think that's what's really cool about new technology. Like when there's a real technology shift, it affects businesses across the board. We're not just talking about companies that YCP even fun. I think that the last point I'd want to makeA man with a bald head and a beard is sitting at a table, speaking animatedly to another person whose back is facing the camera. He is wearing a blue jacket over a black shirt and is gesturing with his hands as he talks. The other person is wearing a light-colored shirt. The background is a plain, light-colored wall with a window visible behind them. On the right side of the screen, there is a list of topics or titles, including \"Endless Opportunity,\" \"Internet Weirdos,\" and \"Now Is The Time.\"
17372.66398.57on this front is that you don't get this opportunity if you're just thinking, you gotta actually do. Well, and they won't teach you this in schools. Schools teach you stuff for 10 or 20 years ago. So the other thing that I've noticed in these trends is that when you are part of the history being made and you're this early on the cutting edge of a new tech coming out, you can't expect your university or your teachers or your peers people in your communityTwo men are sitting at a table in a modern office setting. The man on the left is wearing a light gray button-up shirt and has curly hair. The man on the right is wearing a blue jacket over a black shirt and has a bald head with a beard. They are engaged in a conversation, with the man on the right gesturing with his hands as he speaks. The background features large windows with a view of a cityscape, and there is a whiteboard behind them.
18398.57419.33or your peers to teach you about it. It's only basically weirdos on the internet. Yes. Like the only way to find these opportunities to learn about them is to find weirdos on the internet that are also into this thing. Yes. And they're figuring it out too. that are also into this thing. And they're figuring it out too. And you can kind of compare notes. Yes. And this is how new industries are created. Literally. By weirdos on the internet. Like literally. Literally. By weirdos on the internet. Like literally. Literally, there's like some subreddit with a bunch of weirdos.Two men are sitting at a table in a modern office setting. The man on the left is wearing a light gray button-up shirt and has curly hair. He is gesturing with his hands as he speaks, indicating an animated conversation. The man on the right is wearing a blue jacket over a black shirt and has a beard. He is listening attentively, occasionally nodding and smiling. The background features large windows with a view of the outside, and the room has a clean, minimalist design.
19419.33447.20And like someday from now, you know, 10 years from now, there'll be an entire industry of people that learned about this thing in some subred somewhere there. Yeah, no, I totally agree. So hey, the big takeaway is if you've been wrestling your lawyers, if you thought, oh, this isn't the time to start a new business. Maybe you should reconsider. Yeah. This is a very interesting time. I think the final argument is there's a good case where a smaller percentage of the population will need to get jobs,Two men are engaged in a conversation at a table. The man on the left, wearing a light gray shirt, is gesturing animatedly with his hands as he speaks. He appears to be explaining something with enthusiasm. The man on the right, dressed in a blue jacket over a black shirt, listens attentively, occasionally nodding and responding. The background features a large window with a view of a cityscape, suggesting an urban setting. The conversation seems to be focused on business or technology topics, as indicated by the text overlays on the right side of the screen.
20447.26469.08and more people will be able to use tools like this to be self-employed in some way. I don't think there's any, I think all the structural changes imply that more folks will just use their highly leveraged selves using all these tools to run businesses, then have to go get a job. Yeah. Right? And I think that story isn't told, right? I think the story is always this kind of depressing story of like,A man with curly hair is speaking animatedly to another man who is sitting across from him. The man with curly hair is gesturing with his hands as he talks, emphasizing his points. The other man listens attentively, occasionally nodding his head. The setting appears to be an office or conference room with large windows in the background, allowing natural light to fill the space.
21469.08488.56oh, maybe you won't need it, you won't be needed anymore, as opposed to here's a set of tools. You could do things that people couldn't think of doing affordably before. Like you could be your own boss. You don't even need to be inside of a company to create value. Yeah. So anyways, hopefully that's inspiring. Good shot. Thanks. good shot thanksTwo men are sitting at a table in an office setting. The man on the left is wearing a light gray button-up shirt and has curly hair. The man on the right is wearing a blue jacket over a black shirt and has a beard and glasses. They are engaged in a conversation, with the man in the blue jacket gesturing with his hands as he speaks. The background features large windows with a view of a cloudy sky.
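 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
   "The segment list is also easy to post-process. As a minimal sketch (reusing the `segments_json` list from the cell above, and assuming the segment schema shown earlier), the next cell stitches the per-segment audio transcriptions into a single timestamped transcript:"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "def to_transcript(segments):\n",
   "    \"\"\"Join per-segment audio transcriptions into one timestamped transcript.\"\"\"\n",
   "    lines = []\n",
   "    for seg in segments:\n",
   "        # Prefix each line with the segment's temporal boundaries in seconds\n",
   "        stamp = f\"[{seg['start_time']:7.2f}s - {seg['end_time']:7.2f}s]\"\n",
   "        lines.append(f\"{stamp} {seg['audio']['content'].strip()}\")\n",
   "    return \"\\n\".join(lines)\n",
   "\n",
   "# Print the first few hundred characters of the stitched transcript\n",
   "print(to_transcript(segments_json)[:500])"
  ]
 },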
" 588 | ], 589 | "text/plain": [ 590 | "" 591 | ] 592 | }, 593 | "execution_count": 15, 594 | "metadata": {}, 595 | "output_type": "execute_result" 596 | } 597 | ], 598 | "source": [ 599 | "pd.set_option('display.max_colwidth', 600)\n", 600 | "\n", 601 | "segments_json = response.response.get(\"segments\", [])\n", 602 | "segments_df = pd.json_normalize(segments_json)\n", 603 | "segments_df[\"preview\"] = segments_df.apply(\n", 604 | " lambda x: IFRAME_STR.replace(\"?rel=0\", f\"?start={int(x['start_time'])}&end={int(x['end_time'])}\"), axis=1\n", 605 | ")\n", 606 | "HTML(segments_df.to_html(escape=False))" 607 | ] 608 | }, 609 | { 610 | "cell_type": "markdown", 611 | "metadata": {}, 612 | "source": [ 613 | "As you can see, the video has been segmented into ~20s scenes each with detailed audio transcriptions and corresponding visual captions. This provides developers with a powerful means to understand the video content at a granular level." 614 | ] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "metadata": {}, 619 | "source": [ 620 | "### Thanks for following along!\n", 621 | "\n", 622 | "Head over to the [VLM Run App](https://app.vlm.run) to try out the [VLM Run](https://vlm.run) API for yourself!" 623 | ] 624 | } 625 | ], 626 | "metadata": { 627 | "colab": { 628 | "provenance": [] 629 | }, 630 | "kernelspec": { 631 | "display_name": "Python 3 (ipykernel)", 632 | "language": "python", 633 | "name": "python3" 634 | }, 635 | "language_info": { 636 | "codemirror_mode": { 637 | "name": "ipython", 638 | "version": 3 639 | }, 640 | "file_extension": ".py", 641 | "mimetype": "text/x-python", 642 | "name": "python", 643 | "nbconvert_exporter": "python", 644 | "pygments_lexer": "ipython3", 645 | "version": "3.9.7" 646 | } 647 | }, 648 | "nbformat": 4, 649 | "nbformat_minor": 4 650 | } 651 | --------------------------------------------------------------------------------