├── .gitignore ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── requirements.txt └── token_bench ├── fvd.py ├── metrics_cli.py └── video ├── list.txt └── preprocessing_script.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.zip 2 | **/.DS_Store 3 | credentials 4 | *.secret 5 | __pycache__ 6 | pretrained_ckpts/ 7 | pretrained_ckpts 8 | reconstructions/ 9 | .vscode/ 10 | .flake8 11 | scrub_*.sh -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions. 4 | 5 | ## Code Reviews 6 | 7 | All submissions, including submissions by project members, require review. We use GitHub pull requests for this purpose. Consult 8 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more information on using pull requests. 9 | 10 | ## Signing Your Work 11 | 12 | * We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license. 13 | 14 | * Any contribution which contains commits that are not Signed-Off will not be accepted. 15 | 16 | * To sign off on a commit you simply use the `--signoff` (or `-s`) option when committing your changes: 17 | ```bash 18 | $ git commit -s -m "Add cool feature." 19 | ``` 20 | This will append the following to your commit message: 21 | ``` 22 | Signed-off-by: Your Name 23 | ``` 24 | 25 | * Full text of the DCO: 26 | 27 | ``` 28 | Developer Certificate of Origin 29 | Version 1.1 30 | 31 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 32 | 1 Letterman Drive 33 | Suite D4700 34 | San Francisco, CA, 94129 35 | 36 | Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. 37 | ``` 38 | 39 | ``` 40 | Developer's Certificate of Origin 1.1 41 | 42 | By making a contribution to this project, I certify that: 43 | 44 | (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or 45 | 46 | (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or 47 | 48 | (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it. 49 | 50 | (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved. 51 | ``` -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel 16 | ARG DEBIAN_FRONTEND=noninteractive 17 | 18 | ENV TZ=America/Los_Angeles 19 | 20 | RUN apt-get update && apt-get install -y --no-install-recommends \ 21 | build-essential \ 22 | ffmpeg \ 23 | git \ 24 | git-lfs 25 | 26 | RUN pip install --upgrade pip 27 | COPY requirements.txt requirements.txt 28 | RUN pip install -r requirements.txt 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 15 | 16 | # TokenBench 17 | 18 | ### [Cosmos-Tokenizer Code](https://github.com/NVIDIA/Cosmos-Tokenizer) | [Technical Report](https://research.nvidia.com/labs/dir/cosmos-tokenizer/) 19 | 20 | 21 | https://github.com/user-attachments/assets/72536cfc-5cb5-4b48-88fa-b06f3c8c4495 22 | 23 | 24 | TokenBench is a comprehensive benchmark to standardize the evaluation for [Cosmos-Tokenizer](https://github.com/NVIDIA/Cosmos-Tokenizer), which covers a wide variety of domains including robotic manipulation, driving, egocentric, and web videos. It consists of high-resolution, long-duration videos, and is designed to evaluate the performance of video tokenizers. We resort to existing video datasets that are commonly used for various tasks, including [BDD100K](http://bdd-data.berkeley.edu/), [EgoExo-4D](https://docs.ego-exo4d-data.org/), [BridgeData V2](https://rail-berkeley.github.io/bridgedata/), and [Panda-70M](https://snap-research.github.io/Panda-70M/). This repo provides instructions on how to download and preprocess the videos for TokenBench. 25 | 26 | ## Installation 27 | - Clone the source code 28 | ``` 29 | git clone https://github.com/NVlabs/TokenBench.git 30 | cd TokenBench 31 | ``` 32 | - Install via pip 33 | ``` 34 | pip3 install -r requirements.txt 35 | apt-get install -y ffmpeg 36 | ``` 37 | 38 | Preferably, build a docker image using the provided Dockerfile 39 | ``` 40 | docker build -t token-bench -f Dockerfile . 41 | 42 | # You can run the container as: 43 | docker run --gpus all -it --rm -v /home/${USER}:/home/${USER} \ 44 | --workdir ${PWD} token-bench /bin/bash 45 | ``` 46 | 47 | ## Download StyleGAN Checkpoints from Hugging Face 48 | 49 | 50 | You can use this snippet to download StyleGAN checkpoints from [huggingface.co/LanguageBind/Open-Sora-Plan-v1.0.0](https://huggingface.co/LanguageBind/Open-Sora-Plan-v1.0.0): 51 | ```python 52 | from huggingface_hub import login, snapshot_download 53 | import os 54 | 55 | login(token="", add_to_git_credential=True) 56 | model_name="LanguageBind/Open-Sora-Plan-v1.0.0" 57 | local_dir = "pretrained_ckpts/" + model_name 58 | os.makedirs(local_dir, exist_ok=True) 59 | print(f"downloading `{model_name}` ...") 60 | snapshot_download(repo_id=f"{model_name}", local_dir=local_dir) 61 | ``` 62 | 63 | Under `pretrained_ckpts/Open-Sora-Plan-v1.0.0`, you can find the StyleGAN checkpoints required for FVD metrics. 64 | ```bash 65 | ├── opensora/eval/fvd/styleganv/ 66 | │ ├── fvd.py 67 | │ ├── i3d_torchscript.pt 68 | ``` 69 | 70 | ## Instructions to build TokenBench 71 | 72 | 1. Download the datasets from the official websites: 73 | * EgoExo4D: https://docs.ego-exo4d-data.org/ 74 | * BridgeData V2: https://rail-berkeley.github.io/bridgedata/ 75 | * Panda70M: https://snap-research.github.io/Panda-70M/ 76 | * BDD100K: http://bdd-data.berkeley.edu/ 77 | 78 | 2. Pick the videos as specified in the `token_bench/video/list.txt` file. 79 | 3. Preprocess the videos using the script `token_bench/video/preprocessing_script.py`. 80 | 81 | 82 | ## Evaluation on the token-bench 83 | 84 | We provide the basic scripts to compute the common evaluation metrics for video tokenizer reonctruction, including `PSNR`, `SSIM`, and `lpips`. Use the code to compute metrics between two folders as below 85 | 86 | ``` 87 | python3 -m token_bench.metrics_cli --mode=lpips \ 88 | --gtpath \ 89 | --targetpath 90 | ``` 91 | 92 | ## Continuous video tokenizer leaderboard 93 | 94 | | Tokenizer | Compression Ratio (T x H x W) | Formulation | PSNR | SSIM | rFVD | 95 | | -------------- | ----------------- | ----------- | ----- | ---- | ----- | 96 | | [CogVideoX](https://huggingface.co/docs/diffusers/en/api/models/autoencoderkl_cogvideox) | 4 × 8 × 8 | VAE | 33.149 | 0.908 | 6.970 | 97 | | [OmniTokenizer](https://github.com/FoundationVision/OmniTokenizer) | 4 × 8 × 8 | VAE | 29.705 | 0.830 | 35.867 | 98 | | Cosmos-CV | 4 × 8 × 8 | AE | 37.270 | 0.928 | 6.849 | 99 | | Cosmos-CV | 8 × 8 × 8 | AE | 36.856 | 0.917 | 11.624 | 100 | | Cosmos-CV | 8 × 16 × 16 | AE | 35.158 | 0.875 | 43.085 | 101 | 102 | ## Discrete video tokenizer leaderboard 103 | 104 | | Tokenizer | Compression Ratio (T x H x W) | Quantization | PSNR | SSIM | rFVD | 105 | | -------------- | ----------------- | ------------ | ----- | ---- | ----- | 106 | | [VideoGPT](https://github.com/wilson1yan/VideoGPT) | 4 × 4 × 4 | VQ | 35.119 | 0.914 | 13.855 | 107 | | [OmniTokenizer](https://github.com/FoundationVision/OmniTokenizer) | 4 × 8 × 8 | VQ | 30.152 | 0.827 | 53.553 | 108 | | Cosmos-DV | 4 × 8 × 8 | FSQ | 35.137 | 0.887 | 19.672 | 109 | | Cosmos-DV | 8 × 8 × 8 | FSQ | 34.746 | 0.872 | 43.865 | 110 | | Cosmos-DV | 8 × 16 × 16 | FSQ | 33.718 | 0.828 | 113.481 | 111 | 112 | 113 | ## Core contributors 114 | 115 | Fitsum Reda, Jinwei Gu, Xian Liu, Songwei Ge, Ting-Chun Wang, Haoxiang Wang, Ming-Yu Liu 116 | 117 | ## Citation 118 | 119 | If you find TokenBench useful in your works, please acknowledge it 120 | appropriately by citing: 121 | 122 | ``` 123 | @article{agarwal2025cosmos, 124 | title={Cosmos World Foundation Model Platform for Physical AI}, 125 | author={NVIDIA et. al.}, 126 | journal={arXiv preprint arXiv:2501.03575}, 127 | year={2025} 128 | } 129 | ``` 130 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | loguru>=0.7.0 16 | mediapy==1.1.6 17 | einops==0.7.0 18 | einx==0.1.3 19 | huggingface-hub>=0.26.2 20 | ipdb>=0.13.13 21 | -------------------------------------------------------------------------------- /token_bench/fvd.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import math 17 | from copy import deepcopy 18 | from typing import Optional, Sequence, Union 19 | 20 | import torch 21 | import torch.nn.functional as F 22 | from torch import Tensor 23 | from torch.nn import Module 24 | from torchmetrics.image.fid import _compute_fid 25 | from torchmetrics.metric import Metric 26 | from torchmetrics.utilities.plot import _AX_TYPE, _PLOT_OUT_TYPE 27 | 28 | _STYLEGAN_TORCHSCRIPT_CKPT = ( 29 | "./pretrained_ckpts/opensora/eval/fvd/styleganv/i3d_torchscript.pt" 30 | ) 31 | 32 | 33 | def preprocess_single(video, resolution=224, sequence_length=None): 34 | # video: CTHW, [0, 1] 35 | c, t, h, w = video.shape 36 | 37 | # temporal crop 38 | if sequence_length is not None: 39 | assert sequence_length <= t 40 | video = video[:, :sequence_length] 41 | 42 | # scale shorter side to resolution 43 | scale = resolution / min(h, w) 44 | if h < w: 45 | target_size = (resolution, math.ceil(w * scale)) 46 | else: 47 | target_size = (math.ceil(h * scale), resolution) 48 | video = F.interpolate(video, size=target_size, mode="bilinear", align_corners=False) 49 | 50 | # center crop 51 | c, t, h, w = video.shape 52 | w_start = (w - resolution) // 2 53 | h_start = (h - resolution) // 2 54 | video = video[:, :, h_start : h_start + resolution, w_start : w_start + resolution] 55 | 56 | # [0, 1] -> [-1, 1] 57 | video = (video - 0.5) * 2 58 | 59 | return video.contiguous() 60 | 61 | 62 | class StyleGANvFeatureExtractor(Module): 63 | def __init__(self): 64 | super().__init__() 65 | self.model = torch.jit.load(_STYLEGAN_TORCHSCRIPT_CKPT) 66 | self.model.eval() 67 | for param in self.model.parameters(): 68 | param.requires_grad = False 69 | 70 | @torch.no_grad() 71 | def forward(self, x): 72 | detector_kwargs = dict( 73 | rescale=False, resize=False, return_features=True 74 | ) # Return raw features before the softmax layer. 75 | return self.model( 76 | torch.stack([preprocess_single(video) for video in x]), **detector_kwargs 77 | ) 78 | 79 | 80 | class FVD(Metric): 81 | r""" 82 | Frechet Video Distance (FVD) is a metric to evaluate the quality of video generation models. 83 | 84 | As input to ``forward`` and ``update`` the metric accepts the following input 85 | 86 | - ``videos`` (:class:`~torch.Tensor`): tensor with images feed to the feature extractor with. [0, 1] 87 | - ``real`` (:class:`~bool`): bool indicating if ``videos`` belong to the real or the fake distribution 88 | 89 | As output of `forward` and `compute` the metric returns the following output 90 | 91 | - ``fvd`` (:class:`~torch.Tensor`): float scalar tensor with mean FVD value over samples 92 | 93 | Example: 94 | >>> import torch 95 | >>> torch.manual_seed(123) 96 | >>> NUMBER_OF_VIDEOS = 8 97 | >>> VIDEO_LENGTH = 50 98 | >>> CHANNEL = 3 99 | >>> SIZE = 64 100 | >>> videos1 = torch.zeros(NUMBER_OF_VIDEOS, CHANNEL, VIDEO_LENGTH, SIZE, SIZE, requires_grad=False).cuda() 101 | >>> videos2 = torch.ones(NUMBER_OF_VIDEOS, CHANNEL, VIDEO_LENGTH, SIZE, SIZE, requires_grad=False).cuda() 102 | >>> metric = FVD().cuda() 103 | >>> metric.update(videos1, real=True) 104 | >>> metric.update(videos2, real=False) 105 | >>> metric.compute() 106 | >>> tensor(232.7575) 107 | """ 108 | higher_is_better: bool = False 109 | is_differentiable: bool = False 110 | full_state_update: bool = False 111 | plot_lower_bound: float = 0.0 112 | 113 | real_features_sum: Tensor 114 | real_features_cov_sum: Tensor 115 | real_features_num_samples: Tensor 116 | 117 | fake_features_sum: Tensor 118 | fake_features_cov_sum: Tensor 119 | fake_features_num_samples: Tensor 120 | 121 | feature_extractor: Module 122 | extractor_option: str = "styleganv" 123 | 124 | def __init__( 125 | self, 126 | feature_extractor: Union[str, Module] = "styleganv", 127 | real_feature_stats: Optional[str] = None, 128 | reset_real_features: bool = True, 129 | **kwargs, 130 | ): 131 | super().__init__(**kwargs) 132 | if isinstance(feature_extractor, str): 133 | # assert feature_extractor == 'styleganv', 'Only StyleGAN video is supported for now' 134 | if feature_extractor.lower() == "styleganv": 135 | self.feature_extractor = StyleGANvFeatureExtractor() 136 | else: 137 | raise NotImplementedError( 138 | "Only StyleGANv and inceptionI3d are supported for now" 139 | ) 140 | num_features = 400 141 | else: 142 | raise NotImplementedError() 143 | 144 | mx_num_feats = (num_features, num_features) 145 | self.add_state( 146 | "real_features_sum", 147 | torch.zeros(num_features).double(), 148 | dist_reduce_fx="sum", 149 | ) 150 | self.add_state( 151 | "real_features_cov_sum", 152 | torch.zeros(mx_num_feats).double(), 153 | dist_reduce_fx="sum", 154 | ) 155 | self.add_state( 156 | "real_features_num_samples", torch.tensor(0).long(), dist_reduce_fx="sum" 157 | ) 158 | 159 | self.add_state( 160 | "fake_features_sum", 161 | torch.zeros(num_features).double(), 162 | dist_reduce_fx="sum", 163 | ) 164 | self.add_state( 165 | "fake_features_cov_sum", 166 | torch.zeros(mx_num_feats).double(), 167 | dist_reduce_fx="sum", 168 | ) 169 | self.add_state( 170 | "fake_features_num_samples", torch.tensor(0).long(), dist_reduce_fx="sum" 171 | ) 172 | 173 | self.reset_real_features = reset_real_features 174 | self.reuse_real_stats = real_feature_stats is not None 175 | if self.reuse_real_stats: 176 | raise NotImplementedError() 177 | 178 | def update(self, videos: Tensor, real: bool) -> None: 179 | features = self.feature_extractor(videos) 180 | self.orig_dtype = features.dtype 181 | features = features.double() 182 | 183 | if features.dim() == 1: 184 | features = features.unsqueeze(0) 185 | if real: 186 | self.real_features_sum += features.sum(dim=0) 187 | self.real_features_cov_sum += features.t().mm(features) 188 | self.real_features_num_samples += videos.shape[0] 189 | else: 190 | self.fake_features_sum += features.sum(dim=0) 191 | self.fake_features_cov_sum += features.t().mm(features) 192 | self.fake_features_num_samples += videos.shape[0] 193 | 194 | def update_real_fake_batch(self, real_video: Tensor, fake_video: Tensor) -> None: 195 | self.update(real_video, real=True) 196 | self.update(fake_video, real=False) 197 | 198 | def compute_fvd_from_features( 199 | self, real_features: Tensor, fake_features: Tensor 200 | ) -> float: 201 | real_features = real_features.double() 202 | fake_features = fake_features.double() 203 | real_features_sum = real_features.sum(dim=0) 204 | real_features_cov_sum = real_features.t().mm(real_features) 205 | real_features_num_samples = real_features.shape[0] 206 | 207 | fake_features_sum = fake_features.sum(dim=0) 208 | fake_features_cov_sum = fake_features.t().mm(fake_features) 209 | fake_features_num_samples = fake_features.shape[0] 210 | 211 | if real_features_num_samples < 2 or fake_features_num_samples < 2: 212 | raise RuntimeError( 213 | "More than one sample is required for both the real and fake distributed to compute FID" 214 | ) 215 | mean_real = (real_features_sum / real_features_num_samples).unsqueeze(0) 216 | mean_fake = (fake_features_sum / fake_features_num_samples).unsqueeze(0) 217 | 218 | cov_real_num = ( 219 | real_features_cov_sum 220 | - real_features_num_samples * mean_real.t().mm(mean_real) 221 | ) 222 | cov_real = cov_real_num / (real_features_num_samples - 1) 223 | cov_fake_num = ( 224 | fake_features_cov_sum 225 | - fake_features_num_samples * mean_fake.t().mm(mean_fake) 226 | ) 227 | cov_fake = cov_fake_num / (fake_features_num_samples - 1) 228 | return ( 229 | _compute_fid(mean_real.squeeze(0), cov_real, mean_fake.squeeze(0), cov_fake) 230 | .float() 231 | .item() 232 | ) 233 | 234 | def compute(self) -> Tensor: 235 | """Calculate FID score based on accumulated extracted features from the two distributions.""" 236 | if self.real_features_num_samples < 2 or self.fake_features_num_samples < 2: 237 | raise RuntimeError( 238 | "More than one sample is required for both the real and fake distributed to compute FID" 239 | ) 240 | mean_real = (self.real_features_sum / self.real_features_num_samples).unsqueeze( 241 | 0 242 | ) 243 | mean_fake = (self.fake_features_sum / self.fake_features_num_samples).unsqueeze( 244 | 0 245 | ) 246 | 247 | cov_real_num = ( 248 | self.real_features_cov_sum 249 | - self.real_features_num_samples * mean_real.t().mm(mean_real) 250 | ) 251 | cov_real = cov_real_num / (self.real_features_num_samples - 1) 252 | cov_fake_num = ( 253 | self.fake_features_cov_sum 254 | - self.fake_features_num_samples * mean_fake.t().mm(mean_fake) 255 | ) 256 | cov_fake = cov_fake_num / (self.fake_features_num_samples - 1) 257 | return _compute_fid( 258 | mean_real.squeeze(0), cov_real, mean_fake.squeeze(0), cov_fake 259 | ).to(self.orig_dtype) 260 | 261 | def reset(self) -> None: 262 | """Reset metric states.""" 263 | if not self.reset_real_features: 264 | real_features_sum = deepcopy(self.real_features_sum) 265 | real_features_cov_sum = deepcopy(self.real_features_cov_sum) 266 | real_features_num_samples = deepcopy(self.real_features_num_samples) 267 | super().reset() 268 | self.real_features_sum = real_features_sum 269 | self.real_features_cov_sum = real_features_cov_sum 270 | self.real_features_num_samples = real_features_num_samples 271 | else: 272 | super().reset() 273 | 274 | def plot( 275 | self, 276 | val: Optional[Union[Tensor, Sequence[Tensor]]] = None, 277 | ax: Optional[_AX_TYPE] = None, 278 | ) -> _PLOT_OUT_TYPE: 279 | """Plot a single or multiple values from the metric. 280 | 281 | Args: 282 | val: Either a single result from calling `metric.forward` or `metric.compute` or a list of these results. 283 | If no value is provided, will automatically call `metric.compute` and plot that result. 284 | ax: An matplotlib axis object. If provided will add plot to that axis 285 | 286 | Returns: 287 | Figure and Axes object 288 | 289 | Raises: 290 | ModuleNotFoundError: 291 | If `matplotlib` is not installed 292 | """ 293 | return self._plot(val, ax) 294 | -------------------------------------------------------------------------------- /token_bench/metrics_cli.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Code to compute different metrics for tokenizer evaluation. 17 | 18 | Assumes the reconstructed and ground truth folders contain the same number 19 | of videos with the filenames. Compute PSNR, SSIM, LPIPS, and FVD. 20 | 21 | Example for MP4 videos: 22 | python3 -m token_bench.metrics_cli \ 23 | --mode=all \ 24 | --ext=mp4 \ 25 | --gtpath \ 26 | --targetpath 27 | 28 | For images, set the ext to "png" or "jpg". 29 | """ 30 | 31 | import argparse 32 | import os 33 | from typing import Callable 34 | 35 | import json 36 | import lpips 37 | import numpy as np 38 | import torch 39 | from skimage.metrics import structural_similarity as ssim 40 | from tqdm import tqdm 41 | from glob import glob 42 | 43 | from mediapy import read_video 44 | from token_bench.fvd import FVD 45 | 46 | _FLOAT32_EPS = np.finfo(np.float32).eps 47 | _UINT8_MAX_F = float(np.iinfo(np.uint8).max) 48 | 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument( 51 | "--gtpath", 52 | type=str, 53 | required=True, 54 | help="path/to/eval/videos//", 55 | ) 56 | parser.add_argument( 57 | "--targetpath", 58 | type=str, 59 | default=None, 60 | help="path/to/eval/videos//", 61 | ) 62 | parser.add_argument("--mode", type=str, choices=["psnr", "lpips", "fvd", "all"]) 63 | parser.add_argument("--device", type=str, default="cuda") 64 | parser.add_argument("--ext", type=str, default="mp4") 65 | args = parser.parse_args() 66 | 67 | 68 | def PSNR(input0: np.ndarray, input1: np.ndarray) -> float: 69 | """Compute PSNR between two videos or two images. 70 | 71 | Args: 72 | input0: The first video or image, of shape [..., H, W, C], of [0..255]. 73 | input1: The second video or image, of shape [..., H, W, C], of [0..255]. 74 | 75 | Returns: 76 | The PSNR value. 77 | """ 78 | assert input0.shape == input1.shape, "inputs should have the same shape" 79 | mse = ((input0 - input1) ** 2).mean() 80 | psnr = 20 * np.log10(_UINT8_MAX_F / (np.sqrt(mse) + _FLOAT32_EPS)) 81 | return psnr.item() 82 | 83 | 84 | def SSIM(input0: np.ndarray, input1: np.ndarray) -> float: 85 | """Compute SSIM between two videos or two images. 86 | 87 | Args: 88 | input0: The first video or image, of shape [..., H, W, C], of [0..255]. 89 | input1: The second video or image, of shape [..., H, W, C], of [0..255]. 90 | 91 | Returns: 92 | The SSIM value. 93 | """ 94 | assert input0.shape == input1.shape, "inputs should have the same shape" 95 | if input0.ndim == 3: 96 | input0, input1 = np.array([input0]), np.array([input1]) 97 | ssim_values = [] 98 | from concurrent.futures import ThreadPoolExecutor 99 | 100 | def compute_ssim(pair): 101 | one_image0, one_image1 = pair 102 | return ssim( 103 | one_image0, 104 | one_image1, 105 | data_range=_UINT8_MAX_F, 106 | multichannel=True, 107 | channel_axis=-1, 108 | ) 109 | 110 | with ThreadPoolExecutor() as executor: 111 | ssim_values = list(executor.map(compute_ssim, zip(input0, input1))) 112 | return np.mean(ssim_values) 113 | 114 | 115 | def LPIPS(input0: np.ndarray, input1: np.ndarray, loss_fn_vgg: Callable) -> float: 116 | """Compute LPIPS between two videos or two images. 117 | 118 | Args: 119 | input0: The first video or image, of shape [..., H, W, C], of [0..255]. 120 | input1: The second video or image, of shape [..., H, W, C], of [0..255]. 121 | loss_fn_vgg: The LPIPS loss function. 122 | device: The device to run the computation. 123 | 124 | Returns: 125 | The LPIPS value. 126 | """ 127 | assert input0.shape == input1.shape, "inputs should have the same shape" 128 | if input0.ndim == 3: 129 | input0, input1 = np.array([input0]), np.array([input1]) 130 | 131 | # computing LPIPS needs to normalize input to [-1,1]. 132 | input0 = torch.from_numpy(2 * (input0 / _UINT8_MAX_F - 0.5)).to(torch.float32) 133 | input1 = torch.from_numpy(2 * (input1 / _UINT8_MAX_F - 0.5)).to(torch.float32) 134 | 135 | input0 = input0.permute(0, 3, 1, 2) # N, C, H, W 136 | input1 = input1.permute(0, 3, 1, 2) # N, C, H, W 137 | 138 | # average LPIPS over all frames 139 | results = [] 140 | for one_input0, one_input1 in zip(input0, input1): 141 | fm0 = one_input0.unsqueeze(0).to(args.device) 142 | fm1 = one_input1.unsqueeze(0).to(args.device) 143 | res = loss_fn_vgg(fm0, fm1).item() 144 | results.append(res) 145 | 146 | return np.mean(results) 147 | 148 | 149 | def main_psnr_ssim() -> None: 150 | vfiles0 = sorted(list(set(glob(str(f"{args.gtpath}/*.{args.ext}"))))) 151 | vfiles1 = sorted(list(set(glob(str(f"{args.targetpath}/*.{args.ext}"))))) 152 | 153 | psnr_filename = f"{args.targetpath}/psnr.csv" 154 | ssim_filename = f"{args.targetpath}/ssim.csv" 155 | if os.path.exists(psnr_filename) and os.path.exists(ssim_filename): 156 | print(f"{psnr_filename} already exists. Recomputing ...") 157 | print(f"{ssim_filename} already exists. Recomputing ...") 158 | 159 | assert len(vfiles0) == len(vfiles1), "number of media files must match" 160 | 161 | print(f"Calculating PSNR on {len(vfiles0)} pairs ...") 162 | psnr_values, ssim_values = list(), list() 163 | for input0_file, input1_file in tqdm(zip(vfiles0, vfiles1)): 164 | assert ( 165 | input0_file.split("/")[-1] == input1_file.split("/")[-1] 166 | ), "file names must match" 167 | input0 = read_video(input0_file).astype(np.float32) 168 | input1 = read_video(input1_file).astype(np.float32) 169 | 170 | name = input0_file.split("/")[-1] 171 | psnr_value = PSNR(input0, input1) 172 | ssim_value = SSIM(input0, input1) 173 | 174 | psnr_values.append([name, psnr_value]) 175 | ssim_values.append([name, ssim_value]) 176 | print(f"{name} PSNR: {psnr_value}, SSIM: {ssim_value}") 177 | 178 | print(f"mean PSNR: {np.mean([el[-1] for el in psnr_values])}") 179 | print(f"mean SSIM: {np.mean([el[-1] for el in ssim_values])}") 180 | 181 | with open(psnr_filename, "w") as fw: 182 | json.dump(psnr_values, fw) 183 | 184 | with open(ssim_filename, "w") as fw: 185 | json.dump(ssim_values, fw) 186 | 187 | 188 | def main_lpips() -> None: 189 | loss_fn_vgg = lpips.LPIPS(net="vgg").to(args.device).eval() 190 | 191 | vfiles0 = sorted(list(set(glob(str(f"{args.gtpath}/*.{args.ext}"))))) 192 | vfiles1 = sorted(list(set(glob(str(f"{args.targetpath}/*.{args.ext}"))))) 193 | 194 | lpips_filename = f"{args.targetpath}/lpips.csv" 195 | if os.path.exists(lpips_filename): 196 | print(f"{lpips_filename} already exists. Recomputing ...") 197 | 198 | assert len(vfiles0) == len(vfiles1), "video files not match" 199 | 200 | print(f"Calculating LPIPS on {len(vfiles1)} pairs ...") 201 | lpips_values = list() 202 | for i in tqdm(range(len(vfiles0))): 203 | vid0 = read_video(vfiles0[i]) 204 | vid1 = read_video(vfiles1[i]) 205 | 206 | name = vfiles0[i].split("/")[-1] 207 | lpips_value = LPIPS(vid0, vid1, loss_fn_vgg) 208 | lpips_values.append([name, lpips_value]) 209 | 210 | print(f"mean LPIPS: {np.mean([el[-1] for el in lpips_values])}") 211 | 212 | with open(lpips_filename, "w") as fw: 213 | json.dump(lpips_values, fw) 214 | 215 | 216 | def main_fvd(max_n_frame: int = 300) -> None: 217 | fvd_model = FVD("styleganv").to(args.device).double() 218 | 219 | vfiles0 = sorted(list(set(glob(str(f"{args.gtpath}/*.{args.ext}"))))) 220 | vfiles1 = sorted(list(set(glob(str(f"{args.targetpath}/*.{args.ext}"))))) 221 | fvd_filename = f"{args.targetpath}/fvd.csv" 222 | if os.path.exists(fvd_filename): 223 | print(f"{fvd_filename} already exists. Recomputing ...") 224 | 225 | fvd_model.reset() 226 | 227 | assert len(vfiles0) == len(vfiles1), "video files not match" 228 | 229 | print(f"Calculating FVD on {len(vfiles1)} pairs ...") 230 | for i in tqdm(range(len(vfiles0))): 231 | vid0 = read_video(vfiles0[i])[:max_n_frame] 232 | vid1 = read_video(vfiles1[i])[:max_n_frame] 233 | 234 | if vid0.ndim == 3: 235 | vid0, vid1 = np.array([vid0]), np.array([vid1]) 236 | 237 | vid0 = torch.from_numpy(vid0 / 255.0).to(args.device).float() 238 | vid1 = torch.from_numpy(vid1 / 255.0).to(args.device).float() 239 | vid0 = vid0.permute(3, 0, 1, 2).unsqueeze(0) 240 | vid1 = vid1.permute(3, 0, 1, 2).unsqueeze(0) 241 | 242 | fvd_model.update_real_fake_batch(vid0, vid1) 243 | 244 | fvd = fvd_model.compute().item() 245 | print(f"FVD: {fvd}") 246 | 247 | with open(fvd_filename, "w") as fw: 248 | json.dump([fvd], fw) 249 | 250 | 251 | if __name__ == "__main__": 252 | if args.mode.lower() == "psnr" or args.mode.lower() == "all": 253 | main_psnr_ssim() 254 | 255 | if args.mode.lower() == "lpips" or args.mode.lower() == "all": 256 | main_lpips() 257 | 258 | if args.mode.lower() == "fvd" or args.mode.lower() == "all": 259 | main_fvd() 260 | -------------------------------------------------------------------------------- /token_bench/video/list.txt: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | ### BDD100K 17 | 0114bdd0-f317da84.mov 18 | 0129ed1d-d8572c11.mov 19 | 02086f50-c1e5d802.mov 20 | 04ab9fd3-635a600c.mov 21 | 0679098c-8505f50b.mov 22 | 074f7c18-5077b13e.mov 23 | 0acc0c71-c476dd2a.mov 24 | 0b1beb15-18d74ac1.mov 25 | 0b4034a2-769ded45.mov 26 | 10fc5fdb-dfbee909.mov 27 | 1812c350-bbf674fb.mov 28 | 1cef9ae4-f924b2f5.mov 29 | 1d2090ea-c3bba67f.mov 30 | 2106a66a-71b2070c.mov 31 | 2173474d-fe4bcc41.mov 32 | 224d0424-e93a76ba.mov 33 | 23a9d660-db3f344a.mov 34 | 26e31dd6-96569c52.mov 35 | 28e65305-1b99e432.mov 36 | 291dee4f-96af2f0e.mov 37 | 29c9ca26-7d1c3cf1.mov 38 | 2b0cf575-9d9cff53.mov 39 | 2d522831-d089f33b.mov 40 | 2dc12212-ce30cce3.mov 41 | 31a9e4c6-3f8cbfc4.mov 42 | 3584e2a1-a2975190.mov 43 | 360c24e7-0d6b223b.mov 44 | 36429156-0d51c714.mov 45 | 3b3fda3d-e07e7ca2.mov 46 | 45ba36bc-feacc8e3.mov 47 | 4d937412-27135796.mov 48 | 4e62224d-abacf2f4.mov 49 | 516bbf6a-74fb1d01.mov 50 | 51dfaba5-56364809.mov 51 | 54be9082-49bcb9e4.mov 52 | 5804addd-974eca9b.mov 53 | 5980ee70-150f0e17.mov 54 | 5cf0b01a-96b278aa.mov 55 | 609d9e62-2a51f401.mov 56 | 63d043cc-a63afce5.mov 57 | 64505e49-d58ca737.mov 58 | 645f2d38-854fe929.mov 59 | 64d21f66-0fe98085.mov 60 | 653bc721-69d16aba.mov 61 | 68839897-f1b0ddc2.mov 62 | 6a01e6d3-88a8146f.mov 63 | 6b099ca0-fcc176c5.mov 64 | 6b1f5022-643ba354.mov 65 | 6f3376e6-804cae82.mov 66 | 6fc91039-33a3f08c.mov 67 | 718b654d-413e583e.mov 68 | 7513fc19-43906d06.mov 69 | 7742bb4c-06b7ae4b.mov 70 | 78652566-3dfd7b3c.mov 71 | 7a3a5a4a-404486a2.mov 72 | 7a9d6ddb-e8664b82.mov 73 | 7d1a9525-4c2c6e75.mov 74 | 7d773114-2e6ef082.mov 75 | 821e597a-861c39b6.mov 76 | 82be5d01-2b1973c9.mov 77 | 830e5896-4e6f4704.mov 78 | 851f548d-db9d7ca6.mov 79 | 85226849-53900bd1.mov 80 | 882be3d2-1053c510.mov 81 | 88b3b973-355a037f.mov 82 | 8ab4cc62-5ae03755.mov 83 | 90a32478-fd55db45.mov 84 | 9349409c-82841d3b.mov 85 | 97f52cc4-301eb414.mov 86 | 9fc1b74e-aa9bbf62.mov 87 | a064b122-6649e77b.mov 88 | a28cf0f4-d9f7d840.mov 89 | a309d2f1-fe73ec63.mov 90 | a4f18ae7-cb5958a4.mov 91 | a68b0982-1cae0b53.mov 92 | aa60b8fc-e765849f.mov 93 | af20f5ca-ff3f7a18.mov 94 | b7a8e795-36cc1230.mov 95 | ba475d4d-ed2daa1d.mov 96 | bc6a2a77-e9984397.mov 97 | bf80a27d-2933a89a.mov 98 | bf8ff5f5-1133a840.mov 99 | c59500f0-f26be735.mov 100 | c62073d1-3d9adad7.mov 101 | c7292daf-0761f9c4.mov 102 | c8f90ec2-37aa3a28.mov 103 | c96ffe59-25022c0b.mov 104 | ce0f3100-bb6bd505.mov 105 | d45f1e68-d1d9676e.mov 106 | e22d9ea1-ea51bc75.mov 107 | e425ee9e-4860754b.mov 108 | e5cd93c4-41251973.mov 109 | e6fa775c-e402ddf4.mov 110 | e822bbc5-0a373fca.mov 111 | ec28e927-41fd88bd.mov 112 | f2e850fd-d5d2e81e.mov 113 | f7e43a5f-f2f23ba0.mov 114 | f8835c7b-dd780d50.mov 115 | fd6b1286-c444b362.mov 116 | fe189115-cfad8fcf.mov 117 | ### EgoExo-4D 118 | cmu_bike01_2/frame_aligned_videos/aria01_214-1.mp4 119 | cmu_bike01_2/frame_aligned_videos/cam01.mp4 120 | cmu_bike01_3/frame_aligned_videos/aria01_214-1.mp4 121 | cmu_bike01_3/frame_aligned_videos/cam01.mp4 122 | cmu_bike09_4/frame_aligned_videos/aria01_214-1.mp4 123 | cmu_bike09_4/frame_aligned_videos/cam01.mp4 124 | cmu_bike15_1/frame_aligned_videos/aria01_214-1.mp4 125 | cmu_bike15_1/frame_aligned_videos/cam01.mp4 126 | cmu_bike18_2/frame_aligned_videos/aria01_214-1.mp4 127 | cmu_bike18_2/frame_aligned_videos/cam01.mp4 128 | cmu_soccer06_6/frame_aligned_videos/aria01_214-1.mp4 129 | cmu_soccer06_6/frame_aligned_videos/cam01.mp4 130 | cmu_soccer07_4/frame_aligned_videos/aria01_214-1.mp4 131 | cmu_soccer07_4/frame_aligned_videos/cam01.mp4 132 | cmu_soccer08_1/frame_aligned_videos/aria01_214-1.mp4 133 | cmu_soccer08_1/frame_aligned_videos/cam01.mp4 134 | cmu_soccer11_2/frame_aligned_videos/aria01_214-1.mp4 135 | cmu_soccer11_2/frame_aligned_videos/cam01.mp4 136 | cmu_soccer12_2/frame_aligned_videos/aria01_214-1.mp4 137 | cmu_soccer12_2/frame_aligned_videos/cam01.mp4 138 | cmu_soccer14_3/frame_aligned_videos/aria01_214-1.mp4 139 | cmu_soccer14_3/frame_aligned_videos/cam01.mp4 140 | fair_bike_01_14/frame_aligned_videos/aria01_214-1.mp4 141 | fair_bike_01_14/frame_aligned_videos/cam01.mp4 142 | fair_bike_01_16/frame_aligned_videos/aria01_214-1.mp4 143 | fair_bike_01_16/frame_aligned_videos/cam01.mp4 144 | fair_bike_01_2/frame_aligned_videos/aria01_214-1.mp4 145 | fair_bike_01_2/frame_aligned_videos/cam01.mp4 146 | fair_bike_01_3/frame_aligned_videos/aria01_214-1.mp4 147 | fair_bike_01_3/frame_aligned_videos/cam01.mp4 148 | fair_bike_10_4/frame_aligned_videos/aria01_214-1.mp4 149 | fair_bike_10_4/frame_aligned_videos/cam01.mp4 150 | georgiatech_cooking_01_03_2/frame_aligned_videos/aria01_214-1.mp4 151 | georgiatech_cooking_01_03_2/frame_aligned_videos/cam01.mp4 152 | georgiatech_covid_06_11/frame_aligned_videos/aria01_214-1.mp4 153 | georgiatech_covid_06_11/frame_aligned_videos/cam01.mp4 154 | georgiatech_covid_09_4/frame_aligned_videos/aria01_214-1.mp4 155 | georgiatech_covid_09_4/frame_aligned_videos/cam01.mp4 156 | iiith_cooking_100_6/frame_aligned_videos/aria01_214-1.mp4 157 | iiith_cooking_100_6/frame_aligned_videos/cam01.mp4 158 | iiith_cooking_11_1/frame_aligned_videos/aria01_214-1.mp4 159 | iiith_cooking_11_1/frame_aligned_videos/cam01.mp4 160 | iiith_cooking_29_1/frame_aligned_videos/aria01_214-1.mp4 161 | iiith_cooking_29_1/frame_aligned_videos/cam01.mp4 162 | iiith_cooking_55_2/frame_aligned_videos/aria01_214-1.mp4 163 | iiith_cooking_55_2/frame_aligned_videos/cam01.mp4 164 | iiith_cooking_67_6/frame_aligned_videos/aria01_214-1.mp4 165 | iiith_cooking_67_6/frame_aligned_videos/cam01.mp4 166 | iiith_cooking_75_2/frame_aligned_videos/aria01_214-1.mp4 167 | iiith_cooking_75_2/frame_aligned_videos/cam01.mp4 168 | iiith_cooking_89_6/frame_aligned_videos/aria01_214-1.mp4 169 | iiith_cooking_89_6/frame_aligned_videos/cam01.mp4 170 | iiith_soccer_028_4/frame_aligned_videos/aria01_214-1.mp4 171 | iiith_soccer_028_4/frame_aligned_videos/cam01.mp4 172 | iiith_soccer_040_6/frame_aligned_videos/aria01_214-1.mp4 173 | iiith_soccer_040_6/frame_aligned_videos/cam01.mp4 174 | iiith_soccer_048_4/frame_aligned_videos/aria01_214-1.mp4 175 | iiith_soccer_048_4/frame_aligned_videos/cam01.mp4 176 | iiith_soccer_053_2/frame_aligned_videos/aria01_214-1.mp4 177 | iiith_soccer_053_2/frame_aligned_videos/cam01.mp4 178 | indiana_bike_05_11/frame_aligned_videos/aria01_214-1.mp4 179 | indiana_bike_05_11/frame_aligned_videos/cam01.mp4 180 | indiana_bike_09_6/frame_aligned_videos/aria01_214-1.mp4 181 | indiana_bike_09_6/frame_aligned_videos/cam01.mp4 182 | indiana_music_01_5/frame_aligned_videos/aria01_214-1.mp4 183 | indiana_music_01_5/frame_aligned_videos/cam01.mp4 184 | indiana_music_04_3/frame_aligned_videos/aria01_214-1.mp4 185 | indiana_music_04_3/frame_aligned_videos/cam01.mp4 186 | indiana_music_09_5/frame_aligned_videos/aria01_214-1.mp4 187 | indiana_music_09_5/frame_aligned_videos/cam01.mp4 188 | indiana_music_13_4/frame_aligned_videos/aria01_214-1.mp4 189 | indiana_music_13_4/frame_aligned_videos/cam01.mp4 190 | minnesota_rockclimbing_020_10/frame_aligned_videos/aria01_214-1.mp4 191 | minnesota_rockclimbing_020_10/frame_aligned_videos/cam01.mp4 192 | minnesota_rockclimbing_020_12/frame_aligned_videos/aria01_214-1.mp4 193 | minnesota_rockclimbing_020_12/frame_aligned_videos/cam01.mp4 194 | minnesota_rockclimbing_020_16/frame_aligned_videos/aria01_214-1.mp4 195 | minnesota_rockclimbing_020_16/frame_aligned_videos/cam01.mp4 196 | minnesota_rockclimbing_020_30/frame_aligned_videos/aria01_214-1.mp4 197 | minnesota_rockclimbing_020_30/frame_aligned_videos/cam01.mp4 198 | minnesota_rockclimbing_030_24/frame_aligned_videos/aria01_214-1.mp4 199 | minnesota_rockclimbing_030_24/frame_aligned_videos/cam01.mp4 200 | nus_cooking_12_3/frame_aligned_videos/aria01_214-1.mp4 201 | nus_cooking_12_3/frame_aligned_videos/cam01.mp4 202 | nus_cooking_16_2/frame_aligned_videos/aria01_214-1.mp4 203 | nus_cooking_16_2/frame_aligned_videos/cam01.mp4 204 | nus_covidtest_50_1/frame_aligned_videos/aria01_214-1.mp4 205 | nus_covidtest_50_1/frame_aligned_videos/cam01.mp4 206 | nus_covidtest_53_1/frame_aligned_videos/aria01_214-1.mp4 207 | nus_covidtest_53_1/frame_aligned_videos/cam01.mp4 208 | nus_cpr_26_4/frame_aligned_videos/aria01_214-1.mp4 209 | nus_cpr_26_4/frame_aligned_videos/cam01.mp4 210 | nus_cpr_34_3/frame_aligned_videos/aria01_214-1.mp4 211 | nus_cpr_34_3/frame_aligned_videos/cam01.mp4 212 | nus_cpr_38_2/frame_aligned_videos/aria01_214-1.mp4 213 | nus_cpr_38_2/frame_aligned_videos/cam01.mp4 214 | nus_cpr_44_4/frame_aligned_videos/aria01_214-1.mp4 215 | nus_cpr_44_4/frame_aligned_videos/cam01.mp4 216 | sfu_cooking_008_5/frame_aligned_videos/aria01_214-1.mp4 217 | sfu_cooking_008_5/frame_aligned_videos/cam01.mp4 218 | sfu_cooking_010_3/frame_aligned_videos/aria01_214-1.mp4 219 | sfu_cooking_010_3/frame_aligned_videos/cam01.mp4 220 | sfu_cooking_011_3/frame_aligned_videos/aria01_214-1.mp4 221 | sfu_cooking_011_3/frame_aligned_videos/cam01.mp4 222 | sfu_cooking_012_3/frame_aligned_videos/aria01_214-1.mp4 223 | sfu_cooking_012_3/frame_aligned_videos/cam01.mp4 224 | sfu_cooking017_2/frame_aligned_videos/aria01_214-1.mp4 225 | sfu_cooking017_2/frame_aligned_videos/cam01.mp4 226 | sfu_cooking017_8/frame_aligned_videos/aria01_214-1.mp4 227 | sfu_cooking017_8/frame_aligned_videos/cam01.mp4 228 | sfu_cooking022_4/frame_aligned_videos/aria01_214-1.mp4 229 | sfu_cooking022_4/frame_aligned_videos/cam01.mp4 230 | sfu_cooking022_8/frame_aligned_videos/aria01_214-1.mp4 231 | sfu_cooking022_8/frame_aligned_videos/cam01.mp4 232 | sfu_cooking023_8/frame_aligned_videos/aria01_214-1.mp4 233 | sfu_cooking023_8/frame_aligned_videos/cam01.mp4 234 | sfu_cooking028_2/frame_aligned_videos/aria01_214-1.mp4 235 | sfu_cooking028_2/frame_aligned_videos/cam01.mp4 236 | sfu_cooking028_6/frame_aligned_videos/aria01_214-1.mp4 237 | sfu_cooking028_6/frame_aligned_videos/cam01.mp4 238 | sfu_cooking030_4/frame_aligned_videos/aria01_214-1.mp4 239 | sfu_cooking030_4/frame_aligned_videos/cam01.mp4 240 | sfu_covid_009_2/frame_aligned_videos/aria01_214-1.mp4 241 | sfu_covid_009_2/frame_aligned_videos/cam01.mp4 242 | sfu_covid_009_4/frame_aligned_videos/aria01_214-1.mp4 243 | sfu_covid_009_4/frame_aligned_videos/cam01.mp4 244 | sfu_covid_013_4/frame_aligned_videos/aria01_214-1.mp4 245 | sfu_covid_013_4/frame_aligned_videos/cam01.mp4 246 | unc_basketball_02-24-23_01_20/frame_aligned_videos/aria01_214-1.mp4 247 | unc_basketball_02-24-23_01_20/frame_aligned_videos/cam01.mp4 248 | unc_basketball_02-24-23_01_29/frame_aligned_videos/aria01_214-1.mp4 249 | unc_basketball_02-24-23_01_29/frame_aligned_videos/cam01.mp4 250 | unc_basketball_03-16-23_01_37/frame_aligned_videos/aria01_214-1.mp4 251 | unc_basketball_03-16-23_01_37/frame_aligned_videos/cam01.mp4 252 | unc_basketball_03-30-23_01_52/frame_aligned_videos/aria01_214-1.mp4 253 | unc_basketball_03-30-23_01_52/frame_aligned_videos/cam01.mp4 254 | unc_basketball_03-31-23_01_31/frame_aligned_videos/aria01_214-1.mp4 255 | unc_basketball_03-31-23_01_31/frame_aligned_videos/cam01.mp4 256 | unc_soccer_09-21-23_01_21/frame_aligned_videos/aria01_214-1.mp4 257 | unc_soccer_09-21-23_01_21/frame_aligned_videos/cam01.mp4 258 | unc_soccer_09-21-23_01_26/frame_aligned_videos/aria01_214-1.mp4 259 | unc_soccer_09-21-23_01_26/frame_aligned_videos/cam01.mp4 260 | unc_soccer_09-21-23_01_33/frame_aligned_videos/aria01_214-1.mp4 261 | unc_soccer_09-21-23_01_33/frame_aligned_videos/cam01.mp4 262 | unc_soccer_09-21-23_01_4/frame_aligned_videos/aria01_214-1.mp4 263 | unc_soccer_09-21-23_01_4/frame_aligned_videos/cam01.mp4 264 | unc_soccer_09-22-23_01_26/frame_aligned_videos/aria01_214-1.mp4 265 | unc_soccer_09-22-23_01_26/frame_aligned_videos/cam01.mp4 266 | unc_soccer_09-22-23_02_14/frame_aligned_videos/aria01_214-1.mp4 267 | unc_soccer_09-22-23_02_14/frame_aligned_videos/cam01.mp4 268 | uniandes_bouldering_013_16/frame_aligned_videos/aria01_214-1.mp4 269 | uniandes_bouldering_013_16/frame_aligned_videos/cam01.mp4 270 | uniandes_bouldering_017_7/frame_aligned_videos/aria01_214-1.mp4 271 | uniandes_bouldering_017_7/frame_aligned_videos/cam01.mp4 272 | uniandes_bouldering_026_44/frame_aligned_videos/aria01_214-1.mp4 273 | uniandes_bouldering_026_44/frame_aligned_videos/cam01.mp4 274 | uniandes_bouldering_027_59/frame_aligned_videos/aria01_214-1.mp4 275 | uniandes_bouldering_027_59/frame_aligned_videos/cam01.mp4 276 | uniandes_bouldering_031_71/frame_aligned_videos/aria01_214-1.mp4 277 | uniandes_bouldering_031_71/frame_aligned_videos/cam01.mp4 278 | uniandes_bouldering_032_34/frame_aligned_videos/aria01_214-1.mp4 279 | uniandes_bouldering_032_34/frame_aligned_videos/cam01.mp4 280 | uniandes_bouldering_032_60/frame_aligned_videos/aria01_214-1.mp4 281 | uniandes_bouldering_032_60/frame_aligned_videos/cam01.mp4 282 | uniandes_bouldering_034_65/frame_aligned_videos/aria01_214-1.mp4 283 | uniandes_bouldering_034_65/frame_aligned_videos/cam01.mp4 284 | uniandes_dance_012_47/frame_aligned_videos/aria01_214-1.mp4 285 | uniandes_dance_012_47/frame_aligned_videos/cam01.mp4 286 | uniandes_dance_012_6/frame_aligned_videos/aria01_214-1.mp4 287 | uniandes_dance_012_6/frame_aligned_videos/cam01.mp4 288 | uniandes_dance_013_11/frame_aligned_videos/aria01_214-1.mp4 289 | uniandes_dance_013_11/frame_aligned_videos/cam01.mp4 290 | uniandes_dance_013_19/frame_aligned_videos/aria01_214-1.mp4 291 | uniandes_dance_013_19/frame_aligned_videos/cam01.mp4 292 | uniandes_dance_017_23/frame_aligned_videos/aria01_214-1.mp4 293 | uniandes_dance_017_23/frame_aligned_videos/cam01.mp4 294 | uniandes_dance_020_52/frame_aligned_videos/aria01_214-1.mp4 295 | uniandes_dance_020_52/frame_aligned_videos/cam01.mp4 296 | utokyo_cpr_2005_25_2/frame_aligned_videos/aria01_214-1.mp4 297 | utokyo_cpr_2005_25_2/frame_aligned_videos/cam01.mp4 298 | utokyo_cpr_2005_30_2/frame_aligned_videos/aria01_214-1.mp4 299 | utokyo_cpr_2005_30_2/frame_aligned_videos/cam01.mp4 300 | utokyo_cpr_2005_36_2/frame_aligned_videos/aria01_214-1.mp4 301 | utokyo_cpr_2005_36_2/frame_aligned_videos/cam01.mp4 302 | utokyo_pcr_2001_27_6/frame_aligned_videos/aria01_214-1.mp4 303 | utokyo_pcr_2001_27_6/frame_aligned_videos/cam01.mp4 304 | utokyo_pcr_2001_29_2/frame_aligned_videos/aria01_214-1.mp4 305 | utokyo_pcr_2001_29_2/frame_aligned_videos/cam01.mp4 306 | utokyo_pcr_2001_30_2/frame_aligned_videos/aria01_214-1.mp4 307 | utokyo_pcr_2001_30_2/frame_aligned_videos/cam01.mp4 308 | utokyo_pcr_2001_32_2/frame_aligned_videos/aria01_214-1.mp4 309 | utokyo_pcr_2001_32_2/frame_aligned_videos/cam01.mp4 310 | utokyo_pcr_2001_34_2/frame_aligned_videos/aria01_214-1.mp4 311 | utokyo_pcr_2001_34_2/frame_aligned_videos/cam01.mp4 312 | utokyo_pcr_2001_35_6/frame_aligned_videos/aria01_214-1.mp4 313 | utokyo_pcr_2001_35_6/frame_aligned_videos/cam01.mp4 314 | utokyo_soccer_8000_46_47_4/frame_aligned_videos/aria01_214-1.mp4 315 | utokyo_soccer_8000_46_47_4/frame_aligned_videos/cam01.mp4 316 | utokyo_soccer_8000_46_47_6/frame_aligned_videos/aria01_214-1.mp4 317 | utokyo_soccer_8000_46_47_6/frame_aligned_videos/cam01.mp4 318 | ### BridgeData V2 319 | bridge_data_v2/datacol1_toykitchen1/many_skills/00/2023-03-15_13-35-31/raw/traj_group0/traj3/images0 320 | bridge_data_v2/datacol1_toykitchen1/many_skills/02/2023-03-15_14-02-55/raw/traj_group0/traj1/images0 321 | bridge_data_v2/datacol1_toykitchen1/many_skills/04/2023-03-15_14-21-55/raw/traj_group0/traj11/images0 322 | bridge_data_v2/datacol1_toykitchen1/many_skills/05/2023-03-15_14-28-28/raw/traj_group0/traj7/images0 323 | bridge_data_v2/datacol1_toykitchen6/fold_cloth/02/2023-02-15_18-25-24/raw/traj_group0/traj3/images0 324 | bridge_data_v2/datacol1_toykitchen6/fold_cloth/22/2023-03-04_18-42-31/raw/traj_group0/traj2/images0 325 | bridge_data_v2/datacol1_toykitchen6/fold_cloth/27/2023-03-04_19-28-12/raw/traj_group0/traj0/images0 326 | bridge_data_v2/datacol1_toykitchen6/fold_cloth/30/2023-03-04_19-52-50/raw/traj_group0/traj0/images0 327 | bridge_data_v2/datacol1_toykitchen6/pnp_sweep/02/2023-01-25_18-02-11/raw/traj_group0/traj0/images0 328 | bridge_data_v2/datacol1_toykitchen6/pnp_sweep/22/2023-01-26_16-12-08/raw/traj_group0/traj0/images0 329 | bridge_data_v2/datacol1_toykitchen6/pnp_sweep/30/2023-01-31_17-03-56/raw/traj_group0/traj5/images0 330 | bridge_data_v2/datacol1_toykitchen6/pnp_sweep/35/2023-01-31_17-32-18/raw/traj_group0/traj5/images0 331 | bridge_data_v2/datacol1_toykitchen6/pnp_sweep/45/2023-01-31_18-23-54/raw/traj_group0/traj2/images0 332 | bridge_data_v2/datacol2_folding_table/drawer_pnp/00/2023-05-28_16-02-57/raw/traj_group0/traj15/images0 333 | bridge_data_v2/datacol2_folding_table/drawer_pnp/01/2023-05-28_16-12-58/raw/traj_group0/traj2/images0 334 | bridge_data_v2/datacol2_folding_table/drawer_pnp/02/2023-05-28_16-19-57/raw/traj_group0/traj14/images0 335 | bridge_data_v2/datacol2_folding_table/fold_cloth_pnp/01/2023-05-19_12-20-29/raw/traj_group0/traj1/images0 336 | bridge_data_v2/datacol2_folding_table/fold_cloth_pnp/02/2023-05-19_12-36-53/raw/traj_group0/traj2/images0 337 | bridge_data_v2/datacol2_folding_table/fold_cloth_pnp/15/2023-05-22_14-25-33/raw/traj_group0/traj1/images0 338 | bridge_data_v2/datacol2_folding_table/fold_cloth_pnp/22/2023-05-29_11-19-13/raw/traj_group0/traj0/images0 339 | bridge_data_v2/datacol2_folding_table/pnp_push_sweep/01/2023-07-09_10-54-40/raw/traj_group0/traj35/images0 340 | bridge_data_v2/datacol2_folding_table/pnp_push_sweep/02/2023-07-09_11-41-33/raw/traj_group0/traj2/images0 341 | bridge_data_v2/datacol2_folding_table/pnp_push_sweep/04/2023-07-09_13-21-52/raw/traj_group0/traj1/images0 342 | bridge_data_v2/datacol2_folding_table/pnp_push_sweep/05/2023-07-09_13-44-01/raw/traj_group0/traj0/images0 343 | bridge_data_v2/datacol2_folding_table/stack_blocks/00/2023-05-29_10-20-41/raw/traj_group0/traj0/images0 344 | bridge_data_v2/datacol2_folding_table/stack_blocks/01/2023-05-29_10-36-05/raw/traj_group0/traj5/images0 345 | bridge_data_v2/datacol2_folding_table_white_tray/sweep_granular/13/2023-05-28_16-42-53/raw/traj_group0/traj0/images0 346 | bridge_data_v2/datacol2_laundry_machine/pnp_push_sweep/01/2023-07-13_17-47-39/raw/traj_group0/traj1/images0 347 | bridge_data_v2/datacol2_laundry_machine/pnp_push_sweep/04/2023-07-17_16-14-07/raw/traj_group0/traj1/images0 348 | bridge_data_v2/datacol2_laundry_machine/pnp_sweep/01/2023-02-10_11-46-01/raw/traj_group0/traj7/images0 349 | bridge_data_v2/datacol2_laundry_machine/pnp_sweep/02/2023-02-10_11-50-57/raw/traj_group0/traj0/images0 350 | bridge_data_v2/datacol2_laundry_machine/pnp_sweep/22/2023-02-11_18-59-48/raw/traj_group0/traj4/images0 351 | bridge_data_v2/datacol2_laundry_machine/pnp_sweep/27/2023-02-11_19-40-37/raw/traj_group0/traj4/images0 352 | bridge_data_v2/datacol2_laundry_machine/pnp_sweep/30/2023-02-11_20-01-40/raw/traj_group0/traj0/images0 353 | bridge_data_v2/datacol2_tabletop_dark_wood/drawer_pnp/02/2023-04-17_09-54-38/raw/traj_group0/traj0/images0 354 | bridge_data_v2/datacol2_tabletop_dark_wood/drawer_pnp/15/2023-04-18_12-27-29/raw/traj_group0/traj5/images0 355 | bridge_data_v2/datacol2_tabletop_dark_wood/drawer_pnp/22/2023-04-18_13-55-55/raw/traj_group0/traj13/images0 356 | bridge_data_v2/datacol2_tabletop_dark_wood/fold_cloth/02/2023-02-15_13-27-47/raw/traj_group0/traj1/images0 357 | bridge_data_v2/datacol2_tabletop_dark_wood/fold_cloth/22/2023-02-16_18-25-48/raw/traj_group0/traj11/images0 358 | bridge_data_v2/datacol2_tabletop_dark_wood/fold_cloth/30/2023-02-16_19-18-57/raw/traj_group0/traj11/images0 359 | bridge_data_v2/datacol2_tabletop_dark_wood/fold_cloth_pnp/15/2023-05-18_13-04-44/raw/traj_group0/traj1/images0 360 | bridge_data_v2/datacol2_tabletop_dark_wood/fold_cloth_pnp/20/2023-05-18_15-11-10/raw/traj_group0/traj0/images0 361 | bridge_data_v2/datacol2_tabletop_dark_wood/many_skills/02/2023-02-24_13-02-53/raw/traj_group0/traj2/images0 362 | bridge_data_v2/datacol2_tabletop_dark_wood/many_skills/22/2023-03-07_12-31-02/raw/traj_group0/traj3/images0 363 | bridge_data_v2/datacol2_tabletop_dark_wood/many_skills/30/2023-03-07_13-19-59/raw/traj_group0/traj8/images0 364 | bridge_data_v2/datacol2_tabletop_dark_wood/many_skills/35/2023-03-07_13-51-04/raw/traj_group0/traj3/images0 365 | bridge_data_v2/datacol2_tabletop_dark_wood/pnp_push_sweep/01/2023-06-27_14-48-20/raw/traj_group0/traj5/images0 366 | bridge_data_v2/datacol2_tabletop_dark_wood/pnp_push_sweep/15/2023-07-20_17-49-08/raw/traj_group0/traj1/images0 367 | bridge_data_v2/datacol2_tabletop_dark_wood/pnp_push_sweep/18/2023-07-25_18-12-36/raw/traj_group0/traj14/images0 368 | bridge_data_v2/datacol2_toykitchen1/pnp_push_sweep/04/2023-07-05_16-29-29/raw/traj_group0/traj1/images0 369 | bridge_data_v2/datacol2_toykitchen2/many_skills/02/2023-03-08_13-10-52/raw/traj_group0/traj1/images0 370 | bridge_data_v2/datacol2_toykitchen2/many_skills/35/2023-03-09_12-38-48/raw/traj_group0/traj20/images0 371 | bridge_data_v2/datacol2_toykitchen2/many_skills/37/2023-03-09_12-47-28/raw/traj_group0/traj5/images0 372 | bridge_data_v2/datacol2_toykitchen2/pnp_push_sweep/01/2023-07-11_18-51-09/raw/traj_group0/traj0/images0 373 | bridge_data_v2/datacol2_toykitchen2/pnp_push_sweep/02/2023-07-11_19-26-50/raw/traj_group0/traj5/images0 374 | bridge_data_v2/datacol2_toykitchen2/pnp_push_sweep/15/2023-07-27_18-09-15/raw/traj_group0/traj0/images0 375 | bridge_data_v2/datacol2_toykitchen2/pnp_push_sweep/18/2023-07-27_20-16-39/raw/traj_group0/traj0/images0 376 | bridge_data_v2/datacol2_toykitchen2/pnp_push_sweep/20/2023-07-13_14-18-52/raw/traj_group0/traj0/images0 377 | bridge_data_v2/datacol2_toykitchen2/stack_blocks/04/2023-05-08_16-25-30/raw/traj_group0/traj1/images0 378 | bridge_data_v2/datacol2_toykitchen2/stack_blocks/01/2023-05-08_15-05-17/raw/traj_group0/traj1/images0 379 | bridge_data_v2/datacol2_toykitchen2/stack_blocks/02/2023-05-08_15-23-07/raw/traj_group0/traj1/images0 380 | bridge_data_v2/datacol2_toykitchen2/stack_blocks/03/2023-05-08_15-44-57/raw/traj_group0/traj0/images0 381 | bridge_data_v2/datacol2_toykitchen5/pnp_push_sweep/01/2023-07-10_17-16-45/raw/traj_group0/traj3/images0 382 | bridge_data_v2/datacol2_toykitchen5/pnp_push_sweep/02/2023-07-10_18-44-56/raw/traj_group0/traj3/images0 383 | bridge_data_v2/datacol2_toykitchen5/pnp_push_sweep/15/2023-08-01_19-06-43/raw/traj_group0/traj28/images0 384 | bridge_data_v2/datacol2_toykitchen5/pnp_push_sweep/16/2023-08-01_20-23-24/raw/traj_group0/traj5/images0 385 | bridge_data_v2/datacol2_toykitchen7/drawer_pnp/02/2023-04-19_09-37-30/raw/traj_group0/traj4/images0 386 | bridge_data_v2/datacol2_toykitchen7/drawer_pnp/15/2023-04-20_10-37-39/raw/traj_group0/traj0/images0 387 | bridge_data_v2/datacol2_toykitchen7/drawer_pnp/20/2023-04-20_12-18-42/raw/traj_group0/traj14/images0 388 | bridge_data_v2/datacol2_toykitchen7/many_skills/01/2023-04-13_15-03-04/raw/traj_group0/traj0/images0 389 | bridge_data_v2/datacol2_toykitchen7/many_skills/02/2023-04-13_15-08-50/raw/traj_group0/traj4/images0 390 | bridge_data_v2/datacol2_toykitchen7/many_skills/22/2023-04-15_17-05-56/raw/traj_group0/traj8/images0 391 | bridge_data_v2/datacol2_toykitchen7/many_skills/30/2023-04-15_18-58-35/raw/traj_group0/traj3/images0 392 | bridge_data_v2/datacol2_toykitchen7/pnp_push_sweep/01/2023-07-08_11-47-52/raw/traj_group0/traj1/images0 393 | bridge_data_v2/datacol2_toykitchen7/pnp_push_sweep/02/2023-07-08_12-17-07/raw/traj_group0/traj2/images0 394 | bridge_data_v2/datacol2_toykitchen7/pnp_push_sweep/04/2023-07-08_14-20-54/raw/traj_group0/traj4/images0 395 | bridge_data_v2/datacol2_toykitchen7/pnp_push_sweep/05/2023-07-08_14-43-13/raw/traj_group0/traj5/images0 396 | bridge_data_v2/datacol2_toykitchen7/stack_blocks/01/2023-04-28_11-53-36/raw/traj_group0/traj0/images0 397 | bridge_data_v2/datacol2_toykitchen7/stack_blocks/03/2023-05-05_11-07-41/raw/traj_group0/traj0/images0 398 | bridge_data_v2/datacol2_toykitchen7/stack_blocks/04/2023-05-05_11-48-32/raw/traj_group0/traj0/images0 399 | bridge_data_v2/datacol2_toykitchen7/sweep_granular/00/2023-04-20_13-19-55/raw/traj_group0/traj0/images0 400 | bridge_data_v2/datacol2_toykitchen7/sweep_granular/02/2023-04-21_10-08-39/raw/traj_group0/traj0/images0 401 | bridge_data_v2/datacol2_toykitchen7/sweep_granular/04/2023-04-21_11-24-09/raw/traj_group0/traj1/images0 402 | bridge_data_v2/datacol2_toykitchen7/sweep_granular/05/2023-04-21_12-25-07/raw/traj_group0/traj1/images0 403 | bridge_data_v2/datacol2_toysink2/pnp_push_sweep/01/2023-06-13_15-54-35/raw/traj_group0/traj0/images0 404 | bridge_data_v2/datacol2_toysink2/pnp_push_sweep/02/2023-06-13_16-28-14/raw/traj_group0/traj0/images0 405 | bridge_data_v2/datacol2_toysink2/pnp_push_sweep/14/2023-08-08_18-09-18/raw/traj_group0/traj8/images0 406 | bridge_data_v2/datacol2_toysink2/pnp_push_sweep/15/2023-08-08_18-41-53/raw/traj_group0/traj4/images0 407 | bridge_data_v2/deepthought_folding_table/stack_blocks/01/2023-04-27_10-02-50/raw/traj_group0/traj0/images0 408 | bridge_data_v2/deepthought_folding_table/stack_blocks/02/2023-04-27_10-21-26/raw/traj_group0/traj0/images0 409 | bridge_data_v2/deepthought_folding_table/stack_blocks/16/2023-05-03_12-12-04/raw/traj_group0/traj0/images0 410 | bridge_data_v2/deepthought_folding_table/stack_blocks/22/2023-05-05_20-14-40/raw/traj_group0/traj2/images0 411 | bridge_data_v2/deepthought_robot_desk/drawer_pnp/02/2023-04-05_14-00-16/raw/traj_group0/traj5/images0 412 | bridge_data_v2/deepthought_robot_desk/drawer_pnp/22/2023-04-13_12-23-14/raw/traj_group0/traj5/images0 413 | bridge_data_v2/deepthought_robot_desk/drawer_pnp/35/2023-04-13_14-06-53/raw/traj_group0/traj23/images0 414 | bridge_data_v2/deepthought_toykitchen2/stack_blocks/01/2023-05-11_18-00-12/raw/traj_group0/traj0/images0 415 | bridge_data_v2/deepthought_toykitchen2/stack_blocks/02/2023-05-11_18-55-20/raw/traj_group0/traj0/images0 416 | bridge_data_v2/deepthought_toykitchen2/stack_blocks/15/2023-05-12_17-37-44/raw/traj_group0/traj0/images0 417 | bridge_data_v2/deepthought_toykitchen2/stack_blocks/18/2023-05-12_19-10-39/raw/traj_group0/traj0/images0 418 | bridge_data_v2/minsky_folding_table_white_tray/sweep_granular/00/2023-05-26_18-30-07/raw/traj_group0/traj0/images0 419 | ### Panda70M 420 | videoID,url,timestamp,caption,matching_score,desirable_filtering,shot_boundary_detection 421 | 2k_jxJAdGTE,https://www.youtube.com/watch?v=2k_jxJAdGTE,"[('00:01:16.643', '00:01:56.416')]",['A man is standing next to a motorcycle and talking about it.'],['0.454345703125'],['desirable'],"['0:00:00.000', '0:00:39.739']" 422 | -TfD4eKDfbI,https://www.youtube.com/watch?v=-TfD4eKDfbI,"[('00:02:28.715', '00:02:35.522')]",['A woman is washing her hands in a bathroom.'],['0.4306640625'],['desirable'],"['0:00:00.000', '0:00:06.773']" 423 | -tAvY8gW8rU,https://www.youtube.com/watch?v=-tAvY8gW8rU,"[('00:04:02.720', '00:04:05.640')]",['The person is wearing a suit and walking up some stairs.'],['0.46728515625'],['desirable'],"['0:00:00.000', '0:00:02.880']" 424 | 0-aW3PKnj8E,https://www.youtube.com/watch?v=0-aW3PKnj8E,"[('00:00:46.413', '00:01:00.593')]",['A man in a suit is talking to the camera while holding a microphone.'],['0.44970703125'],['desirable'],"['0:00:00.000', '0:00:14.147']" 425 | 3UD7v_8lPv4,https://www.youtube.com/watch?v=3UD7v_8lPv4,"[('00:02:15.602', '00:02:18.204')]",['A person is fishing with a rod and reel on a boat in the ocean.'],['0.4453125'],['desirable'],"['0:00:00.000', '0:00:02.569']" 426 | 3Dgpwh5A8R8,https://www.youtube.com/watch?v=3Dgpwh5A8R8,"[('00:00:00.633', '00:00:05.900')]",['A man in a gray shirt standing next to a sink.'],['0.443603515625'],['desirable'],"['0:00:00.000', '0:00:05.233']" 427 | 3Q__bVY6wtk,https://www.youtube.com/watch?v=3Q__bVY6wtk,"[('00:06:01.027', '00:06:05.073')]","['A man wearing a baseball cap, sunglasses, and a chain walks into a locker room.']",['0.453369140625'],['desirable'],"['0:00:00.000', '0:00:04.045']" 428 | 3IIZmiGivRA,https://www.youtube.com/watch?v=3IIZmiGivRA,"[('00:00:00.480', '00:00:04.360')]",['People are sitting at tables in a room at a meeting.'],['0.43896484375'],['desirable'],"['0:00:00.000', '0:00:03.840']" 429 | -fI-ogq9OC8,https://www.youtube.com/watch?v=-fI-ogq9OC8,"[('00:00:17.684', '00:00:29.662')]",['A man in a blue jacket is talking on a microphone in front of a sign.'],['0.478271484375'],['desirable'],"['0:00:00.000', '0:00:11.945']" 430 | 1ySlUbU_p_0,https://www.youtube.com/watch?v=1ySlUbU_p_0,"[('00:00:43.710', '00:00:48.681')]",['A man is fishing from a boat on a lake and casting his line.'],['0.456787109375'],['desirable'],"['0:00:00.000', '0:00:04.938']" 431 | 09iAWc3WroQ,https://www.youtube.com/watch?v=09iAWc3WroQ,"[('00:01:20.380', '00:02:08.428')]",['A person is wrapping a piece of paper around a pillow.'],['0.4365234375'],['desirable'],"['0:00:00.000', '0:00:48.014']" 432 | 2Wn6E_Vp-30,https://www.youtube.com/watch?v=2Wn6E_Vp-30,"[('00:01:41.251', '00:01:54.564')]","['The speaker is sitting in front of a microphone, wearing a brown suit and a blue tie.']",['0.4443359375'],['desirable'],"['0:00:00.000', '0:00:13.296']" 433 | 351DxQghbh0,https://www.youtube.com/watch?v=351DxQghbh0,"[('00:01:22.874', '00:01:31.966')]",['There are different types of earphones on display in a shop.'],['0.44580078125'],['2_tiny_camera_movement'],"['0:00:00.000', '0:00:09.050']" 434 | 1ySlUbU_p_0,https://www.youtube.com/watch?v=1ySlUbU_p_0,"[('00:00:25.125', '00:00:36.636')]",['The man is fishing from a boat and using a trolling motor with an i-pilot system.'],['0.448974609375'],['desirable'],"['0:00:00.000', '0:00:11.478']" 435 | 28hKF2Rhebs,https://www.youtube.com/watch?v=28hKF2Rhebs,"[('00:02:05.600', '00:02:15.200')]","[""A person's hand is pointing a finger at a tree in a backyard of a house with grass and a fence.""]",['0.477783203125'],['desirable'],"['0:00:00.000', '0:00:09.566']" 436 | 2Wn6E_Vp-30,https://www.youtube.com/watch?v=2Wn6E_Vp-30,"[('00:07:27.630', '00:07:51.204')]",['A man with a bald head wearing a blue shirt and tie is talking to someone.'],['0.44091796875'],['desirable'],"['0:00:00.000', '0:00:23.573']" 437 | 3FYivm45mjs,https://www.youtube.com/watch?v=3FYivm45mjs,"[('00:00:09.909', '00:00:12.312')]",['A man and a woman talking to the media.'],['0.45654296875'],['desirable'],"['0:00:00.000', '0:00:00.767']" 438 | 2sZRw6a2rT0,https://www.youtube.com/watch?v=2sZRw6a2rT0,"[('00:01:12.000', '00:01:15.400')]",['The user is cooking quinoa on a stovetop using a frying pan.'],['0.463134765625'],['desirable'],"['0:00:00.000', '0:00:03.360']" 439 | -ZsxlUiiOdI,https://www.youtube.com/watch?v=-ZsxlUiiOdI,"[('00:00:56.523', '00:01:23.083')]",['A person using an orange piece of tape to hold a marker.'],['0.4462890625'],['desirable'],"['0:00:00.000', '0:00:06.139']" 440 | 351DxQghbh0,https://www.youtube.com/watch?v=351DxQghbh0,"[('00:00:36.536', '00:00:41.166')]",['A person sitting at a desk with a crown in front of it.'],['0.464111328125'],['desirable'],"['0:00:00.000', '0:00:04.587']" 441 | 22U8P7ZfG8g,https://www.youtube.com/watch?v=22U8P7ZfG8g,"[('00:04:29.143', '00:04:31.604')]",['There is a man and woman standing in front of an open door in a dark room.'],['0.44677734375'],['desirable'],"['0:00:00.000', '0:00:02.460']" 442 | 1tUgMixon2w,https://www.youtube.com/watch?v=1tUgMixon2w,"[('00:03:00.547', '00:03:11.124')]",['A person is preparing meatballs in a metal bowl on a stainless steel tray.'],['0.47314453125'],['desirable'],"['0:00:00.000', '0:00:05.005']" 443 | -qZM-qxiMQk,https://www.youtube.com/watch?v=-qZM-qxiMQk,"[('00:00:44.440', '00:00:50.480')]",['A man and a woman are standing in a kitchen and talking to each other.'],['0.470947265625'],['desirable'],"['0:00:00.000', '0:00:06.000']" 444 | 28Fx5bVejho,https://www.youtube.com/watch?v=28Fx5bVejho,"[('00:05:47.708', '00:05:53.166')]",['A woman is organizing her craft supplies in a drawer.'],['0.450439453125'],['desirable'],"['0:00:00.000', '0:00:05.416']" 445 | -XPurrmMnJg,https://www.youtube.com/watch?v=-XPurrmMnJg,"[('00:01:29.856', '00:01:33.893')]",['A person is using a paintbrush to paint an object.'],['0.453125'],['desirable'],"['0:00:00.000', '0:00:04.004']" 446 | 0HujyOjjpZY,https://www.youtube.com/watch?v=0HujyOjjpZY,"[('00:00:48.339', '00:00:51.426')]",['A deer is hiding in the woods among trees covered in snow.'],['0.45703125'],['desirable'],"['0:00:00.000', '0:00:03.044']" 447 | 3cIsikJDcdA,https://www.youtube.com/watch?v=3cIsikJDcdA,"[('00:00:05.666', '00:00:08.333')]",['A group of people playing basketball in a gym.'],['0.430419921875'],['desirable'],"['0:00:00.000', '0:00:02.633']" 448 | 0G3oIqrJync,https://www.youtube.com/watch?v=0G3oIqrJync,"[('00:00:00.433', '00:00:03.937')]",['There are two glasses of chocolate pudding with whipped cream on a table.'],['0.48291015625'],['1_still_foreground_image'],"['0:00:00.000', '0:00:03.503']" 449 | -qZM-qxiMQk,https://www.youtube.com/watch?v=-qZM-qxiMQk,"[('00:02:17.080', '00:02:22.440')]",['A woman with red hair blending food in a blender.'],['0.456787109375'],['desirable'],"['0:00:00.000', '0:00:03.360']" 450 | 28Fx5bVejho,https://www.youtube.com/watch?v=28Fx5bVejho,"[('00:08:26.916', '00:08:30.458')]",['A white closet with a drawer full of clothes.'],['0.442626953125'],['desirable'],"['0:00:00.000', '0:00:02.666']" 451 | 2m3AG4x2bL8,https://www.youtube.com/watch?v=2m3AG4x2bL8,"[('00:00:51.551', '00:00:57.724')]",['A woman sitting in a chair talking to a man on a television show.'],['0.478515625'],['desirable'],"['0:00:00.000', '0:00:06.139']" 452 | 351DxQghbh0,https://www.youtube.com/watch?v=351DxQghbh0,"[('00:06:36.187', '00:06:40.024')]",['A wall of electronics and wires in plastic boxes.'],['0.47412109375'],['0_low_desirable_score'],"['0:00:00.000', '0:00:03.795']" 453 | 09GG1VGyFsg,https://www.youtube.com/watch?v=09GG1VGyFsg,"[('00:03:16.040', '00:03:20.840')]","['A man and a woman are preparing a dish in a kitchen, they are adding seasoning to a broth and making sure the heat is low.']",['0.43603515625'],['desirable'],"['0:00:00.000', '0:00:04.760']" 454 | 2qrAcoaQzDg,https://www.youtube.com/watch?v=2qrAcoaQzDg,"[('00:00:56.181', '00:01:13.198')]",['A man is mowing his lawn in the driveway of his home.'],['0.4326171875'],['desirable'],"['0:00:00.000', '0:00:01.376']" 455 | 2qrAcoaQzDg,https://www.youtube.com/watch?v=2qrAcoaQzDg,"[('00:00:37.537', '00:00:39.873')]",['A man pushing a red lawn mower in front of a house.'],['0.45068359375'],['desirable'],"['0:00:00.000', '0:00:02.293']" 456 | 0Ai3dTTGvkc,https://www.youtube.com/watch?v=0Ai3dTTGvkc,"[('00:07:48.601', '00:08:02.014')]",['There is a person standing on a staircase in a narrow canyon with walls made of stone.'],['0.45361328125'],['desirable'],"['0:00:00.000', '0:00:13.380']" 457 | 3VcdJlrUrSw,https://www.youtube.com/watch?v=3VcdJlrUrSw,"[('00:00:04.120', '00:00:08.120')]",['A man is walking down an escalator in an airport at night.'],['0.447998046875'],['desirable'],"['0:00:00.000', '0:00:03.960']" 458 | -ZsxlUiiOdI,https://www.youtube.com/watch?v=-ZsxlUiiOdI,"[('00:00:00.433', '00:00:04.137')]",['A person holding a small toy army on the floor.'],['0.4580078125'],['desirable'],"['0:00:00.000', '0:00:03.703']" 459 | 2m3AG4x2bL8,https://www.youtube.com/watch?v=2m3AG4x2bL8,"[('00:02:27.514', '00:02:32.619')]",['A person sitting in a chair talking to a woman on a show with a full moon in the background.'],['0.49609375'],['desirable'],"['0:00:00.000', '0:00:05.071']" 460 | -ZsxlUiiOdI,https://www.youtube.com/watch?v=-ZsxlUiiOdI,"[('00:00:05.205', '00:00:10.310')]",['There are toy soldiers and tanks on a rug.'],['0.46630859375'],['desirable'],"['0:00:00.000', '0:00:03.870']" 461 | 2k_jxJAdGTE,https://www.youtube.com/watch?v=2k_jxJAdGTE,"[('00:01:07.167', '00:01:11.204')]",['A man is standing next to a motorcycle and talking to someone.'],['0.4638671875'],['desirable'],"['0:00:00.000', '0:00:04.004']" 462 | 38OE63uRI5k,https://www.youtube.com/watch?v=38OE63uRI5k,"[('00:00:13.833', '00:00:17.833')]","['A car is driving on a dirt road in the mountains, passing by trees and hills.']",['0.453125'],['desirable'],"['0:00:00.000', '0:00:01.250']" 463 | 2m3AG4x2bL8,https://www.youtube.com/watch?v=2m3AG4x2bL8,"[('00:00:01.401', '00:00:12.846')]",['A man wearing a suit and tie is talking.'],['0.4443359375'],['desirable'],"['0:00:00.000', '0:00:11.411']" 464 | -XPurrmMnJg,https://www.youtube.com/watch?v=-XPurrmMnJg,"[('00:00:33.433', '00:00:37.971')]",['A man standing next to a large pot with steam coming out of it.'],['0.45947265625'],['desirable'],"['0:00:00.000', '0:00:04.537']" 465 | 09GG1VGyFsg,https://www.youtube.com/watch?v=09GG1VGyFsg,"[('00:04:28.240', '00:04:40.800')]","['A man and a woman are standing in a kitchen, smiling at each other as they prepare food together.']",['0.453857421875'],['desirable'],"['0:00:00.000', '0:00:12.520']" 466 | 228zX1vL9bY,https://www.youtube.com/watch?v=228zX1vL9bY,"[('00:00:13.179', '00:00:21.062')]",['Three people are sitting at a table and discussing something.'],['0.47412109375'],['desirable'],"['0:00:00.000', '0:00:07.841']" 467 | 1tUgMixon2w,https://www.youtube.com/watch?v=1tUgMixon2w,"[('00:03:20.200', '00:03:27.874')]",['A tray of meatballs on top of a wooden table.'],['0.45947265625'],['desirable'],"['0:00:00.000', '0:00:01.901']" 468 | -vmdbpEnPik,https://www.youtube.com/watch?v=-vmdbpEnPik,"[('00:01:07.240', '00:01:09.800')]",['There is a soccer game on the stadium and the players are clapping to the audience.'],['0.453125'],['desirable'],"['0:00:00.000', '0:00:02.520']" 469 | 2MvkB0qNPpk,https://www.youtube.com/watch?v=2MvkB0qNPpk,"[('00:08:16.696', '00:08:28.174')]",['A man kneeling down next to a large lizard.'],['0.472900390625'],['desirable'],"['0:00:00.000', '0:00:11.444']" 470 | -MiZb4F8mis,https://www.youtube.com/watch?v=-MiZb4F8mis,"[('00:03:14.827', '00:03:22.835')]",['There is a greenhouse in the garden with grass on the ground.'],['0.443115234375'],['desirable'],"['0:00:00.000', '0:00:07.974']" 471 | 2MvkB0qNPpk,https://www.youtube.com/watch?v=2MvkB0qNPpk,"[('00:00:19.753', '00:01:07.801')]",['A man in a blue t-shirt is standing in front of shelves filled with plastic containers and he is talking to the camera.'],['0.50146484375'],['desirable'],"['0:00:00.000', '0:00:48.014']" 472 | 06akl7o7O2s,https://www.youtube.com/watch?v=06akl7o7O2s,"[('00:00:06.633', '00:00:15.700')]","['The man is sitting at a desk with a whiteboard behind him, he is wearing glasses and has a striped shirt on.']",['0.462646484375'],['desirable'],"['0:00:00.000', '0:00:09.033']" 473 | 2sZRw6a2rT0,https://www.youtube.com/watch?v=2sZRw6a2rT0,"[('00:00:46.360', '00:00:49.080')]",['A woman is cooking food in a white pot on a gas stove.'],['0.447021484375'],['desirable'],"['0:00:00.000', '0:00:02.680']" 474 | -EgW0xV1fsE,https://www.youtube.com/watch?v=-EgW0xV1fsE,"[('00:08:33.100', '00:08:37.100')]",['Two people holding two iphones next to each other.'],['0.478759765625'],['desirable'],"['0:00:00.000', '0:00:03.966']" 475 | 25BOri2peMQ,https://www.youtube.com/watch?v=25BOri2peMQ,"[('00:00:05.939', '00:00:53.720')]","[""It's a winter wonderland with snow on the ground, and there are cars parked on the side of the road.""]",['0.46728515625'],['desirable'],"['0:00:00.000', '0:00:47.747']" 476 | -XPurrmMnJg,https://www.youtube.com/watch?v=-XPurrmMnJg,"[('00:00:47.213', '00:00:52.852')]",['A person is making paper pulp in a factory.'],['0.43408203125'],['desirable'],"['0:00:00.000', '0:00:01.401']" 477 | 3Y8SvZUZ30U,https://www.youtube.com/watch?v=3Y8SvZUZ30U,"[('00:05:18.251', '00:05:49.248')]",['A woman of arab descent is applying makeup using a beauty blender sponge.'],['0.49169921875'],['desirable'],"['0:00:00.000', '0:00:30.964']" 478 | 25BOri2peMQ,https://www.youtube.com/watch?v=25BOri2peMQ,"[('00:02:57.544', '00:03:09.522')]",['There is a wooden building with glass windows and doors.'],['0.462890625'],['desirable'],"['0:00:00.000', '0:00:11.945']" 479 | 218_U-77XmM,https://www.youtube.com/watch?v=218_U-77XmM,"[('00:02:55.742', '00:02:59.779')]",['Person is using a white substance to create a sculpture on a wooden surface.'],['0.487060546875'],['desirable'],"['0:00:00.000', '0:00:01.301']" 480 | -fI-ogq9OC8,https://www.youtube.com/watch?v=-fI-ogq9OC8,"[('00:00:37.804', '00:00:41.207')]","['A man is fishing from the shore of a lake, using a fishing rod. he is wearing a black jacket and is standing on the grassy bank of the lake.']",['0.468994140625'],['desirable'],"['0:00:00.000', '0:00:03.370']" 481 | 228zX1vL9bY,https://www.youtube.com/watch?v=228zX1vL9bY,"[('00:00:31.614', '00:00:35.285')]","['A group of people are sitting around a black table in a dark room, and they appear to be discussing something.']",['0.46923828125'],['desirable'],"['0:00:00.000', '0:00:03.670']" 482 | 2qrAcoaQzDg,https://www.youtube.com/watch?v=2qrAcoaQzDg,"[('00:02:13.842', '00:02:17.178')]",['A man is standing in a garage and talking to the camera.'],['0.485595703125'],['desirable'],"['0:00:00.000', '0:00:03.294']" 483 | 28hKF2Rhebs,https://www.youtube.com/watch?v=28hKF2Rhebs,"[('00:08:59.733', '00:09:47.733')]",['A person touching a wooden fence with their finger.'],['0.475341796875'],['desirable'],"['0:00:00.000', '0:00:44.466']" 484 | 1tUgMixon2w,https://www.youtube.com/watch?v=1tUgMixon2w,"[('00:00:19.686', '00:00:42.142')]",['A chef is standing in front of a kitchen and talking to the camera.'],['0.471435546875'],['desirable'],"['0:00:00.000', '0:00:22.422']" 485 | -TfD4eKDfbI,https://www.youtube.com/watch?v=-TfD4eKDfbI,"[('00:02:04.357', '00:02:10.196')]",['A person is cleaning a kitchen drawer with a damp cloth.'],['0.439208984375'],['desirable'],"['0:00:00.000', '0:00:05.805']" 486 | 39Mlhp_z6JA,https://www.youtube.com/watch?v=39Mlhp_z6JA,"[('00:00:31.164', '00:00:34.134')]",['A woman is sitting in front of a news station.'],['0.4345703125'],['desirable'],"['0:00:00.000', '0:00:02.936']" 487 | 2MvkB0qNPpk,https://www.youtube.com/watch?v=2MvkB0qNPpk,"[('00:03:58.905', '00:04:00.974')]","['A group of men are standing in front of each other, with one man holding his arm out.']",['0.46484375'],['desirable'],"['0:00:00.000', '0:00:02.035']" 488 | 25BOri2peMQ,https://www.youtube.com/watch?v=25BOri2peMQ,"[('00:03:12.192', '00:03:22.101')]",['A large window with a reflection of a bridge and mountains in the glass.'],['0.487548828125'],['desirable'],"['0:00:00.000', '0:00:09.876']" 489 | 3K8zBpQr4Vc,https://www.youtube.com/watch?v=3K8zBpQr4Vc,"[('00:00:11.920', '00:00:14.760')]",['A middle-aged asian woman wearing glasses is smiling and talking to the camera.'],['0.473876953125'],['desirable'],"['0:00:00.000', '0:00:02.800']" 490 | 081ILYBU1pw,https://www.youtube.com/watch?v=081ILYBU1pw,"[('00:00:42.866', '00:00:47.866')]",['The person is cutting a piece of fabric on a rotating cutting mat.'],['0.456787109375'],['desirable'],"['0:00:00.000', '0:00:04.966']" 491 | -q7jbZkOa0o,https://www.youtube.com/watch?v=-q7jbZkOa0o,"[('00:02:19.347', '00:02:23.393')]",['A black and white dog is laying on a bed with a blanket on the floor and a ball under his paw.'],['0.4560546875'],['desirable'],"['0:00:00.000', '0:00:04.004']" 492 | 22U8P7ZfG8g,https://www.youtube.com/watch?v=22U8P7ZfG8g,"[('00:06:06.616', '00:06:11.537')]",['There is a person in a blue dress walking in a forest and there is a fallen tree in the forest.'],['0.47119140625'],['desirable'],"['0:00:00.000', '0:00:04.879']" 493 | -vmdbpEnPik,https://www.youtube.com/watch?v=-vmdbpEnPik,"[('00:01:30.600', '00:01:38.960')]","['A soccer game is being played on the field, and the players are wearing different colors such as red and white jerseys.']",['0.44482421875'],['desirable'],"['0:00:00.000', '0:00:03.480']" 494 | 28Fx5bVejho,https://www.youtube.com/watch?v=28Fx5bVejho,"[('00:04:43.833', '00:05:03.833')]",['A woman is putting nail polish bottles into a drawer with dividers.'],['0.450439453125'],['desirable'],"['0:00:00.000', '0:00:01.166']" 495 | -BA_ifeRHxs,https://www.youtube.com/watch?v=-BA_ifeRHxs,"[('00:01:01.933', '00:01:17.666')]",['There is a dish of lasagna in a rectangular baking dish with tomato sauce and cheese.'],['0.45068359375'],['0_low_desirable_score'],"['0:00:00.000', '0:00:15.700']" 496 | 081ILYBU1pw,https://www.youtube.com/watch?v=081ILYBU1pw,"[('00:08:27.200', '00:08:39.200')]",['A person is folding a piece of white cloth on a table.'],['0.47119140625'],['desirable'],"['0:00:00.000', '0:00:11.966']" 497 | 1ySlUbU_p_0,https://www.youtube.com/watch?v=1ySlUbU_p_0,"[('00:00:40.240', '00:00:42.809')]",['There is a fish finder on a boat and it is displaying the depth of the water in real-time.'],['0.448974609375'],['desirable'],"['0:00:00.000', '0:00:02.535']" 498 | 09GG1VGyFsg,https://www.youtube.com/watch?v=09GG1VGyFsg,"[('00:05:39.760', '00:05:42.040')]",['A white bowl filled with vegetables on a wooden cutting board.'],['0.454833984375'],['desirable'],"['0:00:00.000', '0:00:02.240']" 499 | 0C4oY_6A4co,https://www.youtube.com/watch?v=0C4oY_6A4co,"[('00:00:23.040', '00:00:32.600')]",['There are people walking on a street and some of them are carrying shopping bags.'],['0.450927734375'],['desirable'],"['0:00:00.000', '0:00:05.320']" 500 | 218_U-77XmM,https://www.youtube.com/watch?v=218_U-77XmM,"[('00:03:12.659', '00:03:25.271')]","['The person is holding a white object that appears to be made of clay or plaster, and they are using their hands to shape it into a sphere.']",['0.46533203125'],['desirable'],"['0:00:00.000', '0:00:12.579']" 501 | 3S-7-KeCpNE,https://www.youtube.com/watch?v=3S-7-KeCpNE,"[('00:00:47.366', '00:00:52.066')]",['A person is cooking food in a frying pan on a gas stove.'],['0.44580078125'],['desirable'],"['0:00:00.000', '0:00:04.666']" 502 | 36k4x2ExEJQ,https://www.youtube.com/watch?v=36k4x2ExEJQ,"[('00:00:02.068', '00:00:18.651')]",['A woman talking to a man in front of a white van.'],['0.46142578125'],['desirable'],"['0:00:00.000', '0:00:16.549']" 503 | -tAvY8gW8rU,https://www.youtube.com/watch?v=-tAvY8gW8rU,"[('00:03:05.000', '00:03:11.680')]",['A woman is using her hands to decorate a cake with frosting.'],['0.43310546875'],['desirable'],"['0:00:00.000', '0:00:06.640']" 504 | 22U8P7ZfG8g,https://www.youtube.com/watch?v=22U8P7ZfG8g,"[('00:04:32.272', '00:04:35.566')]",['A man and a woman are having a conversation.'],['0.461669921875'],['desirable'],"['0:00:00.000', '0:00:03.253']" 505 | 2Wn6E_Vp-30,https://www.youtube.com/watch?v=2Wn6E_Vp-30,"[('00:04:59.882', '00:05:10.526')]",['A woman with glasses is talking to a man at an event.'],['0.448974609375'],['desirable'],"['0:00:00.000', '0:00:10.643']" 506 | -q7jbZkOa0o,https://www.youtube.com/watch?v=-q7jbZkOa0o,"[('00:02:39.200', '00:02:43.246')]",['A black and white dog is chewing on a toy underneath a wooden table.'],['0.469970703125'],['desirable'],"['0:00:00.000', '0:00:04.004']" 507 | -MiZb4F8mis,https://www.youtube.com/watch?v=-MiZb4F8mis,"[('00:02:46.866', '00:02:50.904')]",['A person is pointing to a box under a table.'],['0.4609375'],['desirable'],"['0:00:00.000', '0:00:04.037']" 508 | BeNwloO546Y,https://www.youtube.com/watch?v=BeNwloO546Y,"[('00:01:05.965', '00:01:08.535')]",['A couple of people standing on a ladder on top of an adobe building.'],['0.44384765625'],['desirable'],"['0:00:00.000', '0:00:02.569']" 509 | 2k_jxJAdGTE,https://www.youtube.com/watch?v=2k_jxJAdGTE,"[('00:00:08.708', '00:00:50.383')]",['A man is standing next to a black motorcycle in a parking lot.'],['0.459228515625'],['desirable'],"['0:00:00.000', '0:00:41.641']" 510 | -qZM-qxiMQk,https://www.youtube.com/watch?v=-qZM-qxiMQk,"[('00:02:01.680', '00:02:14.840')]","['A young woman is standing in front of a blender in a kitchen, preparing a smoothie.']",['0.44677734375'],['desirable'],"['0:00:00.000', '0:00:06.160']" 511 | -TfD4eKDfbI,https://www.youtube.com/watch?v=-TfD4eKDfbI,"[('00:00:00.600', '00:00:05.672')]","['A woman is standing in front of a kitchen counter, holding a spray bottle filled with cleaning solution and talking to the camera.']",['0.456787109375'],['desirable'],"['0:00:00.000', '0:00:05.038']" 512 | -vmdbpEnPik,https://www.youtube.com/watch?v=-vmdbpEnPik,"[('00:00:25.240', '00:00:35.040')]",['A soccer game is being played on a stadium and the players are running on the field.'],['0.4755859375'],['desirable'],"['0:00:00.000', '0:00:07.880']" 513 | 2sZRw6a2rT0,https://www.youtube.com/watch?v=2sZRw6a2rT0,"[('00:00:34.920', '00:00:37.520')]",['A woman is cooking food in a frying pan on a stove.'],['0.438232421875'],['desirable'],"['0:00:00.000', '0:00:02.560']" 514 | 0-aW3PKnj8E,https://www.youtube.com/watch?v=0-aW3PKnj8E,"[('00:01:29.923', '00:01:33.993')]",['A man in a suit and tie is talking into a microphone.'],['0.466796875'],['desirable'],"['0:00:00.000', '0:00:04.037']" 515 | 28hKF2Rhebs,https://www.youtube.com/watch?v=28hKF2Rhebs,"[('00:10:08.666', '00:10:24.766')]",['There is a fence with a gate in the foreground and a house in the background.'],['0.43994140625'],['desirable'],"['0:00:00.000', '0:00:13.666']" 516 | 3FHyqjEMsN4,https://www.youtube.com/watch?v=3FHyqjEMsN4,"[('00:01:54.760', '00:02:01.600')]",['There is a game of rugby being played in a stadium with many people watching.'],['0.45556640625'],['desirable'],"['0:00:00.000', '0:00:06.800']" 517 | 3bLno9ZGFUU,https://www.youtube.com/watch?v=3bLno9ZGFUU,"[('00:00:50.500', '00:01:03.266')]",['The person is walking into a modern kitchen and pointing out various features of the room.'],['0.440673828125'],['desirable'],"['0:00:00.000', '0:00:12.733']" 518 | -6r04gCLMWw,https://www.youtube.com/watch?v=-6r04gCLMWw,"[('00:01:31.600', '00:01:41.560')]","['The person is holding a flip wallet case for the samsung galaxy s5 mini, and they are reviewing it in a vlog.']",['0.440185546875'],['desirable'],"['0:00:00.000', '0:00:09.920']" 519 | -fI-ogq9OC8,https://www.youtube.com/watch?v=-fI-ogq9OC8,"[('00:01:02.829', '00:01:05.765')]","['People are walking in the rain on a city street, and there is water flowing from a drain.']",['0.432861328125'],['desirable'],"['0:00:00.000', '0:00:02.902']" 520 | 0-aFxUUa04o,https://www.youtube.com/watch?v=0-aFxUUa04o,"[('00:00:00.733', '00:00:06.700')]",['A news tv screen with a woman on it.'],['0.45166015625'],['desirable'],"['0:00:00.000', '0:00:05.933']" 521 | -------------------------------------------------------------------------------- /token_bench/video/preprocessing_script.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import os 17 | import imageio 18 | import numpy as np 19 | from glob import glob 20 | import mediapy as media 21 | 22 | 23 | def resize_video(video: np.ndarray, short_size: int = None) -> np.ndarray: 24 | """Resizes a video to have the short side of `short_size`.""" 25 | if short_size is None: 26 | return video 27 | height, width = video.shape[-3:-1] 28 | if height <= width: 29 | height_new, width_new = short_size, int(width * short_size / height + 0.5) 30 | else: 31 | height_new, width_new = int(height * short_size / width + 0.5), short_size 32 | return media.resize_video(video, shape=(height_new, width_new)) 33 | 34 | 35 | raw_video_dir = "/root/dataset" 36 | 37 | input_pattern = raw_video_dir + "/%s/*.%s" 38 | benchmarks = ["bdd_100", "egoexo4D", "panda", "bridgev2"] 39 | exts = ["mov", "mp4", "mp4", "mp4"] 40 | for benchmark, ext in zip(benchmarks, exts): 41 | input_files = sorted(glob(str(input_pattern % (benchmark, ext)))) 42 | print( 43 | "Processing", len(input_files), "videos for", input_pattern % (benchmark, ext) 44 | ) 45 | for jdx, video_file in enumerate(input_files): 46 | video_reader = imageio.get_reader(video_file, ext) 47 | video_frames = [] 48 | for frame in video_reader: 49 | video_frames.append(frame) 50 | 51 | input_video, meta_data = np.array(video_frames), video_reader.get_meta_data() 52 | 53 | video_fps = meta_data["fps"] 54 | video_duration = meta_data["duration"] 55 | input_video = np.array(input_video) 56 | T, H, W, C = input_video.shape 57 | print("loaded", video_file, "with", (T, H, W)) 58 | # clip the videos to 10 seconds if they are longer 59 | num_frame_thres = max(int(np.ceil(video_fps * 10)), 300) 60 | output_video = ( 61 | input_video[:num_frame_thres] if T > num_frame_thres else input_video 62 | ) 63 | del input_video 64 | # resize the videos to 1080p if needed 65 | output_video = ( 66 | resize_video(output_video, 1080) if min(H, W) > 1080 else output_video 67 | ) 68 | print((T, H, W, C), "resized to", output_video.shape) 69 | video_file_tokenbench = video_file.replace( 70 | f"/dataset/{benchmark}/", f"/dataset/tokenbench/{benchmark}_" 71 | ).replace(f".{ext}", ".mp4") 72 | os.makedirs(os.path.dirname(video_file_tokenbench), exist_ok=True) 73 | print("writing to ...", video_file_tokenbench) 74 | media.write_video(video_file_tokenbench, output_video, fps=video_fps) 75 | del output_video 76 | --------------------------------------------------------------------------------