├── finegrained-breakfast-dataset
│   ├── .gitignore
│   ├── original_videos
│   │   └── original_videos.txt
│   ├── clip_original_videos.py
│   ├── README.md
│   ├── compute_mof_iou_f1.py
│   └── label_data_gt_right.json
├── requirements.txt
├── src
│   └── pipeline.jpg
├── auth.env
├── docs
│   ├── src
│   │   ├── arxiv.png
│   │   ├── table.jpg
│   │   ├── pipeline.jpg
│   │   ├── github-mark.png
│   │   ├── top-level-schema.jpg
│   │   └── qualitative_results.jpg
│   └── index.html
├── sample_video
│   └── sample.mp4
├── results
│   ├── Grasping_the_can
│   │   ├── grid_image_sample.png
│   │   └── Grasping_the_can._segment_0.5_1.4.mp4
│   ├── Moving_the_can_upwards
│   │   ├── grid_image_sample.png
│   │   └── Moving_the_can_upwards_segment_2.1_4.9.mp4
│   └── Releasing_the_can_placed_on_the_shelf
│       ├── grid_image_sample.png
│       └── Releasing_the_can_placed_on_the_shelf_segment_4.5_4.9.mp4
├── CODE_OF_CONDUCT.md
├── LICENSE
├── SUPPORT.md
├── breakfast-dataset
│   ├── README.md
│   └── compute_mof_iou_f1.py
├── thumos14-dataset
│   └── README.md
├── SECURITY.md
├── README.md
├── .gitignore
└── example.py
/finegrained-breakfast-dataset/.gitignore: -------------------------------------------------------------------------------- 1 | out/ -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | opencv-python 3 | -------------------------------------------------------------------------------- /src/pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/src/pipeline.jpg -------------------------------------------------------------------------------- /auth.env: -------------------------------------------------------------------------------- 1 | AZURE_OPENAI_ENDPOINT= 2 | AZURE_OPENAI_API_KEY= 3 | AZURE_OPENAI_DEPLOYMENT_NAME= 4 | OPENAI_API_KEY= -------------------------------------------------------------------------------- /docs/src/arxiv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/docs/src/arxiv.png -------------------------------------------------------------------------------- /docs/src/table.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/docs/src/table.jpg -------------------------------------------------------------------------------- /docs/src/pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/docs/src/pipeline.jpg -------------------------------------------------------------------------------- /docs/src/github-mark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/docs/src/github-mark.png -------------------------------------------------------------------------------- /sample_video/sample.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/sample_video/sample.mp4 -------------------------------------------------------------------------------- /docs/src/top-level-schema.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/docs/src/top-level-schema.jpg -------------------------------------------------------------------------------- /docs/src/qualitative_results.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/docs/src/qualitative_results.jpg -------------------------------------------------------------------------------- /results/Grasping_the_can/grid_image_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/results/Grasping_the_can/grid_image_sample.png -------------------------------------------------------------------------------- /results/Moving_the_can_upwards/grid_image_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/results/Moving_the_can_upwards/grid_image_sample.png -------------------------------------------------------------------------------- /results/Grasping_the_can/Grasping_the_can._segment_0.5_1.4.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/results/Grasping_the_can/Grasping_the_can._segment_0.5_1.4.mp4 -------------------------------------------------------------------------------- /results/Releasing_the_can_placed_on_the_shelf/grid_image_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/results/Releasing_the_can_placed_on_the_shelf/grid_image_sample.png -------------------------------------------------------------------------------- /results/Moving_the_can_upwards/Moving_the_can_upwards_segment_2.1_4.9.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/results/Moving_the_can_upwards/Moving_the_can_upwards_segment_2.1_4.9.mp4 -------------------------------------------------------------------------------- /results/Releasing_the_can_placed_on_the_shelf/Releasing_the_can_placed_on_the_shelf_segment_4.5_4.9.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/results/Releasing_the_can_placed_on_the_shelf/Releasing_the_can_placed_on_the_shelf_segment_4.5_4.9.mp4 -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /finegrained-breakfast-dataset/original_videos/original_videos.txt: -------------------------------------------------------------------------------- 1 | subject_10_gopro_seg_1.mp4 2 | subject_10_gopro_seg_2.mp4 3 | subject_11_gopro_seg_1.mp4 4 | subject_11_gopro_seg_2.mp4 5 | subject_12_gopro_seg_1.mp4 6 | subject_12_gopro_seg_2.mp4 7 | subject_13_gopro_seg_1.mp4 8 | subject_1_gopro_seg_1.mp4 9 | subject_1_gopro_seg_2.mp4 10 | subject_2_d_gopro_seg_1.mp4 11 | subject_2_d_gopro_seg_2.mp4 12 | subject_3_o_gopro_seg_1.mp4 13 | subject_3_o_gopro_seg_2.mp4 14 | subject_4_gopro_seg_1.mp4 15 | subject_4_gopro_seg_2.mp4 16 | subject_5_gopro_seg_1.mp4 17 | subject_5_gopro_seg_2.mp4 18 | subject_6_gopro_seg_1.mp4 19 | subject_6_gopro_seg_2.mp4 20 | subject_7_gopro_seg_1.mp4 21 | subject_7_gopro_seg_2.mp4 22 | subject_8_gopro_seg_1.mp4 23 | subject_8_gopro_seg_2.mp4 24 | subject_9_gopro_seg_1.mp4 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 
8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /breakfast-dataset/README.md: -------------------------------------------------------------------------------- 1 | # Breakfast dataset 2 | 3 | This folder provides resources for evaluating action label predictions on videos from the Breakfast dataset. It includes ground-truth annotations and an evaluation script. 4 | 5 | This dataset is provided as supplementary material for the paper: 6 | 7 | > **Open-vocabulary action localization with iterative visual prompting** 8 | > *Naoki Wake, Atsushi Kanehira, Kazuhiro Sasabuchi, Jun Takamatsu, Katsushi Ikeuchi (2025), [IEEE Access, 13, 56908-56917](https://ieeexplore.ieee.org/abstract/document/10942370)* 9 | > 10 | > ```bibtex 11 | >@article{wake2025open, 12 | > author={Wake, Naoki and Kanehira, Atsushi and Sasabuchi, Kazuhiro and Takamatsu, Jun and Ikeuchi, Katsushi}, 13 | > journal={IEEE Access}, 14 | > title={Open-vocabulary action localization with iterative visual prompting}, 15 | > year={2025}, 16 | > volume={13}, 17 | > number={}, 18 | > pages={56908--56917}, 19 | > doi={10.1109/ACCESS.2025.3555167}} 20 | > ``` 21 | 22 | The original data is derived from the paper below: 23 | 24 | > **The language of actions: Recovering the syntax and semantics of goal-directed human activities** 25 | > *Hilde Kuehne, Ali Arslan, and Thomas Serre (2014), CVPR, 780--787* 26 | > 27 | > ```bibtex 28 | >@inproceedings{kuehne2014language, 29 | > title={The language of actions: Recovering the syntax and semantics of goal-directed human activities}, 30 | > author={Kuehne, Hilde and Arslan, Ali and Serre, Thomas}, 31 | > booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, 32 | > pages={780--787}, 33 | > year={2014} 34 | >} 35 | > ``` 36 | 37 | ## Directory and File Structure 38 | 39 | - **label_data_gt_breakfast.json** 40 | This JSON file holds the ground-truth annotations for the videos. Each entry in the JSON contains: 41 | - **action**: A sequence of action labels that occur in the video. 42 | - **gt_time**: The frame index annotations corresponding to each action label (FPS=15.0). 43 | - **video_path**: The relative path to the corresponding video file. 44 | 45 | - **label_data_estimate_baseline_breakfast.json** 46 | This is an example file that contains estimated action labels. It is used as input to the evaluation script. 47 | 48 | - **compute_mof_iou_f1.py** 49 | This evaluation script computes performance metrics (e.g., MOF, IoU, and F1 score) by comparing predicted action labels with the ground truth.
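Interval matching in that script uses the standard temporal IoU, as implemented by its `compute_tiou` helper (notation ours):

```latex
\mathrm{tIoU}(I_p, I_g) = \frac{\max\bigl(0,\ \min(e_p, e_g) - \max(s_p, s_g)\bigr)}{\max(e_p, e_g) - \min(s_p, s_g)}
```

where $I_p = [s_p, e_p]$ and $I_g = [s_g, e_g]$ are a predicted and a ground-truth interval in frames. Run the script as follows: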
50 | ```bash 51 | python compute_mof_iou_f1.py --file label_data_estimate_baseline_breakfast.json 52 | ``` -------------------------------------------------------------------------------- /thumos14-dataset/README.md: -------------------------------------------------------------------------------- 1 | # THUMOS14 dataset 2 | 3 | This folder provides resources for evaluating action label predictions on videos from the THUMOS14 dataset. Most of the necessary code, including the evaluation script and ground truth labels, needs to be downloaded from the official [THUMOS14 page](https://www.crcv.ucf.edu/THUMOS14/). 4 | 5 | This dataset is provided as supplementary material for the paper: 6 | 7 | > **Open-vocabulary action localization with iterative visual prompting** 8 | > *Naoki Wake, Atsushi Kanehira, Kazuhiro Sasabuchi, Jun Takamatsu, Katsushi Ikeuchi (2025), [IEEE Access, 13, 56908-56917](https://ieeexplore.ieee.org/abstract/document/10942370)* 9 | > 10 | > ```bibtex 11 | >@article{wake2025open, 12 | > author={Wake, Naoki and Kanehira, Atsushi and Sasabuchi, Kazuhiro and Takamatsu, Jun and Ikeuchi, Katsushi}, 13 | > journal={IEEE Access}, 14 | > title={Open-vocabulary action localization with iterative visual prompting}, 15 | > year={2025}, 16 | > volume={13}, 17 | > number={}, 18 | > pages={56908--56917}, 19 | > doi={10.1109/ACCESS.2025.3555167}} 20 | > ``` 21 | 22 | The following is the citation for the THUMOS challenge, taken from the official [THUMOS14 page](https://www.crcv.ucf.edu/THUMOS14/): 23 | > ```bibtex 24 | >@misc{THUMOS14, 25 | > author = "Jiang, Y.-G. and Liu, J. and Roshan Zamir, A. and Toderici, G. and Laptev, I. and Shah, M. and Sukthankar, R.", 26 | > title = "{THUMOS} Challenge: Action Recognition with a Large Number of Classes", 27 | > howpublished = "\url{http://crcv.ucf.edu/THUMOS14/}", 28 | > Year = {2014}} 29 | > ``` 30 | 31 | ## Directory and File Structure 32 | 33 | - **label_data_estimate_thumos14.txt** 34 | This is an example file that contains estimated action labels. It is used as input to the evaluation script (see below). For details on the file format, please refer to the [THUMOS14 challenge documentation](https://www.crcv.ucf.edu/THUMOS14/THUMOS14_Evaluation.pdf). 35 | 36 | ## Usage Instructions 37 | 38 | 1. **Download the THUMOS14 Evaluation Toolkit** 39 | - As described in the [THUMOS14 challenge documentation](https://www.crcv.ucf.edu/THUMOS14/THUMOS14_Evaluation.pdf), download the evaluation toolkit (see "Section 4 Development kit"). 40 | - Unzip the downloaded file (`THUMOS14_evalkit_20140818.zip`). 41 | 2. **Place the Label Data File** 42 | - Move `label_data_estimate_thumos14.txt` into the `THUMOS14_evalkit_20140818/TH14evalkit/results` directory.
43 | - Install MATLAB or Octave, and from within the `THUMOS14_evalkit_20140818/TH14evalkit` directory, run the following command to compute the evaluation metrics: 44 | 45 | ```matlab 46 | [pr_all, ap_all, map] = TH14evaldet('results/label_data_estimate_thumos14.txt', 'groundtruth', 'test') 47 | ``` -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VLM-Video-Action-Localization 2 | This repository provides sample code for the paper [Open-vocabulary action localization with iterative visual prompting (IEEE Access)](https://ieeexplore.ieee.org/abstract/document/10942370), authored by the [Applied Robotics Research](https://www.microsoft.com/en-us/research/group/applied-robotics-research/) team. 3 | 4 | ### Overview of the pipeline: 5 | ![Temporal PIVOT pipeline](./src/pipeline.jpg) 6 | 7 | ## How to use 8 | We have confirmed that the sample code works with Python 3.12.1. 9 | 10 | Modify the [auth.env](./auth.env) file with the credentials for your provider (see the parsing sketch at the end of this README): 11 | 12 | ### If you use Azure OpenAI 13 | - AZURE_OPENAI_DEPLOYMENT_NAME 14 | - AZURE_OPENAI_ENDPOINT 15 | - AZURE_OPENAI_API_KEY 16 | ### If you use OpenAI 17 | - OPENAI_API_KEY 18 | 19 | ### Install dependencies 20 | ```bash 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | ### Run the sample code 25 | ```bash 26 | python example.py --credentials auth.env --video sample_video/sample.mp4 --grid 3 --action "Grasping the can" 27 | ``` 28 | The `--grid N` option sets the tiling grid: NxN frames are sampled from the video and tiled into a single image. 29 | ## Bibliography 30 | ``` 31 | @article{wake2025open, 32 | author={Wake, Naoki and Kanehira, Atsushi and Sasabuchi, Kazuhiro and Takamatsu, Jun and Ikeuchi, Katsushi}, 33 | journal={IEEE Access}, 34 | title={Open-Vocabulary Action Localization With Iterative Visual Prompting}, 35 | year={2025}, 36 | volume={13}, 37 | number={}, 38 | pages={56908--56917}, 39 | doi={10.1109/ACCESS.2025.3555167}} 40 | ``` 41 | 42 | ## Contributing 43 | 44 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 45 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 46 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 47 | 48 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 49 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 50 | provided by the bot. You will only need to do this once across all repos using our CLA. 51 | 52 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 53 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 54 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 55 | 56 | ## Trademarks 57 | 58 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 59 | trademarks or logos is subject to and must follow 60 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 61 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 62 | Any use of third-party trademarks or logos is subject to those third parties' policies.
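As referenced in the setup section above, auth.env is a plain KEY=VALUE file. The following is a minimal sketch of how such a file could be parsed and used to build a client; the helper names are hypothetical and this is not the repository's `example.py`:

```python
from openai import AzureOpenAI, OpenAI

def load_env(path):
    """Parse KEY=VALUE lines from a credentials file into a dict."""
    env = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line and "=" in line and not line.startswith("#"):
                key, _, value = line.partition("=")
                env[key.strip()] = value.strip()
    return env

def make_client(env):
    """Prefer Azure OpenAI when its settings are present; otherwise use OpenAI."""
    if env.get("AZURE_OPENAI_API_KEY"):
        return AzureOpenAI(
            api_key=env["AZURE_OPENAI_API_KEY"],
            azure_endpoint=env["AZURE_OPENAI_ENDPOINT"],
            api_version="2024-02-01",  # assumption: any vision-capable API version
        )
    return OpenAI(api_key=env["OPENAI_API_KEY"])

# With Azure, the deployment name is passed as the `model` argument at call time:
# client.chat.completions.create(model=env["AZURE_OPENAI_DEPLOYMENT_NAME"], ...)
```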
63 | -------------------------------------------------------------------------------- /finegrained-breakfast-dataset/clip_original_videos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import re 4 | import subprocess 5 | 6 | # Configuration 7 | json_file = "label_data_gt_right.json" # Path to the JSON file 8 | frame_rate = 30 # Frame rate of the videos 9 | 10 | # Read the JSON file 11 | with open(json_file, "r", encoding="utf-8") as f: 12 | data = json.load(f) 13 | 14 | # Process each entry in the JSON file 15 | for entry in data: 16 | # Get the video_path field (expected format, e.g., "original_videos/subject_1_gopro_seg_1_2162-2284.mp4") 17 | video_path_field = entry.get("video_path") 18 | if not video_path_field: 19 | print("Missing video_path field. Skipping entry.") 20 | continue 21 | 22 | # Normalize path separators (in case backslashes are used) 23 | video_path_field = video_path_field.replace("\\", "/") 24 | 25 | # Split the video_path into directory and filename 26 | directory, filename = os.path.split(video_path_field) 27 | basename, ext = os.path.splitext(filename) 28 | 29 | # Use regex to extract segment info from the end of the basename. 30 | # Expected pattern: an underscore followed by two numbers separated by a dash, e.g., "_2162-2284" 31 | match = re.search(r'_(\d+)-(\d+)$', basename) 32 | if not match: 33 | print(f"Segment info not found in filename: {filename}. Skipping entry.") 34 | continue 35 | 36 | segment_start_str, segment_end_str = match.groups() 37 | try: 38 | segment_start_frame = int(segment_start_str) 39 | segment_end_frame = int(segment_end_str) 40 | except ValueError: 41 | print(f"Invalid segment frame numbers in filename: {filename}. Skipping entry.") 42 | continue 43 | 44 | # Calculate the duration in frames and verify the range is valid 45 | duration_frames = segment_end_frame - segment_start_frame 46 | if duration_frames <= 0: 47 | print(f"Invalid frame range in filename: {filename}. Skipping entry.") 48 | continue 49 | 50 | # Convert frame numbers to seconds for FFmpeg 51 | start_time_sec = segment_start_frame / frame_rate 52 | duration_sec = duration_frames / frame_rate 53 | 54 | # Determine the original video filename by removing the segment info from the basename. 55 | # For example, if basename is "subject_1_gopro_seg_1_2162-2284", the original basename will be "subject_1_gopro_seg_1". 56 | original_basename = basename[:match.start()] # Everything before the segment info 57 | original_filename = original_basename + ext 58 | original_video_path = os.path.join(directory, original_filename) 59 | 60 | if not os.path.exists(original_video_path): 61 | print(f"Original video file not found: {original_video_path}. Skipping entry.") 62 | continue 63 | 64 | # Use FFmpeg to extract the clip from the original video. 65 | # The clip starts at 'start_time_sec' (in seconds) and lasts for 'duration_sec' seconds. 66 | # The output file will be saved with the same name as specified in the video_path field. 
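# With the argument order used below, "-ss" comes after "-i", so FFmpeg decodes
# from the start of the input and cuts frame-accurately; placing "-ss" before "-i"
# would seek much faster at the cost of possibly imprecise cut points.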
67 | output_video_path = video_path_field 68 | ffmpeg_cmd = [ 69 | "ffmpeg", 70 | "-i", original_video_path, 71 | "-ss", str(start_time_sec), 72 | "-t", str(duration_sec), 73 | "-c:v", "libx264", 74 | "-crf", "23", 75 | "-preset", "fast", 76 | "-c:a", "aac", 77 | "-b:a", "128k", 78 | "-y", output_video_path 79 | ] 80 | 81 | print(f"Extracting clip: {output_video_path}") 82 | subprocess.run(ffmpeg_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) 83 | 84 | print("All extractions completed!") 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | -------------------------------------------------------------------------------- /finegrained-breakfast-dataset/README.md: -------------------------------------------------------------------------------- 1 | # Fine-grained Breakfast dataset 2 | 3 | This folder provides resources for evaluating action label predictions on videos from the Fine-grained Breakfast dataset. It includes ground-truth annotations and an evaluation script. 4 | 5 | This dataset is provided as supplementary material for the paper: 6 | 7 | > **Open-vocabulary action localization with iterative visual prompting** 8 | > *Naoki Wake, Atsushi Kanehira, Kazuhiro Sasabuchi, Jun Takamatsu, Katsushi Ikeuchi (2025), [IEEE Access, 13, 56908-56917](https://ieeexplore.ieee.org/abstract/document/10942370)* 9 | > 10 | > ```bibtex 11 | >@article{wake2025open, 12 | > author={Wake, Naoki and Kanehira, Atsushi and Sasabuchi, Kazuhiro and Takamatsu, Jun and Ikeuchi, Katsushi}, 13 | > journal={IEEE Access}, 14 | > title={Open-vocabulary action localization with iterative visual prompting}, 15 | > year={2025}, 16 | > volume={13}, 17 | > number={}, 18 | > pages={56908--56917}, 19 | > doi={10.1109/ACCESS.2025.3555167}} 20 | > ``` 21 | 22 | The original data is derived from the dataset described below. We have manually annotated a subset of these videos: 23 | 24 | > **Human grasping database for activities of daily living with depth, color and kinematic data streams** 25 | > *Artur Saudabayev, Zhanibek Rysbek, Raykhan Khassenova, Huseyin Atakan Varol (2018), Scientific Data, 5(1), 1–13* 26 | > 27 | > ```bibtex 28 | > @article{saudabayev2018human, 29 | > title={Human grasping database for activities of daily living with depth, color and kinematic data streams}, 30 | > author={Saudabayev, Artur and Rysbek, Zhanibek and Khassenova, Raykhan and Varol, Huseyin Atakan}, 31 | > journal={Scientific data}, 32 | > volume={5}, 33 | > number={1}, 34 | > pages={1--13}, 35 | > year={2018}, 36 | > publisher={Nature Publishing Group} 37 | > } 38 | > ``` 39 | 40 | ## Directory and File Structure 41 | 42 | - **original_videos** 43 | Download the original videos from the *Human grasping database for activities of daily living with depth, color and kinematic data streams* and place them in this folder. 44 | 45 | - **label_data_gt_right.json** 46 | This JSON file holds the ground-truth annotations for the videos.
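For orientation, a single entry might look like the following sketch (values drawn from the field examples below):

```json
{
  "action": ["Grasp with the right hand", "Picking with the right hand"],
  "gt_time": [[0, 23], [24, 48]],
  "video_path": "original_videos/subject_9_gopro_seg_1_2324-2575.mp4"
}
```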
Each entry in the JSON contains: 47 | - **action**: A sequence of action labels that occur in the video. 48 | *Example*: `["Grasp with the right hand", "Picking with the right hand", ...]` 49 | - **gt_time**: The frame index annotations corresponding to each action label (FPS=30.0). 50 | *Example*: `[[0, 23], [24, 48], ...]` 51 | - **video_path**: The relative path to the corresponding video file. 52 | *Example*: `"original_videos/subject_9_gopro_seg_1_2324-2575.mp4"` 53 | **Note**: This file name is constructed from the original video name with the appended frame range. Since this repository does not provide the original videos, you need to download the original dataset, extract the clips corresponding to the specified frame numbers, and place them in the `original_videos` folder. We provide the script `clip_original_videos.py` to extract these clips. The list of original video files is provided in `original_videos/original_videos.txt`. 54 | 55 | - **label_data_estimate_baseline.json** 56 | This is an example file that contains estimated action labels. It is used as input to the evaluation script. 57 | 58 | - **compute_mof_iou_f1.py** 59 | This evaluation script computes performance metrics (e.g., MOF, IoU, and F1 score) by comparing predicted action labels with the ground truth. 60 | ```bash 61 | python compute_mof_iou_f1.py --file label_data_estimate_baseline.json 62 | ``` 63 | 64 | - **clip_original_videos.py** 65 | This script extracts video clips from the original videos based on the frame indices specified in `label_data_gt_right.json`. Running this script will generate the video dataset with filenames as indicated in the JSON annotations. 66 | 67 | ## Usage Instructions 68 | 69 | 1. **Place the Video Files** 70 | - Download the original videos from the human grasping database cited above. 71 | - Place the downloaded video files in the `original_videos` folder. Refer to `original_videos/original_videos.txt` for the list of required files. 72 | 73 | 2. **Generate the Video Dataset** 74 | After placing the original videos in the `original_videos` folder, run the `clip_original_videos.py` script to extract the annotated clips. This script uses the frame index annotations provided in `label_data_gt_right.json` to cut the clips from the original videos and save them using the specified naming convention. Run the script with the following command. Note that this script relies on `ffmpeg`. 75 | ```bash 76 | python clip_original_videos.py 77 | ``` -------------------------------------------------------------------------------- /docs/index.html: --------------------------------------------------------------------------------

Open-vocabulary Temporal Action Localization using VLMs

Naoki Wake, Atsushi Kanehira, Kazuhiro Sasabuchi, Jun Takamatsu, Katsushi Ikeuchi
Applied Robotics Research, Microsoft, Redmond
For inquiries about the use of this work: Katsushi Ikeuchi (katsuike@microsoft.com)
For technical issues: Naoki Wake (naoki.wake@microsoft.com)
[Figure: Top-level schema]

Abstract


Video action localization aims to find the timing of a specific action in a long video. Although existing learning-based approaches have been successful, they require annotating videos, which comes at considerable labor cost. This paper proposes a learning-free, open-vocabulary approach based on emerging off-the-shelf vision-language models (VLMs). The challenge stems from the fact that VLMs are neither designed to process long videos nor tailored to finding actions. We overcome these problems by extending an iterative visual prompting technique. Specifically, we sample video frames into a concatenated image with frame index labels, making a VLM guess the frame considered closest to the start or end of the action. Iterating this process while narrowing the sampling time window results in finding the specific start and end frames of an action. We demonstrate that this sampling technique yields reasonable results, illustrating a practical extension of VLMs for understanding videos.


Pipeline

[Figure: Pipeline]

The proposed pipeline for open-vocabulary video action localization using a VLM consists of the following steps: (a) frames are sampled at regular intervals from a time window, covering the entire video in the first iteration; (b) the sampled frames are tiled into a single image with annotations indicating their temporal order; (c) this image is fed to a VLM to identify the frame closest to a specific timing of an action (e.g., the start of the action); (d) the sampling window is updated by centering on the selected frame, with a narrower sampling interval. Bottom panel (1): for general action localization, the start time of the action in the video is determined by iterating steps (a) to (d). Bottom panel (2): by estimating the end time of the action in the same manner, the action is localized in the video. A minimal sketch of this loop is shown below.

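The following is a minimal sketch of this iterative procedure, assuming OpenCV for frame access and a user-supplied `ask_vlm` callable (hypothetical) that receives the tiled image and returns the index of the chosen tile; the repository's `example.py` is the actual reference implementation.

```python
import cv2
import numpy as np

def tile_frames(frames, grid_n):
    """Tile up to grid_n x grid_n frames into one image, labeling each tile with its index."""
    h, w = frames[0].shape[:2]
    canvas = np.zeros((grid_n * h, grid_n * w, 3), dtype=np.uint8)
    for i, frame in enumerate(frames):
        r, c = divmod(i, grid_n)
        tile = frame.copy()
        cv2.putText(tile, str(i), (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), 3)
        canvas[r * h:(r + 1) * h, c * w:(c + 1) * w] = tile
    return canvas

def localize_boundary(video_path, grid_n, ask_vlm, iters=3):
    """Iteratively narrow a time window around the frame the VLM selects."""
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    lo, hi = 0, total - 1  # first iteration covers the entire video
    center = lo
    for _ in range(iters):
        idxs = np.linspace(lo, hi, grid_n * grid_n).astype(int)
        frames = []
        for idx in idxs:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ok, frame = cap.read()
            if ok:
                frames.append(frame)
        grid = tile_frames(frames, grid_n)
        pick = ask_vlm(grid)  # index of the tile closest to the target timing
        step = idxs[1] - idxs[0] if len(idxs) > 1 else 1
        center = idxs[min(pick, len(idxs) - 1)]
        lo = max(0, center - step)          # re-center the window on the pick
        hi = min(total - 1, center + step)  # with a narrower sampling interval
    cap.release()
    return center / fps  # boundary estimate in seconds
```

Running the loop once per boundary (start, then end) localizes the action, as described in the bottom panels above.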

Qualitative Results


We qualitatively checked our proposed pipeline using a cooking-preparation video that we recorded in-house. This 10-minute first-person video included actions such as taking out, washing, and cutting vegetables. The figure below shows examples of the identified video segments for the actions "cutting vegetables," "washing vegetables," and "turning on a faucet," demonstrating that reasonable outputs were obtained.

[Figure: Qualitative results]

Quantitative Results


The table below compares our proposed method with an existing method [1] on the Breakfast Dataset [2]. While our method does not surpass the latest model-based approaches, the results demonstrate its feasibility. Importantly, the method offers significant advantages: it eliminates the need for data collection and training, and it can extract actions specified by open-vocabulary free-text queries, enhancing its adaptability to diverse applications such as video annotation and video editing. The metrics are defined as follows.
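For reference, the per-video metrics are computed as in the bundled `compute_mof_iou_f1.py` scripts (notation ours): with frame-level ground-truth labels $y_t$ and predictions $\hat{y}_t$ over $T$ frames,

```latex
\mathrm{MoF} = \frac{1}{T} \sum_{t=1}^{T} \mathbf{1}[\hat{y}_t = y_t], \qquad
\mathrm{IoU}_c = \frac{|\{t : \hat{y}_t = c \wedge y_t = c\}|}{|\{t : \hat{y}_t = c \vee y_t = c\}|}, \qquad
\mathrm{F1}_c = \frac{2\, P_c R_c}{P_c + R_c}
```

where $P_c$ and $R_c$ are per-class precision and recall; class-wise values are averaged into mIoU and mF1.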

[Table: Quantitative results]
122 | 123 | 124 | -------------------------------------------------------------------------------- /breakfast-dataset/compute_mof_iou_f1.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | def parse_arguments(): 9 | """Parse command line arguments.""" 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--file", help="Input JSON file containing estimated labels", required=True) 12 | parser.add_argument("--outdir", help="Output directory", default="out/visualize") 13 | return parser.parse_args() 14 | 15 | 16 | def compute_tiou(pred_interval, gt_interval): 17 | """ 18 | Compute the temporal Intersection over Union (tIoU) between two intervals. 19 | 20 | Args: 21 | pred_interval (tuple): (start_frame, end_frame) of the prediction. 22 | gt_interval (tuple): (start_frame, end_frame) of the ground truth. 23 | 24 | Returns: 25 | float: The tIoU value. 26 | """ 27 | intersection = max(0, min(pred_interval[1], gt_interval[1]) - max(pred_interval[0], gt_interval[0])) 28 | union = max(pred_interval[1], gt_interval[1]) - min(pred_interval[0], gt_interval[0]) 29 | return intersection / union if union > 0 else 0 30 | 31 | 32 | def compute_map(pred_intervals, gt_intervals, tiou_thresholds): 33 | """ 34 | Compute the mean Average Precision (mAP) over a set of tIoU thresholds. 35 | 36 | Args: 37 | pred_intervals (list of tuple): List of predicted intervals. 38 | gt_intervals (list of tuple): List of ground truth intervals. 39 | tiou_thresholds (list of float): List of tIoU thresholds. 40 | 41 | Returns: 42 | float: The computed mAP value. 43 | """ 44 | assert len(pred_intervals) == len(gt_intervals) 45 | ap_values = [] 46 | 47 | for threshold in tiou_thresholds: 48 | matches = [] 49 | # Evaluate each prediction 50 | for pred in pred_intervals: 51 | match_found = False 52 | for gt in gt_intervals: 53 | tiou = compute_tiou(pred, gt) 54 | if tiou >= threshold: 55 | matches.append((1, tiou)) # True Positive 56 | match_found = True 57 | break 58 | if not match_found: 59 | matches.append((0, 0)) # False Positive 60 | 61 | # Sort by tIoU (descending order) 62 | matches.sort(key=lambda x: x[1], reverse=True) 63 | tp_cum, fp_cum = 0, 0 64 | precisions = [] 65 | recalls = [] 66 | 67 | for match, _ in matches: 68 | if match == 1: 69 | tp_cum += 1 70 | else: 71 | fp_cum += 1 72 | precision = tp_cum / (tp_cum + fp_cum) 73 | recall = tp_cum / len(gt_intervals) 74 | precisions.append(precision) 75 | recalls.append(recall) 76 | 77 | # Compute AP using a simple approximation (area under the precision-recall curve) 78 | ap = 0.0 79 | for i in range(1, len(recalls)): 80 | ap += (recalls[i] - recalls[i - 1]) * precisions[i] 81 | ap_values.append(ap) 82 | 83 | return sum(ap_values) / len(ap_values) if ap_values else 0 84 | 85 | 86 | def time_to_frame(time_in_seconds, fps): 87 | """Convert time in seconds to frame number based on fps.""" 88 | return int(round(time_in_seconds * fps)) 89 | 90 | 91 | def create_label_array(total_frames, intervals): 92 | """ 93 | Create a label array of length total_frames from a list of intervals. 94 | 95 | Each interval is assigned a unique label (based on its index). Every frame 96 | in the interval (inclusive) is assigned that label. 97 | 98 | Args: 99 | total_frames (int): Total number of frames. 100 | intervals (list of tuple): List of intervals (start_frame, end_frame). 
101 | 102 | Returns: 103 | list: An array of labels for each frame. 104 | """ 105 | labels = [-1] * total_frames 106 | for idx, (start, end) in enumerate(intervals): 107 | for frame in range(start, end + 1): 108 | labels[frame] = idx 109 | return labels 110 | 111 | 112 | def compute_metrics(video_data, fps): 113 | """ 114 | Compute various evaluation metrics (MoF, IoU per class, mean IoU, and F1 per class) 115 | for a single video's predictions. 116 | 117 | Assumes that ground truth time intervals are 1-indexed and converts them to 0-indexed. 118 | 119 | Args: 120 | video_data (dict): Dictionary containing ground truth and predicted data. 121 | fps (float): Frames per second of the video. 122 | 123 | Returns: 124 | tuple: (MoF, IoU per class, mean IoU, F1 per class, mean F1) 125 | """ 126 | gt_actions = video_data['action'] 127 | gt_intervals = video_data['gt_time'] 128 | pred_start_times = video_data['start_times'] 129 | pred_end_times = video_data['completed_times'] 130 | 131 | # Convert ground truth intervals from 1-indexed to 0-indexed 132 | gt_intervals = [(start - 1, end - 1) for start, end in gt_intervals] 133 | total_frames = gt_intervals[-1][1] + 1 134 | 135 | # Create ground truth label array 136 | label_gt = create_label_array(total_frames, gt_intervals) 137 | 138 | # Create predicted label array (initialized with -1) 139 | label_pred = [-1] * total_frames 140 | pred_keys = list(pred_start_times.keys()) 141 | for idx, key in enumerate(pred_keys): 142 | start_time = pred_start_times[key] 143 | end_time = pred_end_times[key] 144 | start_frame = time_to_frame(start_time, fps) 145 | end_frame = time_to_frame(end_time, fps) 146 | for frame in range(start_frame, end_frame): 147 | if frame < total_frames: 148 | label_pred[frame] = idx 149 | 150 | # Fill any leading -1 values with 0 151 | for i in range(total_frames): 152 | if label_pred[i] == -1: 153 | label_pred[i] = 0 154 | else: 155 | break 156 | 157 | # Fill trailing -1 values with the last action's index 158 | last_index = len(pred_keys) - 1 159 | for i in range(total_frames - 1, -1, -1): 160 | if label_pred[i] == -1: 161 | label_pred[i] = last_index 162 | else: 163 | break 164 | 165 | # Ensure no -1 values remain 166 | if -1 in label_gt or -1 in label_pred: 167 | raise ValueError("Label array contains unassigned frames.") 168 | 169 | # Calculate Mean over Frames (MoF) 170 | correct_frames = sum(1 for gt, pred in zip(label_gt, label_pred) if gt == pred) 171 | mof = correct_frames / total_frames if total_frames > 0 else 0 172 | 173 | # Calculate IoU and F1 per action class 174 | iou_per_class = {} 175 | f1_per_class = {} 176 | for idx, action in enumerate(gt_actions): 177 | gt_count = sum(1 for label in label_gt if label == idx) 178 | pred_count = sum(1 for label in label_pred if label == idx) 179 | intersection = sum(1 for gt, pred in zip(label_gt, label_pred) if gt == pred == idx) 180 | union = gt_count + pred_count - intersection 181 | iou = intersection / union if union > 0 else 0 182 | iou_per_class[action] = iou 183 | 184 | tp = intersection 185 | fp = pred_count - intersection 186 | fn = gt_count - intersection 187 | precision = tp / (tp + fp) if (tp + fp) > 0 else 0 188 | recall = tp / (tp + fn) if (tp + fn) > 0 else 0 189 | f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 190 | f1_per_class[action] = f1 191 | 192 | mean_iou = sum(iou_per_class.values()) / len(iou_per_class) if iou_per_class else 0 193 | mean_f1 = sum(f1_per_class.values()) / len(f1_per_class) if f1_per_class else 0 
194 | 195 | return mof, iou_per_class, mean_iou, f1_per_class, mean_f1 196 | 197 | 198 | def process_videos(label_data_estimates, tiou_thresholds): 199 | """ 200 | Process each video's data, compute evaluation metrics, and collect statistics. 201 | 202 | Args: 203 | label_data_estimates (list): List of video annotation dictionaries. 204 | tiou_thresholds (list): List of tIoU thresholds for mAP calculation. 205 | 206 | Returns: 207 | dict: A dictionary with per-video metrics. 208 | dict: A dictionary containing lists of overall metrics for plotting. 209 | """ 210 | mof_list = [] 211 | miou_list = [] 212 | mf1_list = [] 213 | map_list = [] 214 | action_steps = [] 215 | action_frames = [] 216 | results = {} 217 | 218 | for video_entry in label_data_estimates: 219 | video_path = video_entry['video_path'] 220 | 221 | fps = 15.0 222 | 223 | # Skip entries where start_times is a string (invalid data) 224 | if isinstance(video_entry.get('start_times'), str): 225 | print("Skipping video:", video_path) 226 | continue 227 | 228 | mof, iou_per_class, mean_iou, f1_per_class, mean_f1 = compute_metrics(video_entry, fps) 229 | mof_list.append(mof) 230 | miou_list.append(mean_iou) 231 | mf1_list.append(mean_f1) 232 | results[video_path] = {"MoF": mof, "mIoU": mean_iou, "mF1": mean_f1} 233 | 234 | # Compute durations for predicted actions 235 | pred_start_times = video_entry['start_times'] 236 | pred_end_times = video_entry['completed_times'] 237 | durations = [pred_end_times[key] - pred_start_times[key] for key in pred_start_times.keys()] 238 | results[video_path]["duration"] = durations 239 | 240 | # Convert predicted intervals to frames and compute mAP 241 | pred_intervals = [ 242 | ( 243 | time_to_frame(pred_start_times[key], fps), 244 | time_to_frame(pred_end_times[key], fps) 245 | ) 246 | for key in pred_start_times.keys() 247 | ] 248 | gt_intervals = video_entry['gt_time'] 249 | map_value = compute_map(pred_intervals, gt_intervals, tiou_thresholds) 250 | map_list.append(map_value) 251 | 252 | action_steps.append(len(video_entry['action'])) 253 | 254 | # Compute total frames from ground truth (adjust for 0-index) 255 | gt_intervals_zero_indexed = [(start - 1, end - 1) for start, end in video_entry['gt_time']] 256 | total_frames = gt_intervals_zero_indexed[-1][1] + 1 257 | action_frames.append(total_frames) 258 | 259 | metrics = { 260 | "MoF": mof_list, 261 | "mIoU": miou_list, 262 | "mF1": mf1_list, 263 | "mAP": map_list, 264 | "action_steps": action_steps, 265 | "action_frames": action_frames 266 | } 267 | return results, metrics 268 | 269 | 270 | def plot_metrics(metrics, output_path): 271 | """ 272 | Generate scatter plots for the evaluation metrics and save the figure. 273 | 274 | Args: 275 | metrics (dict): Dictionary containing lists of metrics. 276 | output_path (str): Path to save the output plot image. 
277 | """ 278 | plt.figure(figsize=(12, 6)) 279 | 280 | # Plot metrics against action steps 281 | ax1 = plt.subplot(2, 4, 1) 282 | plt.scatter(metrics["action_steps"], metrics["MoF"], alpha=0.6) 283 | plt.xlabel('Action Length (steps)') 284 | plt.ylabel('MoF') 285 | plt.ylim(0, 1) 286 | plt.title('Action Length vs MoF') 287 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7) 288 | 289 | ax2 = plt.subplot(2, 4, 2) 290 | plt.scatter(metrics["action_steps"], metrics["mIoU"], alpha=0.6) 291 | plt.xlabel('Action Length (steps)') 292 | plt.ylabel('mIoU') 293 | plt.ylim(0, 1) 294 | plt.title('Action Length vs mIoU') 295 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7) 296 | 297 | ax3 = plt.subplot(2, 4, 3) 298 | plt.scatter(metrics["action_steps"], metrics["mF1"], alpha=0.6) 299 | plt.xlabel('Action Length (steps)') 300 | plt.ylabel('mF1') 301 | plt.ylim(0, 1) 302 | plt.title('Action Length vs mF1') 303 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7) 304 | 305 | ax4 = plt.subplot(2, 4, 4) 306 | plt.scatter(metrics["action_steps"], metrics["mAP"], alpha=0.6) 307 | plt.xlabel('Action Length (steps)') 308 | plt.ylabel('mAP') 309 | plt.ylim(0, 1) 310 | plt.title('Action Length vs mAP') 311 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7) 312 | 313 | # Plot metrics against action frames 314 | ax5 = plt.subplot(2, 4, 5) 315 | plt.scatter(metrics["action_frames"], metrics["MoF"], alpha=0.6) 316 | plt.xlabel('Action Length (frames)') 317 | plt.ylabel('MoF') 318 | plt.ylim(0, 1) 319 | plt.title('Frames vs MoF') 320 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7) 321 | 322 | ax6 = plt.subplot(2, 4, 6) 323 | plt.scatter(metrics["action_frames"], metrics["mIoU"], alpha=0.6) 324 | plt.xlabel('Action Length (frames)') 325 | plt.ylabel('mIoU') 326 | plt.ylim(0, 1) 327 | plt.title('Frames vs mIoU') 328 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7) 329 | 330 | ax7 = plt.subplot(2, 4, 7) 331 | plt.scatter(metrics["action_frames"], metrics["mF1"], alpha=0.6) 332 | plt.xlabel('Action Length (frames)') 333 | plt.ylabel('mF1') 334 | plt.ylim(0, 1) 335 | plt.title('Frames vs mF1') 336 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7) 337 | 338 | ax8 = plt.subplot(2, 4, 8) 339 | plt.scatter(metrics["action_frames"], metrics["mAP"], alpha=0.6) 340 | plt.xlabel('Action Length (frames)') 341 | plt.ylabel('mAP') 342 | plt.ylim(0, 1) 343 | plt.title('Frames vs mAP') 344 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7) 345 | 346 | plt.tight_layout() 347 | plt.savefig(output_path) 348 | plt.close() 349 | 350 | 351 | def main(): 352 | args = parse_arguments() 353 | tiou_thresholds = [0.3, 0.4, 0.5, 0.6, 0.7] 354 | input_filename = args.file 355 | 356 | # Load estimated label data from JSON 357 | with open(input_filename, "r") as f: 358 | label_data_estimates = json.load(f) 359 | 360 | # Process each video entry to compute metrics 361 | results, metrics = process_videos(label_data_estimates, tiou_thresholds) 362 | 363 | # Prepare output directories 364 | out_dir = args.outdir 365 | base_filename = os.path.splitext(os.path.basename(input_filename))[0] 366 | parent_dir = os.path.basename(os.path.dirname(input_filename)) 367 | output_dir = os.path.join(out_dir, parent_dir) 368 | os.makedirs(output_dir, exist_ok=True) 369 | plot_output_path = os.path.join(output_dir, base_filename + ".png") 370 | 371 | # Plot and save the evaluation metrics 
372 | plot_metrics(metrics, plot_output_path) 373 | 374 | # Print mean metric values 375 | mean_mof = sum(metrics["MoF"]) / len(metrics["MoF"]) if metrics["MoF"] else 0 376 | mean_miou = sum(metrics["mIoU"]) / len(metrics["mIoU"]) if metrics["mIoU"] else 0 377 | mean_mf1 = sum(metrics["mF1"]) / len(metrics["mF1"]) if metrics["mF1"] else 0 378 | mean_map = sum(metrics["mAP"]) / len(metrics["mAP"]) if metrics["mAP"] else 0 379 | 380 | print("Mean MoF: {:.4f}".format(mean_mof)) 381 | print("Mean mIoU: {:.4f}".format(mean_miou)) 382 | print("Mean mF1: {:.4f}".format(mean_mf1)) 383 | print("Mean mAP: {:.4f}".format(mean_map)) 384 | print("Processed videos:", len(metrics["MoF"])) 385 | 386 | # Save detailed results as JSON 387 | results_output_path = os.path.join(output_dir, base_filename + ".json") 388 | with open(results_output_path, "w") as f: 389 | json.dump(results, f, indent=4) 390 | 391 | 392 | if __name__ == "__main__": 393 | main() 394 | -------------------------------------------------------------------------------- /finegrained-breakfast-dataset/compute_mof_iou_f1.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | def parse_arguments(): 9 | """Parse command line arguments.""" 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--file", help="Input JSON file containing estimated labels", required=True) 12 | parser.add_argument("--outdir", help="Output directory", default="out/visualize") 13 | return parser.parse_args() 14 | 15 | 16 | def compute_tiou(pred_interval, gt_interval): 17 | """ 18 | Compute the temporal Intersection over Union (tIoU) between two intervals. 19 | 20 | Args: 21 | pred_interval (tuple): (start_frame, end_frame) of the prediction. 22 | gt_interval (tuple): (start_frame, end_frame) of the ground truth. 23 | 24 | Returns: 25 | float: The tIoU value. 26 | """ 27 | intersection = max(0, min(pred_interval[1], gt_interval[1]) - max(pred_interval[0], gt_interval[0])) 28 | union = max(pred_interval[1], gt_interval[1]) - min(pred_interval[0], gt_interval[0]) 29 | return intersection / union if union > 0 else 0 30 | 31 | 32 | def compute_map(pred_intervals, gt_intervals, tiou_thresholds): 33 | """ 34 | Compute the mean Average Precision (mAP) over a set of tIoU thresholds. 35 | 36 | Args: 37 | pred_intervals (list of tuple): List of predicted intervals. 38 | gt_intervals (list of tuple): List of ground truth intervals. 39 | tiou_thresholds (list of float): List of tIoU thresholds. 40 | 41 | Returns: 42 | float: The computed mAP value. 
43 | """ 44 | assert len(pred_intervals) == len(gt_intervals) 45 | ap_values = [] 46 | 47 | for threshold in tiou_thresholds: 48 | matches = [] 49 | # Evaluate each prediction 50 | for pred in pred_intervals: 51 | match_found = False 52 | for gt in gt_intervals: 53 | tiou = compute_tiou(pred, gt) 54 | if tiou >= threshold: 55 | matches.append((1, tiou)) # True Positive 56 | match_found = True 57 | break 58 | if not match_found: 59 | matches.append((0, 0)) # False Positive 60 | 61 | # Sort by tIoU (descending order) 62 | matches.sort(key=lambda x: x[1], reverse=True) 63 | tp_cum, fp_cum = 0, 0 64 | precisions = [] 65 | recalls = [] 66 | 67 | for match, _ in matches: 68 | if match == 1: 69 | tp_cum += 1 70 | else: 71 | fp_cum += 1 72 | precision = tp_cum / (tp_cum + fp_cum) 73 | recall = tp_cum / len(gt_intervals) 74 | precisions.append(precision) 75 | recalls.append(recall) 76 | 77 | # Compute AP using a simple approximation (area under the precision-recall curve) 78 | ap = 0.0 79 | for i in range(1, len(recalls)): 80 | ap += (recalls[i] - recalls[i - 1]) * precisions[i] 81 | ap_values.append(ap) 82 | 83 | return sum(ap_values) / len(ap_values) if ap_values else 0 84 | 85 | 86 | def time_to_frame(time_in_seconds, fps): 87 | """Convert time in seconds to frame number based on fps.""" 88 | return int(round(time_in_seconds * fps)) 89 | 90 | 91 | def create_label_array(total_frames, intervals): 92 | """ 93 | Create a label array of length total_frames from a list of intervals. 94 | 95 | Each interval is assigned a unique label (based on its index). Every frame 96 | in the interval (inclusive) is assigned that label. 97 | 98 | Args: 99 | total_frames (int): Total number of frames. 100 | intervals (list of tuple): List of intervals (start_frame, end_frame). 101 | 102 | Returns: 103 | list: An array of labels for each frame. 104 | """ 105 | labels = [-1] * total_frames 106 | for idx, (start, end) in enumerate(intervals): 107 | for frame in range(start, end + 1): 108 | labels[frame] = idx 109 | return labels 110 | 111 | 112 | def compute_metrics(video_data, fps): 113 | """ 114 | Compute various evaluation metrics (MoF, IoU per class, mean IoU, and F1 per class) 115 | for a single video's predictions. 116 | 117 | Assumes that ground truth time intervals are 1-indexed and converts them to 0-indexed. 118 | 119 | Args: 120 | video_data (dict): Dictionary containing ground truth and predicted data. 121 | fps (float): Frames per second of the video. 
122 | 123 | Returns: 124 | tuple: (MoF, IoU per class, mean IoU, F1 per class, mean F1) 125 | """ 126 | gt_actions = video_data['action'] 127 | gt_intervals = video_data['gt_time'] 128 | pred_start_times = video_data['start_times'] 129 | pred_end_times = video_data['completed_times'] 130 | 131 | # Convert ground truth intervals from 1-indexed to 0-indexed 132 | gt_intervals = [(start - 1, end - 1) for start, end in gt_intervals] 133 | total_frames = gt_intervals[-1][1] + 1 134 | 135 | # Create ground truth label array 136 | label_gt = create_label_array(total_frames, gt_intervals) 137 | 138 | # Create predicted label array (initialized with -1) 139 | label_pred = [-1] * total_frames 140 | pred_keys = list(pred_start_times.keys()) 141 | for idx, key in enumerate(pred_keys): 142 | start_time = pred_start_times[key] 143 | end_time = pred_end_times[key] 144 | start_frame = time_to_frame(start_time, fps) 145 | end_frame = time_to_frame(end_time, fps) 146 | for frame in range(start_frame, end_frame): 147 | if frame < total_frames: 148 | label_pred[frame] = idx 149 | 150 | # Fill any leading -1 values with 0 151 | for i in range(total_frames): 152 | if label_pred[i] == -1: 153 | label_pred[i] = 0 154 | else: 155 | break 156 | 157 | # Fill trailing -1 values with the last action's index 158 | last_index = len(pred_keys) - 1 159 | for i in range(total_frames - 1, -1, -1): 160 | if label_pred[i] == -1: 161 | label_pred[i] = last_index 162 | else: 163 | break 164 | 165 | # Ensure no -1 values remain 166 | if -1 in label_gt or -1 in label_pred: 167 | raise ValueError("Label array contains unassigned frames.") 168 | 169 | # Calculate Mean over Frames (MoF) 170 | correct_frames = sum(1 for gt, pred in zip(label_gt, label_pred) if gt == pred) 171 | mof = correct_frames / total_frames if total_frames > 0 else 0 172 | 173 | # Calculate IoU and F1 per action class 174 | iou_per_class = {} 175 | f1_per_class = {} 176 | for idx, action in enumerate(gt_actions): 177 | gt_count = sum(1 for label in label_gt if label == idx) 178 | pred_count = sum(1 for label in label_pred if label == idx) 179 | intersection = sum(1 for gt, pred in zip(label_gt, label_pred) if gt == pred == idx) 180 | union = gt_count + pred_count - intersection 181 | iou = intersection / union if union > 0 else 0 182 | iou_per_class[action] = iou 183 | 184 | tp = intersection 185 | fp = pred_count - intersection 186 | fn = gt_count - intersection 187 | precision = tp / (tp + fp) if (tp + fp) > 0 else 0 188 | recall = tp / (tp + fn) if (tp + fn) > 0 else 0 189 | f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 190 | f1_per_class[action] = f1 191 | 192 | mean_iou = sum(iou_per_class.values()) / len(iou_per_class) if iou_per_class else 0 193 | mean_f1 = sum(f1_per_class.values()) / len(f1_per_class) if f1_per_class else 0 194 | 195 | return mof, iou_per_class, mean_iou, f1_per_class, mean_f1 196 | 197 | 198 | def process_videos(label_data_estimates, tiou_thresholds): 199 | """ 200 | Process each video's data, compute evaluation metrics, and collect statistics. 201 | 202 | Args: 203 | label_data_estimates (list): List of video annotation dictionaries. 204 | tiou_thresholds (list): List of tIoU thresholds for mAP calculation. 205 | 206 | Returns: 207 | dict: A dictionary with per-video metrics. 208 | dict: A dictionary containing lists of overall metrics for plotting. 
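
    Note: fps is fixed to 30.0 for every entry, so predicted times in
    seconds are converted to frame indices under that assumption; videos
    recorded at a different frame rate would need this constant changed.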
209 | """ 210 | mof_list = [] 211 | miou_list = [] 212 | mf1_list = [] 213 | map_list = [] 214 | action_steps = [] 215 | action_frames = [] 216 | results = {} 217 | 218 | for video_entry in label_data_estimates: 219 | video_path = video_entry['video_path'] 220 | 221 | fps = 30.0 222 | 223 | # Skip entries where start_times is a string (invalid data) 224 | if isinstance(video_entry.get('start_times'), str): 225 | print("Skipping video:", video_path) 226 | continue 227 | 228 | mof, iou_per_class, mean_iou, f1_per_class, mean_f1 = compute_metrics(video_entry, fps) 229 | mof_list.append(mof) 230 | miou_list.append(mean_iou) 231 | mf1_list.append(mean_f1) 232 | results[video_path] = {"MoF": mof, "mIoU": mean_iou, "mF1": mean_f1} 233 | 234 | # Compute durations for predicted actions 235 | pred_start_times = video_entry['start_times'] 236 | pred_end_times = video_entry['completed_times'] 237 | durations = [pred_end_times[key] - pred_start_times[key] for key in pred_start_times.keys()] 238 | results[video_path]["duration"] = durations 239 | 240 | # Convert predicted intervals to frames and compute mAP 241 | pred_intervals = [ 242 | ( 243 | time_to_frame(pred_start_times[key], fps), 244 | time_to_frame(pred_end_times[key], fps) 245 | ) 246 | for key in pred_start_times.keys() 247 | ] 248 | gt_intervals = video_entry['gt_time'] 249 | map_value = compute_map(pred_intervals, gt_intervals, tiou_thresholds) 250 | map_list.append(map_value) 251 | 252 | action_steps.append(len(video_entry['action'])) 253 | 254 | # Compute total frames from ground truth (adjust for 0-index) 255 | gt_intervals_zero_indexed = [(start - 1, end - 1) for start, end in video_entry['gt_time']] 256 | total_frames = gt_intervals_zero_indexed[-1][1] + 1 257 | action_frames.append(total_frames) 258 | 259 | metrics = { 260 | "MoF": mof_list, 261 | "mIoU": miou_list, 262 | "mF1": mf1_list, 263 | "mAP": map_list, 264 | "action_steps": action_steps, 265 | "action_frames": action_frames 266 | } 267 | return results, metrics 268 | 269 | 270 | def plot_metrics(metrics, output_path): 271 | """ 272 | Generate scatter plots for the evaluation metrics and save the figure. 273 | 274 | Args: 275 | metrics (dict): Dictionary containing lists of metrics. 276 | output_path (str): Path to save the output plot image. 
277 | """ 278 | plt.figure(figsize=(12, 6)) 279 | 280 | # Plot metrics against action steps 281 | ax1 = plt.subplot(2, 4, 1) 282 | plt.scatter(metrics["action_steps"], metrics["MoF"], alpha=0.6) 283 | plt.xlabel('Action Length (steps)') 284 | plt.ylabel('MoF') 285 | plt.ylim(0, 1) 286 | plt.title('Action Length vs MoF') 287 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7) 288 | 289 | ax2 = plt.subplot(2, 4, 2) 290 | plt.scatter(metrics["action_steps"], metrics["mIoU"], alpha=0.6) 291 | plt.xlabel('Action Length (steps)') 292 | plt.ylabel('mIoU') 293 | plt.ylim(0, 1) 294 | plt.title('Action Length vs mIoU') 295 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7) 296 | 297 | ax3 = plt.subplot(2, 4, 3) 298 | plt.scatter(metrics["action_steps"], metrics["mF1"], alpha=0.6) 299 | plt.xlabel('Action Length (steps)') 300 | plt.ylabel('mF1') 301 | plt.ylim(0, 1) 302 | plt.title('Action Length vs mF1') 303 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7) 304 | 305 | ax4 = plt.subplot(2, 4, 4) 306 | plt.scatter(metrics["action_steps"], metrics["mAP"], alpha=0.6) 307 | plt.xlabel('Action Length (steps)') 308 | plt.ylabel('mAP') 309 | plt.ylim(0, 1) 310 | plt.title('Action Length vs mAP') 311 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7) 312 | 313 | # Plot metrics against action frames 314 | ax5 = plt.subplot(2, 4, 5) 315 | plt.scatter(metrics["action_frames"], metrics["MoF"], alpha=0.6) 316 | plt.xlabel('Action Length (frames)') 317 | plt.ylabel('MoF') 318 | plt.ylim(0, 1) 319 | plt.title('Frames vs MoF') 320 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7) 321 | 322 | ax6 = plt.subplot(2, 4, 6) 323 | plt.scatter(metrics["action_frames"], metrics["mIoU"], alpha=0.6) 324 | plt.xlabel('Action Length (frames)') 325 | plt.ylabel('mIoU') 326 | plt.ylim(0, 1) 327 | plt.title('Frames vs mIoU') 328 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7) 329 | 330 | ax7 = plt.subplot(2, 4, 7) 331 | plt.scatter(metrics["action_frames"], metrics["mF1"], alpha=0.6) 332 | plt.xlabel('Action Length (frames)') 333 | plt.ylabel('mF1') 334 | plt.ylim(0, 1) 335 | plt.title('Frames vs mF1') 336 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7) 337 | 338 | ax8 = plt.subplot(2, 4, 8) 339 | plt.scatter(metrics["action_frames"], metrics["mAP"], alpha=0.6) 340 | plt.xlabel('Action Length (frames)') 341 | plt.ylabel('mAP') 342 | plt.ylim(0, 1) 343 | plt.title('Frames vs mAP') 344 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7) 345 | 346 | plt.tight_layout() 347 | plt.savefig(output_path) 348 | plt.close() 349 | 350 | 351 | def main(): 352 | args = parse_arguments() 353 | tiou_thresholds = [0.3, 0.4, 0.5, 0.6, 0.7] 354 | input_filename = args.file 355 | 356 | # Load estimated label data from JSON 357 | with open(input_filename, "r") as f: 358 | label_data_estimates = json.load(f) 359 | 360 | # Process each video entry to compute metrics 361 | results, metrics = process_videos(label_data_estimates, tiou_thresholds) 362 | 363 | # Prepare output directories 364 | out_dir = args.outdir 365 | base_filename = os.path.splitext(os.path.basename(input_filename))[0] 366 | parent_dir = os.path.basename(os.path.dirname(input_filename)) 367 | output_dir = os.path.join(out_dir, parent_dir) 368 | os.makedirs(output_dir, exist_ok=True) 369 | plot_output_path = os.path.join(output_dir, base_filename + ".png") 370 | 371 | # Plot and save the evaluation metrics 
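    # (the figure goes to <outdir>/<input's parent dir>/<input basename>.png;
    # the per-video results JSON is written alongside it below)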
372 | plot_metrics(metrics, plot_output_path) 373 | 374 | # Print mean metric values 375 | mean_mof = sum(metrics["MoF"]) / len(metrics["MoF"]) if metrics["MoF"] else 0 376 | mean_miou = sum(metrics["mIoU"]) / len(metrics["mIoU"]) if metrics["mIoU"] else 0 377 | mean_mf1 = sum(metrics["mF1"]) / len(metrics["mF1"]) if metrics["mF1"] else 0 378 | mean_map = sum(metrics["mAP"]) / len(metrics["mAP"]) if metrics["mAP"] else 0 379 | 380 | print("Mean MoF: {:.4f}".format(mean_mof)) 381 | print("Mean mIoU: {:.4f}".format(mean_miou)) 382 | print("Mean mF1: {:.4f}".format(mean_mf1)) 383 | print("Mean mAP: {:.4f}".format(mean_map)) 384 | print("Processed videos:", len(metrics["MoF"])) 385 | 386 | # Save detailed results as JSON 387 | results_output_path = os.path.join(output_dir, base_filename + ".json") 388 | with open(results_output_path, "w") as f: 389 | json.dump(results, f, indent=4) 390 | 391 | 392 | if __name__ == "__main__": 393 | main() 394 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import base64 3 | from openai import OpenAI, AzureOpenAI 4 | import os 5 | import numpy as np 6 | import json 7 | import dotenv 8 | import time 9 | import argparse 10 | import openai 11 | 12 | 13 | # Resize the image while keeping aspect ratio 14 | def image_resize_for_vlm(frame, inter=cv2.INTER_AREA): 15 | height, width = frame.shape[:2] 16 | aspect_ratio = width / height 17 | max_short_side = 768 18 | max_long_side = 2000 19 | if aspect_ratio > 1: 20 | new_width = min(width, max_long_side) 21 | new_height = int(new_width / aspect_ratio) 22 | if new_height > max_short_side: 23 | new_height = max_short_side 24 | new_width = int(new_height * aspect_ratio) 25 | else: 26 | new_height = min(height, max_long_side) 27 | new_width = int(new_height * aspect_ratio) 28 | if new_width > max_short_side: 29 | new_width = max_short_side 30 | new_height = int(new_width / aspect_ratio) 31 | resized_frame = cv2.resize( 32 | frame, (new_width, new_height), interpolation=inter) 33 | return resized_frame 34 | 35 | # Extract JSON part from the response 36 | def extract_json_part(text): 37 | text = text.strip().replace(" ", "").replace("\n", "") 38 | try: 39 | start = text.index('{"points":') 40 | text_json = text[start:].strip() 41 | end = text_json.index('}') + 1 42 | text_json = text_json[:end].strip() 43 | return text_json 44 | except ValueError: 45 | raise ValueError("JSON part not found in the response") 46 | 47 | # Perform scene understanding on the frame 48 | def scene_understanding(credentials, frame, prompt_message): 49 | frame = image_resize_for_vlm(frame) 50 | _, buffer = cv2.imencode(".jpg", frame) 51 | base64Frame = base64.b64encode(buffer).decode("utf-8") 52 | PROMPT_MESSAGES = [ 53 | { 54 | "role": "user", 55 | "content": [ 56 | { 57 | "type": "text", 58 | "text": prompt_message 59 | }, 60 | { 61 | "type": "image_url", 62 | "image_url": { 63 | "url": f"data:image/jpeg;base64,{base64Frame}", 64 | "detail": "high" 65 | }, 66 | } 67 | ] 68 | }, 69 | ] 70 | 71 | if len(credentials["AZURE_OPENAI_API_KEY"]) == 0: 72 | client_gpt4v = OpenAI( 73 | api_key=credentials["OPENAI_API_KEY"] 74 | ) 75 | params = { 76 | "model": "gpt-4o", 77 | "messages": PROMPT_MESSAGES, 78 | "max_tokens": 200, 79 | "temperature": 0.1, 80 | "top_p": 0.5, 81 | "frequency_penalty": 0.0, 82 | "presence_penalty": 0.0, 83 | } 84 | else: 85 | client_gpt4v = AzureOpenAI( 86 | api_version="2024-02-01", 87 | 
azure_endpoint=credentials["AZURE_OPENAI_ENDPOINT"],
88 |             api_key=credentials["AZURE_OPENAI_API_KEY"]
89 |         )
90 |         params = {
91 |             "model": credentials["AZURE_OPENAI_DEPLOYMENT_NAME"],
92 |             "messages": PROMPT_MESSAGES,
93 |             "max_tokens": 200,
94 |             "temperature": 0.1,
95 |             "top_p": 0.5,
96 |             "frequency_penalty": 0.0,
97 |             "presence_penalty": 0.0,
98 |         }
99 |     count = 0  # retry counter; give up after five failed attempts
100 |     while True:
101 |         if count > 5:
102 |             raise Exception("Failed to get response from the OpenAI / Azure OpenAI API")
103 |         try:
104 |             result = client_gpt4v.chat.completions.create(**params)
105 |             response_json = extract_json_part(result.choices[0].message.content)
106 |             break
107 |         except openai.BadRequestError as e:
108 |             print(e)
109 |             print('Bad Request error.')
110 |             return None, None
111 |         except openai.RateLimitError as e:
112 |             print(e)
113 |             print('Rate Limit. Waiting for 5 seconds...')
114 |             time.sleep(5)
115 |             count += 1
116 |         except openai.APIStatusError as e:
117 |             print(e)
118 |             print('APIStatusError. Waiting for 1 second...')
119 |             time.sleep(1)
120 |             count += 1
121 |         except Exception as e:
122 |             print(e)
123 |             print('Other error. Waiting for 1 second...')
124 |             time.sleep(1)
125 |             count += 1
126 | 
127 |     json_dict = json.loads(response_json, strict=False)
128 |     if len(json_dict['points']) == 0:
129 |         return None, None  # keep the (point, reason) pair expected by callers that unpack two values
130 |     if len(json_dict['points']) > 1:
131 |         print("Warning: More than one point detected")
132 |     return json_dict['points'][0], result.choices[0].message.content
133 | 
134 | 
135 | def image_resize(image, width=None, height=None, inter=cv2.INTER_AREA):
136 |     dim = None
137 |     (h, w) = image.shape[:2]
138 |     if width is None and height is None:
139 |         return image
140 |     if width is None:
141 |         r = height / float(h)
142 |         dim = (int(w * r), height)
143 |     else:
144 |         r = width / float(w)
145 |         dim = (width, int(h * r))
146 |     resized = cv2.resize(image, dim, interpolation=inter)
147 |     return resized
148 | 
149 | 
150 | # Create a grid of frames
151 | def create_frame_grid(video_path, center_time, interval, grid_size):
152 |     spacer = 0
153 |     video = cv2.VideoCapture(video_path)
154 |     fps = video.get(cv2.CAP_PROP_FPS)
155 |     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) - 1
156 |     center_frame = int(center_time * fps)
157 |     interval_frames = int(interval * fps)
158 |     num_frames = grid_size**2
159 |     half_num_frames = num_frames // 2
160 |     frame_indices = [max(0,
161 |                          min(center_frame + i * interval_frames,
162 |                              total_frames - 1)) for i in range(-half_num_frames,
163 |                                                                half_num_frames + 1)]
164 |     frames = []
165 |     actual_indices = []
166 |     for index in frame_indices:
167 |         video.set(cv2.CAP_PROP_POS_FRAMES, index)
168 |         success, frame = video.read()
169 |         if success:
170 |             frame = image_resize(frame, width=200)
171 |             frames.append(frame)
172 |             actual_indices.append(index)
173 |         else:
174 |             print(f"Warning: Frame {index} not found")
175 |             print(f"Total frames: {total_frames}")
176 |             video.set(cv2.CAP_PROP_POS_FRAMES, 0)
177 |             success, frame = video.read()
178 |             frame = image_resize(frame, width=200)
179 |             frame = frame * 0
180 |             frames.append(frame)
181 |             actual_indices.append(index)
182 |     video.release()
183 | 
184 |     if len(frames) < grid_size**2:
185 |         raise ValueError("Not enough frames to create the grid.")
186 | 
187 |     frame_height, frame_width = frames[0].shape[:2]
188 | 
189 |     grid_height = grid_size * frame_height + (grid_size - 1) * spacer
190 |     grid_width = grid_size * frame_width + (grid_size - 1) * spacer
191 | 
192 |     grid_img = np.ones((grid_height, grid_width, 3), dtype=np.uint8) * 255
193 | 
194 |     for i in
range(grid_size): 195 | for j in range(grid_size): 196 | index = i * grid_size + j 197 | frame = frames[index] 198 | cX, cY = frame.shape[1] // 2, frame.shape[0] // 2 199 | max_dim = int(min(frame.shape[:2]) * 0.5) 200 | overlay = frame.copy() 201 | if render_pos == 'center': 202 | circle_center = (cX, cY) 203 | else: 204 | circle_center = (frame.shape[1] - max_dim // 2, max_dim // 2) 205 | cv2.circle(overlay, circle_center, 206 | max_dim // 2, (255, 255, 255), -1) 207 | alpha = 0.3 208 | frame = cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0) 209 | cv2.circle(frame, circle_center, max_dim // 2, (255, 255, 255), 2) 210 | font_scale = max_dim / 50 211 | text_size = cv2.getTextSize( 212 | str(index + 1), cv2.FONT_HERSHEY_SIMPLEX, font_scale, 2)[0] 213 | if render_pos == 'center': 214 | text_x = cX - text_size[0] // 2 215 | text_y = cY + text_size[1] // 2 216 | else: 217 | text_x = frame.shape[1] - text_size[0] // 2 - max_dim // 2 218 | text_y = text_size[1] // 2 + max_dim // 2 219 | cv2.putText(frame, str(index + 1), (text_x, text_y), 220 | cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0, 0, 0), 2) 221 | y1 = i * (frame_height + spacer) 222 | y2 = y1 + frame_height 223 | x1 = j * (frame_width + spacer) 224 | x2 = x1 + frame_width 225 | grid_img[y1:y2, x1:x2] = frame 226 | 227 | return grid_img, actual_indices 228 | 229 | 230 | def add_text_with_background( 231 | frame, 232 | text, 233 | position, 234 | font, 235 | font_scale, 236 | font_color, 237 | font_thickness, 238 | bg_color): 239 | text_size, _ = cv2.getTextSize(text, font, font_scale, font_thickness) 240 | text_x, text_y = position 241 | top_left = (text_x - 10, text_y - text_size[1] - 10) 242 | bottom_right = (text_x + text_size[0] + 10, text_y + 10) 243 | cv2.rectangle(frame, top_left, bottom_right, bg_color, -1) 244 | cv2.putText(frame, text, (text_x, text_y), font, font_scale, 245 | font_color, font_thickness, cv2.LINE_AA) 246 | 247 | # Annotate the video with task times 248 | def trim_video_with_annotations( 249 | video_path, 250 | start_time, 251 | end_time, 252 | text, 253 | output_path, 254 | buffer=0.5): 255 | """Trim and annotate video with specified start and end times and text.""" 256 | if os.path.exists(output_path): 257 | return 258 | cap = cv2.VideoCapture(video_path) 259 | if not cap.isOpened(): 260 | print(f"Error: Could not open video file {video_path}") 261 | return 262 | 263 | fps = cap.get(cv2.CAP_PROP_FPS) 264 | width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 265 | height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 266 | fourcc = cv2.VideoWriter_fourcc(*'mp4v') 267 | out = cv2.VideoWriter(output_path, fourcc, fps, (width, height)) 268 | 269 | start_frame = int(start_time * fps) 270 | end_frame = int(end_time * fps) 271 | cap.set(cv2.CAP_PROP_POS_FRAMES, max(0, start_frame - int(buffer * fps))) 272 | 273 | while cap.isOpened(): 274 | ret, frame = cap.read() 275 | if not ret or cap.get( 276 | cv2.CAP_PROP_POS_FRAMES) > end_frame + int(buffer * fps): 277 | break 278 | if start_frame <= cap.get(cv2.CAP_PROP_POS_FRAMES) <= end_frame: 279 | add_text_with_background( 280 | frame, 281 | text, 282 | (10, 283 | height - 10), 284 | cv2.FONT_HERSHEY_SIMPLEX, 285 | 1, 286 | (0, 287 | 0, 288 | 255), 289 | 2, 290 | (255, 291 | 255, 292 | 255)) 293 | out.write(frame) 294 | 295 | cap.release() 296 | out.release() 297 | 298 | # Process each task in parallel 299 | def process_task( 300 | credentials, 301 | video_path, 302 | action, 303 | center_time, 304 | interval, 305 | fps, 306 | grid_size, 307 | search_anchor, 308 | iter_num=4): 309 
| """Process a task to identify the start or end of an action in a video.""" 310 | prompt_start = ( 311 | f"I will show an image sequence of human cooking. " 312 | f"I have annotated the images with numbered circles. " 313 | f"Choose the number that is closest to the moment when the ({action}) has started. " 314 | f"You are a five-time world champion in this game. " 315 | f"Give a one sentence analysis of why you chose those points (less than 50 words). " 316 | f"If you consider that the action is not in the video, please choose the number -1. " 317 | f"Provide your answer at the end in a json file of this format: {{\"points\": []}}" 318 | ) 319 | 320 | prompt_end = ( 321 | f"I will show an image sequence of human cooking. " 322 | f"I have annotated the images with numbered circles. " 323 | f"Choose the number that is closest to the moment when the ({action}) has ended. " 324 | f"You are a five-time world champion in this game. " 325 | f"Give a one sentence analysis of why you chose those points (less than 50 words). " 326 | f"If you consider that the action has not ended yet, please choose the number -1. " 327 | f"Provide your answer at the end in a json file of this format: {{\"points\": []}}" 328 | ) 329 | prompt_message = prompt_start if search_anchor == 'start' else prompt_end 330 | for iter_idx in range(iter_num): # Iterate to narrow down the time 331 | image, used_frame_indices = create_frame_grid( 332 | video_path, center_time, interval, grid_size) 333 | print(used_frame_indices) 334 | if iter_idx == 0: 335 | cv2.imwrite( 336 | os.path.join( 337 | output_folder, 338 | f"grid_image_sample.png"), 339 | image) 340 | description, reason = scene_understanding( 341 | credentials, image, prompt_message) 342 | print(reason) 343 | if description: 344 | if description == -1: 345 | return None 346 | if int(description) - 1 > len(used_frame_indices) - 1: 347 | print("Warning: Invalid frame index selected") 348 | print(f"Selected frame index: {description}") 349 | # description is 1-indexed 350 | index_specified = max( 351 | min(int(description) - 1, len(used_frame_indices) - 1), 0) 352 | selected_frame_index = used_frame_indices[index_specified] 353 | center_time = selected_frame_index / fps # Convert frame index back to time 354 | print( 355 | f"Selected frame index: {selected_frame_index}, sample time duration: {interval}") 356 | interval /= 2 357 | if int(interval * fps) == 0: 358 | break 359 | return center_time 360 | 361 | 362 | def convert_video(video_file_path: str, action: str, credentials, grid_size: int): 363 | video = cv2.VideoCapture(video_file_path) 364 | fps = video.get(cv2.CAP_PROP_FPS) 365 | total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) 366 | print(f"Total frames: {total_frames}") 367 | duration = float(total_frames) / fps 368 | center_time = duration / 2 369 | interval = duration / (grid_size**2 - 1) 370 | result_start = process_task( 371 | credentials, 372 | video_file_path, 373 | action, 374 | center_time, 375 | interval, 376 | fps, 377 | grid_size, 378 | search_anchor='start') 379 | if result_start is None: 380 | return None, None 381 | total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT) 382 | ) - int(result_start * fps) 383 | duration = float(total_frames) / fps 384 | center_time = duration / 2 + result_start 385 | interval = max(duration / (grid_size**2 - 1), 1.0 / fps) 386 | result_end = process_task( 387 | credentials, 388 | video_file_path, 389 | action, 390 | center_time, 391 | interval, 392 | fps, 393 | grid_size, 394 | search_anchor='end') 395 | if 
result_end is None:
396 |         return None, None
397 |     video.release()
398 |     return result_start, result_end
399 | 
400 | 
401 | parser = argparse.ArgumentParser()
402 | parser.add_argument("--credentials", help="credentials file")
403 | parser.add_argument("--grid", help="grid size", default=3)
404 | parser.add_argument(
405 |     "--video_path",
406 |     help="video path",
407 |     default="sample_video/sample.mp4")
408 | parser.add_argument(
409 |     "--action",
410 |     help="action label",
411 |     default="grabbing towards the can")
412 | pargs, unknown = parser.parse_known_args()
413 | credentials = dotenv.dotenv_values(pargs.credentials)
414 | required_keys = ["OPENAI_API_KEY", "AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT"]
415 | if not all(key in credentials for key in required_keys):
416 |     raise ValueError("Required keys are missing in the credentials file")
417 | render_pos = 'topright'  # center or topright
418 | grid_size = int(pargs.grid)
419 | video_path = pargs.video_path
420 | action = pargs.action
421 | folder_name = action.replace(" ", "_")
422 | output_folder = f"results/{folder_name}"
423 | os.makedirs(output_folder, exist_ok=True)
424 | if __name__ == "__main__":
425 |     if os.path.exists(video_path):
426 |         print(f"Processing {video_path}")
427 |         start_time, completed_time = convert_video(
428 |             video_path, action, credentials, grid_size)
429 |         print(f"Start time: {start_time}, End time: {completed_time}")
430 |         if start_time is not None and completed_time is not None:
431 |             # Build the file name without a multi-line f-string:
432 |             # f-string expressions may not span lines before Python 3.12.
433 |             action_tag = action.replace(' ', '_')
434 |             start_tag = round(start_time, 2)
435 |             end_tag = round(completed_time, 2)
436 |             output_file_name = (
437 |                 f"{action_tag}_segment_"
438 |                 f"{start_tag}_"
439 |                 f"{end_tag}.mp4"
440 |             )
441 |             output_file_path = os.path.join(output_folder, output_file_name)
442 |             trim_video_with_annotations(
443 |                 video_path,
444 |                 start_time,
445 |                 completed_time,
446 |                 action,
447 |                 output_file_path)
--------------------------------------------------------------------------------
/finegrained-breakfast-dataset/label_data_gt_right.json:
--------------------------------------------------------------------------------
1 | [
2 |     {
3 |         "action": [
4 |             "Grasp with the right hand",
5 |             "Picking with the right hand",
6 |             "Bringing with the right hand",
7 |             "Putting on with the right hand",
8 |             "Release from the right hand"
9 |         ],
10 |         "gt_time": [
11 |             [
12 |                 0,
13 |                 23
14 |             ],
15 |             [
16 |                 24,
17 |                 48
18 |             ],
19 |             [
20 |                 49,
21 |                 86
22 |             ],
23 |             [
24 |                 87,
25 |                 112
26 |             ],
27 |             [
28 |                 113,
29 |                 122
30 |             ]
31 |         ],
32 |         "video_path": "original_videos\\subject_1_gopro_seg_1_2162-2284.mp4"
33 |     },
34 |     {
35 |         "action": [
36 |             "Grasp with the right hand",
37 |             "Picking with the right hand",
38 |             "Bringing with the right hand",
39 |             "Putting on with the right hand",
40 |             "Release from the right hand"
41 |         ],
42 |         "gt_time": [
43 |             [
44 |                 0,
45 |                 17
46 |             ],
47 |             [
48 |                 18,
49 |                 37
50 |             ],
51 |             [
52 |                 38,
53 |                 106
54 |             ],
55 |             [
56 |                 107,
57 |                 135
58 |             ],
59 |             [
60 |                 136,
61 |                 147
62 |             ]
63 |         ],
64 |         "video_path": "original_videos\\subject_1_gopro_seg_1_2306-2453.mp4"
65 |     },
66 |     {
67 |         "action": [
68 |             "Grasp with the right hand",
69 |             "Picking with the right hand",
70 |             "Putting on with the right hand",
71 |             "Release from the right hand"
72 |         ],
73 |         "gt_time": [
74 |             [
75 |                 0,
76 |                 10
77 |             ],
78 |             [
79 |                 11,
80 |                 21
81 |             ],
82 |             [
83 |                 22,
84 |                 48
85 |             ],
86 |             [
87 |                 49,
88 |                 60
89 |             ]
90 |         ],
91 |         "video_path": "original_videos\\subject_1_gopro_seg_1_2487-2547.mp4"
92 |     },
93 |     {
94 |         "action": [
95 |             "Grasp with the right hand",
96 |             "Picking with the right hand",
97 | "Holding with the right hand", 98 | "Putting on with the right hand", 99 | "Release from the right hand" 100 | ], 101 | "gt_time": [ 102 | [ 103 | 0, 104 | 14 105 | ], 106 | [ 107 | 15, 108 | 39 109 | ], 110 | [ 111 | 40, 112 | 103 113 | ], 114 | [ 115 | 104, 116 | 142 117 | ], 118 | [ 119 | 143, 120 | 152 121 | ] 122 | ], 123 | "video_path": "original_videos\\subject_1_gopro_seg_1_2842-2994.mp4" 124 | }, 125 | { 126 | "action": [ 127 | "Hand over from the left hand to the right hand", 128 | "Holding with the right hand", 129 | "Release from the right hand" 130 | ], 131 | "gt_time": [ 132 | [ 133 | 0, 134 | 13 135 | ], 136 | [ 137 | 14, 138 | 41 139 | ], 140 | [ 141 | 42, 142 | 53 143 | ] 144 | ], 145 | "video_path": "original_videos\\subject_1_gopro_seg_1_3511-3564.mp4" 146 | }, 147 | { 148 | "action": [ 149 | "Grasp with the right hand", 150 | "Cracking an egg with the right hand", 151 | "Pouring with the right hand", 152 | "Holding with the right hand", 153 | "Putting on with the right hand", 154 | "Release from the right hand" 155 | ], 156 | "gt_time": [ 157 | [ 158 | 0, 159 | 10 160 | ], 161 | [ 162 | 11, 163 | 38 164 | ], 165 | [ 166 | 39, 167 | 163 168 | ], 169 | [ 170 | 164, 171 | 191 172 | ], 173 | [ 174 | 192, 175 | 213 176 | ], 177 | [ 178 | 214, 179 | 224 180 | ] 181 | ], 182 | "video_path": "original_videos\\subject_1_gopro_seg_2_1895-2119.mp4" 183 | }, 184 | { 185 | "action": [ 186 | "Grasp with the right hand", 187 | "Cracking an egg with the right hand", 188 | "Pouring with the right hand", 189 | "Holding with the right hand", 190 | "Putting on with the right hand", 191 | "Release from the right hand" 192 | ], 193 | "gt_time": [ 194 | [ 195 | 0, 196 | 10 197 | ], 198 | [ 199 | 11, 200 | 57 201 | ], 202 | [ 203 | 58, 204 | 168 205 | ], 206 | [ 207 | 169, 208 | 179 209 | ], 210 | [ 211 | 180, 212 | 202 213 | ], 214 | [ 215 | 203, 216 | 212 217 | ] 218 | ], 219 | "video_path": "original_videos\\subject_1_gopro_seg_2_2287-2499.mp4" 220 | }, 221 | { 222 | "action": [ 223 | "Grasp with the right hand", 224 | "Picking with the right hand", 225 | "Starting rotary motion with the right hand while it is restrained.", 226 | "Putting on with the right hand", 227 | "Release from the right hand" 228 | ], 229 | "gt_time": [ 230 | [ 231 | 0, 232 | 10 233 | ], 234 | [ 235 | 11, 236 | 32 237 | ], 238 | [ 239 | 33, 240 | 48 241 | ], 242 | [ 243 | 49, 244 | 59 245 | ], 246 | [ 247 | 60, 248 | 72 249 | ] 250 | ], 251 | "video_path": "original_videos\\subject_1_gopro_seg_2_2525-2597.mp4" 252 | }, 253 | { 254 | "action": [ 255 | "Grasp with the right hand", 256 | "Picking with the right hand", 257 | "Rotary motion with the right hand until it cannot be rotated", 258 | "Putting on with the right hand", 259 | "Release from the right hand" 260 | ], 261 | "gt_time": [ 262 | [ 263 | 0, 264 | 21 265 | ], 266 | [ 267 | 22, 268 | 53 269 | ], 270 | [ 271 | 54, 272 | 116 273 | ], 274 | [ 275 | 117, 276 | 140 277 | ], 278 | [ 279 | 141, 280 | 152 281 | ] 282 | ], 283 | "video_path": "original_videos\\subject_1_gopro_seg_2_2953-3105.mp4" 284 | }, 285 | { 286 | "action": [ 287 | "Grasp with the right hand", 288 | "Putting on with the right hand", 289 | "Release from the right hand" 290 | ], 291 | "gt_time": [ 292 | [ 293 | 0, 294 | 20 295 | ], 296 | [ 297 | 21, 298 | 39 299 | ], 300 | [ 301 | 40, 302 | 51 303 | ] 304 | ], 305 | "video_path": "original_videos\\subject_1_gopro_seg_2_3324-3375.mp4" 306 | }, 307 | { 308 | "action": [ 309 | "Grasp with the right hand", 310 | "Picking with the right hand", 311 
| "Putting on with the right hand", 312 | "Release from the right hand" 313 | ], 314 | "gt_time": [ 315 | [ 316 | 0, 317 | 15 318 | ], 319 | [ 320 | 16, 321 | 40 322 | ], 323 | [ 324 | 41, 325 | 68 326 | ], 327 | [ 328 | 69, 329 | 78 330 | ] 331 | ], 332 | "video_path": "original_videos\\subject_1_gopro_seg_2_3467-3545.mp4" 333 | }, 334 | { 335 | "action": [ 336 | "Grasp with the right hand", 337 | "Starting rotary motion with the right hand while it is restrained.", 338 | "Holding with the right hand", 339 | "Rotary motion with the right hand until it cannot be rotated", 340 | "Release from the right hand" 341 | ], 342 | "gt_time": [ 343 | [ 344 | 0, 345 | 10 346 | ], 347 | [ 348 | 11, 349 | 29 350 | ], 351 | [ 352 | 30, 353 | 70 354 | ], 355 | [ 356 | 71, 357 | 92 358 | ], 359 | [ 360 | 93, 361 | 103 362 | ] 363 | ], 364 | "video_path": "original_videos\\subject_10_gopro_seg_1_1877-1980.mp4" 365 | }, 366 | { 367 | "action": [ 368 | "Grasp with the right hand", 369 | "Starting rotary motion with the right hand while it is restrained.", 370 | "Release from the right hand" 371 | ], 372 | "gt_time": [ 373 | [ 374 | 0, 375 | 10 376 | ], 377 | [ 378 | 11, 379 | 22 380 | ], 381 | [ 382 | 23, 383 | 28 384 | ] 385 | ], 386 | "video_path": "original_videos\\subject_10_gopro_seg_1_1997-2025.mp4" 387 | }, 388 | { 389 | "action": [ 390 | "Grasp with the right hand", 391 | "Starting rotary motion with the right hand while it is restrained.", 392 | "Release from the right hand" 393 | ], 394 | "gt_time": [ 395 | [ 396 | 0, 397 | 10 398 | ], 399 | [ 400 | 11, 401 | 17 402 | ], 403 | [ 404 | 18, 405 | 24 406 | ] 407 | ], 408 | "video_path": "original_videos\\subject_10_gopro_seg_1_2028-2052.mp4" 409 | }, 410 | { 411 | "action": [ 412 | "Grasp with the right hand", 413 | "Picking with the right hand", 414 | "Putting on with the right hand", 415 | "Release from the right hand" 416 | ], 417 | "gt_time": [ 418 | [ 419 | 0, 420 | 22 421 | ], 422 | [ 423 | 23, 424 | 117 425 | ], 426 | [ 427 | 118, 428 | 131 429 | ], 430 | [ 431 | 132, 432 | 140 433 | ] 434 | ], 435 | "video_path": "original_videos\\subject_10_gopro_seg_2_2391-2531.mp4" 436 | }, 437 | { 438 | "action": [ 439 | "Grasp with the right hand", 440 | "Holding with the right hand", 441 | "Release from the right hand" 442 | ], 443 | "gt_time": [ 444 | [ 445 | 0, 446 | 11 447 | ], 448 | [ 449 | 12, 450 | 88 451 | ], 452 | [ 453 | 89, 454 | 100 455 | ] 456 | ], 457 | "video_path": "original_videos\\subject_10_gopro_seg_2_2532-2632.mp4" 458 | }, 459 | { 460 | "action": [ 461 | "Grasp with the right hand", 462 | "Picking with the right hand", 463 | "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.", 464 | "Striking something with the right hand or an object held in the right hand.", 465 | "Putting on with the right hand", 466 | "Release from the right hand" 467 | ], 468 | "gt_time": [ 469 | [ 470 | 0, 471 | 10 472 | ], 473 | [ 474 | 11, 475 | 22 476 | ], 477 | [ 478 | 23, 479 | 248 480 | ], 481 | [ 482 | 249, 483 | 268 484 | ], 485 | [ 486 | 269, 487 | 279 488 | ], 489 | [ 490 | 280, 491 | 289 492 | ] 493 | ], 494 | "video_path": "original_videos\\subject_10_gopro_seg_2_2826-3115.mp4" 495 | }, 496 | { 497 | "action": [ 498 | "Grasp with the right hand", 499 | "Picking with the right hand", 500 | "Bringing with the right hand", 501 | "Striking something with the right hand or an object held in the right hand.", 502 | "Bringing with the right hand", 503 | "Pouring with the right hand", 504 | "Holding 
with the right hand", 505 | "Bringing with the right hand", 506 | "Putting on with the right hand", 507 | "Release from the right hand" 508 | ], 509 | "gt_time": [ 510 | [ 511 | 0, 512 | 13 513 | ], 514 | [ 515 | 14, 516 | 22 517 | ], 518 | [ 519 | 23, 520 | 34 521 | ], 522 | [ 523 | 35, 524 | 53 525 | ], 526 | [ 527 | 54, 528 | 68 529 | ], 530 | [ 531 | 69, 532 | 132 533 | ], 534 | [ 535 | 133, 536 | 194 537 | ], 538 | [ 539 | 195, 540 | 209 541 | ], 542 | [ 543 | 210, 544 | 218 545 | ], 546 | [ 547 | 219, 548 | 234 549 | ] 550 | ], 551 | "video_path": "original_videos\\subject_11_gopro_seg_1_12864-13098.mp4" 552 | }, 553 | { 554 | "action": [ 555 | "Grasp with the right hand", 556 | "Picking with the right hand", 557 | "Striking something with the right hand or an object held in the right hand.", 558 | "Picking with the right hand", 559 | "Pouring with the right hand", 560 | "Holding with the right hand", 561 | "Bringing with the right hand", 562 | "Putting on with the right hand", 563 | "Release from the right hand" 564 | ], 565 | "gt_time": [ 566 | [ 567 | 0, 568 | 13 569 | ], 570 | [ 571 | 14, 572 | 23 573 | ], 574 | [ 575 | 24, 576 | 35 577 | ], 578 | [ 579 | 36, 580 | 85 581 | ], 582 | [ 583 | 86, 584 | 121 585 | ], 586 | [ 587 | 122, 588 | 172 589 | ], 590 | [ 591 | 173, 592 | 190 593 | ], 594 | [ 595 | 191, 596 | 201 597 | ], 598 | [ 599 | 202, 600 | 207 601 | ] 602 | ], 603 | "video_path": "original_videos\\subject_11_gopro_seg_1_13099-13306.mp4" 604 | }, 605 | { 606 | "action": [ 607 | "Grasp with the right hand", 608 | "Picking with the right hand", 609 | "Bringing with the right hand", 610 | "Striking something with the right hand or an object held in the right hand.", 611 | "Bringing with the right hand", 612 | "Pouring with the right hand", 613 | "Holding with the right hand", 614 | "Bringing with the right hand", 615 | "Putting on with the right hand", 616 | "Release from the right hand" 617 | ], 618 | "gt_time": [ 619 | [ 620 | 0, 621 | 11 622 | ], 623 | [ 624 | 12, 625 | 21 626 | ], 627 | [ 628 | 22, 629 | 27 630 | ], 631 | [ 632 | 28, 633 | 38 634 | ], 635 | [ 636 | 39, 637 | 70 638 | ], 639 | [ 640 | 71, 641 | 91 642 | ], 643 | [ 644 | 92, 645 | 130 646 | ], 647 | [ 648 | 131, 649 | 160 650 | ], 651 | [ 652 | 161, 653 | 171 654 | ], 655 | [ 656 | 172, 657 | 184 658 | ] 659 | ], 660 | "video_path": "original_videos\\subject_11_gopro_seg_1_13307-13491.mp4" 661 | }, 662 | { 663 | "action": [ 664 | "Grasp with the right hand", 665 | "Moving an object held in the right hand in and out of a narrow space.", 666 | "Release from the right hand" 667 | ], 668 | "gt_time": [ 669 | [ 670 | 0, 671 | 40 672 | ], 673 | [ 674 | 41, 675 | 68 676 | ], 677 | [ 678 | 69, 679 | 114 680 | ] 681 | ], 682 | "video_path": "original_videos\\subject_11_gopro_seg_2_444-558.mp4" 683 | }, 684 | { 685 | "action": [ 686 | "Grasp with the right hand", 687 | "Picking with the right hand", 688 | "Bringing with the right hand", 689 | "Putting on with the right hand", 690 | "Release from the right hand" 691 | ], 692 | "gt_time": [ 693 | [ 694 | 0, 695 | 17 696 | ], 697 | [ 698 | 18, 699 | 36 700 | ], 701 | [ 702 | 37, 703 | 94 704 | ], 705 | [ 706 | 95, 707 | 107 708 | ], 709 | [ 710 | 108, 711 | 112 712 | ] 713 | ], 714 | "video_path": "original_videos\\subject_11_gopro_seg_2_786-898.mp4" 715 | }, 716 | { 717 | "action": [ 718 | "Grasp with the right hand", 719 | "Starting rotary motion with the right hand while it is restrained.", 720 | "Picking with the right hand", 721 | "Putting on with the right 
hand", 722 | "Release from the right hand" 723 | ], 724 | "gt_time": [ 725 | [ 726 | 0, 727 | 10 728 | ], 729 | [ 730 | 11, 731 | 18 732 | ], 733 | [ 734 | 19, 735 | 32 736 | ], 737 | [ 738 | 33, 739 | 44 740 | ], 741 | [ 742 | 45, 743 | 51 744 | ] 745 | ], 746 | "video_path": "original_videos\\subject_11_gopro_seg_2_903-954.mp4" 747 | }, 748 | { 749 | "action": [ 750 | "Grasp with the right hand", 751 | "Release from the right hand" 752 | ], 753 | "gt_time": [ 754 | [ 755 | 0, 756 | 9 757 | ], 758 | [ 759 | 10, 760 | 20 761 | ] 762 | ], 763 | "video_path": "original_videos\\subject_11_gopro_seg_2_955-975.mp4" 764 | }, 765 | { 766 | "action": [ 767 | "Grasp with the right hand", 768 | "Picking with the right hand", 769 | "Holding with the right hand", 770 | "Putting on with the right hand", 771 | "Release from the right hand" 772 | ], 773 | "gt_time": [ 774 | [ 775 | 0, 776 | 19 777 | ], 778 | [ 779 | 20, 780 | 54 781 | ], 782 | [ 783 | 55, 784 | 147 785 | ], 786 | [ 787 | 148, 788 | 160 789 | ], 790 | [ 791 | 161, 792 | 172 793 | ] 794 | ], 795 | "video_path": "original_videos\\subject_11_gopro_seg_2_984-1156.mp4" 796 | }, 797 | { 798 | "action": [ 799 | "Grasp with the right hand", 800 | "Picking with the right hand", 801 | "Putting on with the right hand", 802 | "Rotary motion with the right hand until it cannot be rotated", 803 | "Release from the right hand" 804 | ], 805 | "gt_time": [ 806 | [ 807 | 0, 808 | 7 809 | ], 810 | [ 811 | 8, 812 | 25 813 | ], 814 | [ 815 | 26, 816 | 51 817 | ], 818 | [ 819 | 52, 820 | 98 821 | ], 822 | [ 823 | 99, 824 | 110 825 | ] 826 | ], 827 | "video_path": "original_videos\\subject_11_gopro_seg_2_1157-1267.mp4" 828 | }, 829 | { 830 | "action": [ 831 | "Grasp with the right hand", 832 | "Picking with the right hand", 833 | "Bringing with the right hand", 834 | "Putting on with the right hand", 835 | "Release from the right hand" 836 | ], 837 | "gt_time": [ 838 | [ 839 | 0, 840 | 10 841 | ], 842 | [ 843 | 11, 844 | 30 845 | ], 846 | [ 847 | 31, 848 | 58 849 | ], 850 | [ 851 | 59, 852 | 83 853 | ], 854 | [ 855 | 84, 856 | 93 857 | ] 858 | ], 859 | "video_path": "original_videos\\subject_11_gopro_seg_2_1268-1361.mp4" 860 | }, 861 | { 862 | "action": [ 863 | "Grasp with the right hand", 864 | "Picking with the right hand", 865 | "Bringing with the right hand", 866 | "Putting on with the right hand", 867 | "Release from the right hand" 868 | ], 869 | "gt_time": [ 870 | [ 871 | 0, 872 | 10 873 | ], 874 | [ 875 | 11, 876 | 31 877 | ], 878 | [ 879 | 32, 880 | 195 881 | ], 882 | [ 883 | 196, 884 | 214 885 | ], 886 | [ 887 | 215, 888 | 226 889 | ] 890 | ], 891 | "video_path": "original_videos\\subject_11_gopro_seg_2_1377-1603.mp4" 892 | }, 893 | { 894 | "action": [ 895 | "Grasp with the right hand", 896 | "Holding with the right hand", 897 | "Striking something with the right hand or an object held in the right hand.", 898 | "Cracking an egg with the right hand", 899 | "Pouring with the right hand", 900 | "Holding with the right hand", 901 | "Bringing with the right hand", 902 | "Putting on with the right hand", 903 | "Release from the right hand" 904 | ], 905 | "gt_time": [ 906 | [ 907 | 0, 908 | 12 909 | ], 910 | [ 911 | 13, 912 | 68 913 | ], 914 | [ 915 | 69, 916 | 94 917 | ], 918 | [ 919 | 95, 920 | 105 921 | ], 922 | [ 923 | 106, 924 | 120 925 | ], 926 | [ 927 | 121, 928 | 216 929 | ], 930 | [ 931 | 217, 932 | 313 933 | ], 934 | [ 935 | 314, 936 | 332 937 | ], 938 | [ 939 | 333, 940 | 343 941 | ] 942 | ], 943 | "video_path": 
"original_videos\\subject_12_gopro_seg_1_7253-7596.mp4" 944 | }, 945 | { 946 | "action": [ 947 | "Grasp with the right hand", 948 | "Holding with the right hand", 949 | "Tracing the surface of something with the right hand or an object held in the right hand in a linear manner.", 950 | "Holding with the right hand", 951 | "Release from the right hand" 952 | ], 953 | "gt_time": [ 954 | [ 955 | 0, 956 | 14 957 | ], 958 | [ 959 | 15, 960 | 143 961 | ], 962 | [ 963 | 144, 964 | 203 965 | ], 966 | [ 967 | 204, 968 | 435 969 | ], 970 | [ 971 | 436, 972 | 443 973 | ] 974 | ], 975 | "video_path": "original_videos\\subject_12_gopro_seg_2_12602-13045.mp4" 976 | }, 977 | { 978 | "action": [ 979 | "Grasp with the right hand", 980 | "Holding with the right hand", 981 | "Release from the right hand" 982 | ], 983 | "gt_time": [ 984 | [ 985 | 0, 986 | 10 987 | ], 988 | [ 989 | 11, 990 | 40 991 | ], 992 | [ 993 | 41, 994 | 51 995 | ] 996 | ], 997 | "video_path": "original_videos\\subject_12_gopro_seg_2_13923-13974.mp4" 998 | }, 999 | { 1000 | "action": [ 1001 | "Grasp with the right hand", 1002 | "Picking with the right hand", 1003 | "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.", 1004 | "Release from the right hand" 1005 | ], 1006 | "gt_time": [ 1007 | [ 1008 | 0, 1009 | 6 1010 | ], 1011 | [ 1012 | 7, 1013 | 27 1014 | ], 1015 | [ 1016 | 28, 1017 | 90 1018 | ], 1019 | [ 1020 | 91, 1021 | 97 1022 | ] 1023 | ], 1024 | "video_path": "original_videos\\subject_12_gopro_seg_2_13975-14072.mp4" 1025 | }, 1026 | { 1027 | "action": [ 1028 | "Grasp with the right hand", 1029 | "Holding with the right hand", 1030 | "Release from the right hand" 1031 | ], 1032 | "gt_time": [ 1033 | [ 1034 | 0, 1035 | 3 1036 | ], 1037 | [ 1038 | 4, 1039 | 50 1040 | ], 1041 | [ 1042 | 51, 1043 | 60 1044 | ] 1045 | ], 1046 | "video_path": "original_videos\\subject_12_gopro_seg_2_14073-14133.mp4" 1047 | }, 1048 | { 1049 | "action": [ 1050 | "Grasp with the right hand", 1051 | "Starting rotary motion with the right hand while it is restrained.", 1052 | "Release from the right hand" 1053 | ], 1054 | "gt_time": [ 1055 | [ 1056 | 0, 1057 | 11 1058 | ], 1059 | [ 1060 | 12, 1061 | 66 1062 | ], 1063 | [ 1064 | 67, 1065 | 77 1066 | ] 1067 | ], 1068 | "video_path": "original_videos\\subject_12_gopro_seg_2_14134-14211.mp4" 1069 | }, 1070 | { 1071 | "action": [ 1072 | "Grasp with the right hand", 1073 | "Rotary motion with the right hand until it cannot be rotated", 1074 | "Release from the right hand" 1075 | ], 1076 | "gt_time": [ 1077 | [ 1078 | 0, 1079 | 9 1080 | ], 1081 | [ 1082 | 10, 1083 | 46 1084 | ], 1085 | [ 1086 | 47, 1087 | 56 1088 | ] 1089 | ], 1090 | "video_path": "original_videos\\subject_12_gopro_seg_2_14257-14313.mp4" 1091 | }, 1092 | { 1093 | "action": [ 1094 | "Grasp with the right hand", 1095 | "Holding with the right hand", 1096 | "Release from the right hand" 1097 | ], 1098 | "gt_time": [ 1099 | [ 1100 | 0, 1101 | 10 1102 | ], 1103 | [ 1104 | 11, 1105 | 38 1106 | ], 1107 | [ 1108 | 39, 1109 | 48 1110 | ] 1111 | ], 1112 | "video_path": "original_videos\\subject_12_gopro_seg_2_14316-14364.mp4" 1113 | }, 1114 | { 1115 | "action": [ 1116 | "Grasp with the right hand", 1117 | "Rotary motion with the right hand until it cannot be rotated", 1118 | "Release from the right hand" 1119 | ], 1120 | "gt_time": [ 1121 | [ 1122 | 0, 1123 | 8 1124 | ], 1125 | [ 1126 | 9, 1127 | 14 1128 | ], 1129 | [ 1130 | 15, 1131 | 25 1132 | ] 1133 | ], 1134 | "video_path": 
"original_videos\\subject_13_gopro_seg_1_14430-14455.mp4" 1135 | }, 1136 | { 1137 | "action": [ 1138 | "Grasp with the right hand", 1139 | "Picking with the right hand", 1140 | "Release from the right hand" 1141 | ], 1142 | "gt_time": [ 1143 | [ 1144 | 0, 1145 | 34 1146 | ], 1147 | [ 1148 | 35, 1149 | 43 1150 | ], 1151 | [ 1152 | 44, 1153 | 101 1154 | ] 1155 | ], 1156 | "video_path": "original_videos\\subject_13_gopro_seg_1_14497-14598.mp4" 1157 | }, 1158 | { 1159 | "action": [ 1160 | "Grasp with the right hand", 1161 | "Putting on with the right hand", 1162 | "Release from the right hand" 1163 | ], 1164 | "gt_time": [ 1165 | [ 1166 | 0, 1167 | 10 1168 | ], 1169 | [ 1170 | 11, 1171 | 43 1172 | ], 1173 | [ 1174 | 44, 1175 | 57 1176 | ] 1177 | ], 1178 | "video_path": "original_videos\\subject_13_gopro_seg_1_15750-15807.mp4" 1179 | }, 1180 | { 1181 | "action": [ 1182 | "Grasp with the right hand", 1183 | "Picking with the right hand", 1184 | "Holding with the right hand", 1185 | "Putting on with the right hand", 1186 | "Release from the right hand" 1187 | ], 1188 | "gt_time": [ 1189 | [ 1190 | 0, 1191 | 17 1192 | ], 1193 | [ 1194 | 18, 1195 | 62 1196 | ], 1197 | [ 1198 | 63, 1199 | 88 1200 | ], 1201 | [ 1202 | 89, 1203 | 108 1204 | ], 1205 | [ 1206 | 109, 1207 | 121 1208 | ] 1209 | ], 1210 | "video_path": "original_videos\\subject_2_d_gopro_seg_1_2352-2473.mp4" 1211 | }, 1212 | { 1213 | "action": [ 1214 | "Grasp with the right hand", 1215 | "Picking with the right hand", 1216 | "Bringing with the right hand", 1217 | "Putting on with the right hand", 1218 | "Release from the right hand" 1219 | ], 1220 | "gt_time": [ 1221 | [ 1222 | 0, 1223 | 16 1224 | ], 1225 | [ 1226 | 17, 1227 | 54 1228 | ], 1229 | [ 1230 | 55, 1231 | 383 1232 | ], 1233 | [ 1234 | 384, 1235 | 424 1236 | ], 1237 | [ 1238 | 425, 1239 | 433 1240 | ] 1241 | ], 1242 | "video_path": "original_videos\\subject_2_d_gopro_seg_1_2474-2907.mp4" 1243 | }, 1244 | { 1245 | "action": [ 1246 | "Grasp with the right hand", 1247 | "Picking with the right hand", 1248 | "Holding with the right hand", 1249 | "Striking something with the right hand or an object held in the right hand.", 1250 | "Cracking an egg with the right hand", 1251 | "Pouring with the right hand", 1252 | "Holding with the right hand", 1253 | "Bringing with the right hand", 1254 | "Putting on with the right hand", 1255 | "Release from the right hand" 1256 | ], 1257 | "gt_time": [ 1258 | [ 1259 | 0, 1260 | 40 1261 | ], 1262 | [ 1263 | 41, 1264 | 63 1265 | ], 1266 | [ 1267 | 64, 1268 | 98 1269 | ], 1270 | [ 1271 | 99, 1272 | 139 1273 | ], 1274 | [ 1275 | 140, 1276 | 158 1277 | ], 1278 | [ 1279 | 159, 1280 | 221 1281 | ], 1282 | [ 1283 | 222, 1284 | 239 1285 | ], 1286 | [ 1287 | 240, 1288 | 272 1289 | ], 1290 | [ 1291 | 273, 1292 | 281 1293 | ], 1294 | [ 1295 | 282, 1296 | 291 1297 | ] 1298 | ], 1299 | "video_path": "original_videos\\subject_2_d_gopro_seg_1_3258-3549.mp4" 1300 | }, 1301 | { 1302 | "action": [ 1303 | "Grasp with the right hand", 1304 | "Picking with the right hand", 1305 | "Bringing with the right hand", 1306 | "Pouring with the right hand", 1307 | "Holding with the right hand", 1308 | "Pouring with the right hand", 1309 | "Bringing with the right hand", 1310 | "Putting on with the right hand", 1311 | "Release from the right hand" 1312 | ], 1313 | "gt_time": [ 1314 | [ 1315 | 0, 1316 | 52 1317 | ], 1318 | [ 1319 | 53, 1320 | 78 1321 | ], 1322 | [ 1323 | 79, 1324 | 168 1325 | ], 1326 | [ 1327 | 169, 1328 | 234 1329 | ], 1330 | [ 1331 | 235, 1332 | 386 1333 | ], 
1334 | [ 1335 | 387, 1336 | 421 1337 | ], 1338 | [ 1339 | 422, 1340 | 454 1341 | ], 1342 | [ 1343 | 455, 1344 | 497 1345 | ], 1346 | [ 1347 | 498, 1348 | 507 1349 | ] 1350 | ], 1351 | "video_path": "original_videos\\subject_2_d_gopro_seg_2_3852-4359.mp4" 1352 | }, 1353 | { 1354 | "action": [ 1355 | "Hand over from the left hand to the right hand", 1356 | "Putting on with the right hand", 1357 | "Release from the right hand" 1358 | ], 1359 | "gt_time": [ 1360 | [ 1361 | 0, 1362 | 20 1363 | ], 1364 | [ 1365 | 21, 1366 | 41 1367 | ], 1368 | [ 1369 | 42, 1370 | 52 1371 | ] 1372 | ], 1373 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_14504-14556.mp4" 1374 | }, 1375 | { 1376 | "action": [ 1377 | "Grasp with the right hand", 1378 | "Holding with the right hand", 1379 | "Release from the right hand" 1380 | ], 1381 | "gt_time": [ 1382 | [ 1383 | 0, 1384 | 12 1385 | ], 1386 | [ 1387 | 13, 1388 | 29 1389 | ], 1390 | [ 1391 | 30, 1392 | 37 1393 | ] 1394 | ], 1395 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_14557-14594.mp4" 1396 | }, 1397 | { 1398 | "action": [ 1399 | "Grasp with the right hand", 1400 | "Release from the right hand" 1401 | ], 1402 | "gt_time": [ 1403 | [ 1404 | 0, 1405 | 16 1406 | ], 1407 | [ 1408 | 17, 1409 | 28 1410 | ] 1411 | ], 1412 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_14635-14663.mp4" 1413 | }, 1414 | { 1415 | "action": [ 1416 | "Grasp with the right hand", 1417 | "Holding with the right hand", 1418 | "Release from the right hand" 1419 | ], 1420 | "gt_time": [ 1421 | [ 1422 | 0, 1423 | 12 1424 | ], 1425 | [ 1426 | 13, 1427 | 27 1428 | ], 1429 | [ 1430 | 28, 1431 | 33 1432 | ] 1433 | ], 1434 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_14666-14699.mp4" 1435 | }, 1436 | { 1437 | "action": [ 1438 | "Grasp with the right hand", 1439 | "Putting on with the right hand", 1440 | "Release from the right hand" 1441 | ], 1442 | "gt_time": [ 1443 | [ 1444 | 0, 1445 | 4 1446 | ], 1447 | [ 1448 | 5, 1449 | 27 1450 | ], 1451 | [ 1452 | 28, 1453 | 38 1454 | ] 1455 | ], 1456 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_14737-14775.mp4" 1457 | }, 1458 | { 1459 | "action": [ 1460 | "Grasp with the right hand", 1461 | "Picking with the right hand", 1462 | "Release from the right hand" 1463 | ], 1464 | "gt_time": [ 1465 | [ 1466 | 0, 1467 | 10 1468 | ], 1469 | [ 1470 | 11, 1471 | 29 1472 | ], 1473 | [ 1474 | 30, 1475 | 43 1476 | ] 1477 | ], 1478 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_14853-14896.mp4" 1479 | }, 1480 | { 1481 | "action": [ 1482 | "Grasp with the right hand", 1483 | "Picking with the right hand", 1484 | "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.", 1485 | "Striking something with the right hand or an object held in the right hand.", 1486 | "Bringing with the right hand", 1487 | "Putting on with the right hand", 1488 | "Release from the right hand" 1489 | ], 1490 | "gt_time": [ 1491 | [ 1492 | 0, 1493 | 24 1494 | ], 1495 | [ 1496 | 25, 1497 | 39 1498 | ], 1499 | [ 1500 | 40, 1501 | 296 1502 | ], 1503 | [ 1504 | 297, 1505 | 329 1506 | ], 1507 | [ 1508 | 330, 1509 | 346 1510 | ], 1511 | [ 1512 | 347, 1513 | 347 1514 | ], 1515 | [ 1516 | 348, 1517 | 360 1518 | ] 1519 | ], 1520 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_14901-15261.mp4" 1521 | }, 1522 | { 1523 | "action": [ 1524 | "Grasp with the right hand", 1525 | "Putting on with the right hand", 1526 | "Release from the right hand" 1527 | ], 1528 | "gt_time": [ 1529 | 
[ 1530 | 0, 1531 | 7 1532 | ], 1533 | [ 1534 | 8, 1535 | 28 1536 | ], 1537 | [ 1538 | 29, 1539 | 39 1540 | ] 1541 | ], 1542 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15262-15301.mp4" 1543 | }, 1544 | { 1545 | "action": [ 1546 | "Hand over from the left hand to the right hand", 1547 | "Putting on with the right hand", 1548 | "Release from the right hand" 1549 | ], 1550 | "gt_time": [ 1551 | [ 1552 | 0, 1553 | 20 1554 | ], 1555 | [ 1556 | 21, 1557 | 36 1558 | ], 1559 | [ 1560 | 37, 1561 | 44 1562 | ] 1563 | ], 1564 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15395-15439.mp4" 1565 | }, 1566 | { 1567 | "action": [ 1568 | "Grasp with the right hand", 1569 | "Pressing a button with the right hand", 1570 | "Release from the right hand" 1571 | ], 1572 | "gt_time": [ 1573 | [ 1574 | 0, 1575 | 9 1576 | ], 1577 | [ 1578 | 10, 1579 | 41 1580 | ], 1581 | [ 1582 | 42, 1583 | 49 1584 | ] 1585 | ], 1586 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15440-15489.mp4" 1587 | }, 1588 | { 1589 | "action": [ 1590 | "Grasp with the right hand", 1591 | "Picking with the right hand", 1592 | "Bringing with the right hand", 1593 | "Release from the right hand" 1594 | ], 1595 | "gt_time": [ 1596 | [ 1597 | 0, 1598 | 18 1599 | ], 1600 | [ 1601 | 19, 1602 | 33 1603 | ], 1604 | [ 1605 | 34, 1606 | 51 1607 | ], 1608 | [ 1609 | 52, 1610 | 59 1611 | ] 1612 | ], 1613 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15574-15633.mp4" 1614 | }, 1615 | { 1616 | "action": [ 1617 | "Grasp with the right hand", 1618 | "Starting rotary motion with the right hand while it is restrained.", 1619 | "Release from the right hand" 1620 | ], 1621 | "gt_time": [ 1622 | [ 1623 | 0, 1624 | 4 1625 | ], 1626 | [ 1627 | 5, 1628 | 9 1629 | ], 1630 | [ 1631 | 10, 1632 | 17 1633 | ] 1634 | ], 1635 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15634-15651.mp4" 1636 | }, 1637 | { 1638 | "action": [ 1639 | "Grasp with the right hand", 1640 | "Tracing the surface of something with the right hand or an object held in the right hand in a linear manner.", 1641 | "Release from the right hand" 1642 | ], 1643 | "gt_time": [ 1644 | [ 1645 | 0, 1646 | 8 1647 | ], 1648 | [ 1649 | 9, 1650 | 77 1651 | ], 1652 | [ 1653 | 78, 1654 | 90 1655 | ] 1656 | ], 1657 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15652-15742.mp4" 1658 | }, 1659 | { 1660 | "action": [ 1661 | "Grasp with the right hand", 1662 | "Picking with the right hand", 1663 | "Tracing the surface of something with the right hand or an object held in the right hand in a linear manner.", 1664 | "Release from the right hand" 1665 | ], 1666 | "gt_time": [ 1667 | [ 1668 | 0, 1669 | 4 1670 | ], 1671 | [ 1672 | 5, 1673 | 24 1674 | ], 1675 | [ 1676 | 25, 1677 | 57 1678 | ], 1679 | [ 1680 | 58, 1681 | 69 1682 | ] 1683 | ], 1684 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15743-15812.mp4" 1685 | }, 1686 | { 1687 | "action": [ 1688 | "Grasp with the right hand", 1689 | "Tracing the surface of something with the right hand or an object held in the right hand in a linear manner.", 1690 | "Release from the right hand" 1691 | ], 1692 | "gt_time": [ 1693 | [ 1694 | 0, 1695 | 6 1696 | ], 1697 | [ 1698 | 7, 1699 | 38 1700 | ], 1701 | [ 1702 | 39, 1703 | 40 1704 | ] 1705 | ], 1706 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15813-15853.mp4" 1707 | }, 1708 | { 1709 | "action": [ 1710 | "Grasp with the right hand", 1711 | "Rotary motion with the right hand until it cannot be rotated", 1712 | "Release from the right hand" 
    ],
    "gt_time": [[0, 12], [13, 18], [19, 25]],
    "video_path": "original_videos\\subject_3_o_gopro_seg_1_15854-15879.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Starting rotary motion with the right hand while it is restrained.", "Release from the right hand"],
    "gt_time": [[0, 2], [3, 7], [8, 15]],
    "video_path": "original_videos\\subject_3_o_gopro_seg_2_14670-14685.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Tracing the surface of something with the right hand or an object held in the right hand in a linear manner.", "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 4], [5, 19], [20, 164], [165, 175], [176, 186]],
    "video_path": "original_videos\\subject_3_o_gopro_seg_2_15622-15808.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Starting rotary motion with the right hand while it is restrained.", "Release from the right hand"],
    "gt_time": [[0, 6], [7, 15], [16, 25]],
    "video_path": "original_videos\\subject_3_o_gopro_seg_2_15809-15834.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Rotary motion with the right hand until it cannot be rotated", "Release from the right hand"],
    "gt_time": [[0, 6], [7, 15], [16, 21]],
    "video_path": "original_videos\\subject_3_o_gopro_seg_2_15972-15993.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Bringing with the right hand", "Linear motion with the right hand until it cannot be moved", "Release from the right hand"],
    "gt_time": [[0, 66], [67, 76], [77, 88], [89, 99]],
    "video_path": "original_videos\\subject_3_o_gopro_seg_2_16025-16124.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Rotary motion with the right hand until it cannot be rotated", "Release from the right hand"],
    "gt_time": [[0, 5], [6, 16], [17, 26]],
    "video_path": "original_videos\\subject_4_gopro_seg_1_7675-7701.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Starting rotary motion with the right hand while it is restrained.", "Release from the right hand"],
    "gt_time": [[0, 16], [17, 41], [42, 51]],
    "video_path": "original_videos\\subject_4_gopro_seg_2_16670-16721.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Tracing the surface of something with the right hand or an object held in the right hand in a linear manner.", "Striking something with the right hand or an object held in the right hand.", "Bringing with the right hand", "Linear motion with the right hand until it cannot be moved", "Release from the right hand"],
    "gt_time": [[0, 10], [11, 152], [153, 189], [190, 211], [212, 234], [235, 247]],
    "video_path": "original_videos\\subject_4_gopro_seg_2_17147-17394.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Release from the right hand"],
    "gt_time": [[0, 16], [17, 48], [49, 60]],
    "video_path": "original_videos\\subject_4_gopro_seg_2_17398-17458.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.", "Holding with the right hand", "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.", "Holding with the right hand", "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.", "Release from the right hand"],
    "gt_time": [[0, 11], [12, 62], [63, 107], [108, 165], [166, 205], [206, 216], [217, 224]],
    "video_path": "original_videos\\subject_4_gopro_seg_2_17460-17684.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Rotary motion with the right hand until it cannot be rotated", "Release from the right hand"],
    "gt_time": [[0, 8], [9, 19], [20, 22]],
    "video_path": "original_videos\\subject_4_gopro_seg_2_17722-17744.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Striking something with the right hand or an object held in the right hand.", "Bringing with the right hand", "Moving an object held in the right hand in and out of a narrow space.", "Release from the right hand"],
    "gt_time": [[0, 9], [10, 26], [27, 111], [112, 126], [127, 136]],
    "video_path": "original_videos\\subject_4_gopro_seg_2_17745-17881.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Starting rotary motion with the right hand while it is restrained.", "Release from the right hand"],
    "gt_time": [[0, 15], [16, 34], [35, 44]],
    "video_path": "original_videos\\subject_4_gopro_seg_2_17955-17999.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Cracking an egg with the right hand", "Bringing with the right hand", "Release from the right hand"],
    "gt_time": [[0, 16], [17, 148], [149, 196], [197, 206]],
    "video_path": "original_videos\\subject_4_gopro_seg_2_7354-7560.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Holding with the right hand", "Bringing with the right hand", "Striking something with the right hand or an object held in the right hand.", "Cracking an egg with the right hand", "Holding with the right hand", "Striking something with the right hand or an object held in the right hand.", "Bringing with the right hand", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 14], [15, 27], [28, 112], [113, 160], [161, 206], [207, 278], [279, 562], [563, 633], [634, 645], [646, 671], [672, 681]],
    "video_path": "original_videos\\subject_4_gopro_seg_2_8051-8732.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Cracking an egg with the right hand", "Bringing with the right hand", "Release from the right hand"],
    "gt_time": [[0, 10], [11, 110], [111, 134], [135, 145]],
    "video_path": "original_videos\\subject_4_gopro_seg_2_8736-8881.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Moving an object held in the right hand in and out of a narrow space.", "Bringing with the right hand", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 109], [110, 130], [131, 184], [185, 198], [199, 209]],
    "video_path": "original_videos\\subject_5_gopro_seg_1_7243-7452.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Pouring with the right hand", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 33], [34, 75], [76, 220], [221, 279], [280, 305]],
    "video_path": "original_videos\\subject_5_gopro_seg_2_14524-14829.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Bringing with the right hand", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 32], [33, 328], [329, 351], [352, 362]],
    "video_path": "original_videos\\subject_5_gopro_seg_2_14832-15194.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Bringing with the right hand", "Release from the right hand"],
    "gt_time": [[0, 13], [14, 26], [27, 41]],
    "video_path": "original_videos\\subject_5_gopro_seg_2_2333-2374.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Striking something with the right hand or an object held in the right hand.", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 10], [11, 54], [55, 66], [67, 80]],
    "video_path": "original_videos\\subject_5_gopro_seg_2_2634-2714.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 16], [17, 36], [37, 55], [56, 64]],
    "video_path": "original_videos\\subject_5_gopro_seg_2_2744-2808.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Starting rotary motion with the right hand while it is restrained.", "Release from the right hand"],
    "gt_time": [[0, 8], [9, 51], [52, 63]],
    "video_path": "original_videos\\subject_5_gopro_seg_2_2809-2872.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Holding with the right hand", "Release from the right hand"],
    "gt_time": [[0, 8], [9, 61], [62, 72]],
    "video_path": "original_videos\\subject_5_gopro_seg_2_2873-2945.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Striking something with the right hand or an object held in the right hand.", "Release from the right hand"],
    "gt_time": [[0, 10], [11, 29], [30, 42]],
    "video_path": "original_videos\\subject_5_gopro_seg_2_3091-3133.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Bringing with the right hand", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 20], [21, 41], [42, 152], [153, 163], [164, 175]],
    "video_path": "original_videos\\subject_5_gopro_seg_2_3400-3575.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Bringing with the right hand", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 24], [25, 52], [53, 70], [71, 85], [86, 106]],
    "video_path": "original_videos\\subject_6_gopro_seg_1_108-214.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Bringing with the right hand", "Release from the right hand"],
    "gt_time": [[0, 10], [11, 28], [29, 56], [57, 67]],
    "video_path": "original_videos\\subject_6_gopro_seg_1_216-283.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Widening a bag with the right hand", "Release from the right hand"],
    "gt_time": [[0, 6], [7, 28], [29, 34]],
    "video_path": "original_videos\\subject_6_gopro_seg_1_818-852.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Bringing with the right hand", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 35], [36, 75], [76, 115], [116, 148], [149, 163]],
    "video_path": "original_videos\\subject_6_gopro_seg_1_1230-1393.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Holding with the right hand", "Release from the right hand"],
    "gt_time": [[0, 22], [23, 89], [90, 101]],
    "video_path": "original_videos\\subject_6_gopro_seg_1_11201-11302.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 18], [19, 63], [64, 71]],
    "video_path": "original_videos\\subject_6_gopro_seg_1_11303-11374.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Starting rotary motion with the right hand while it is restrained.", "Release from the right hand"],
    "gt_time": [[0, 20], [21, 41], [42, 53]],
    "video_path": "original_videos\\subject_6_gopro_seg_1_11853-11906.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Rotary motion with the right hand until it cannot be rotated", "Release from the right hand"],
    "gt_time": [[0, 14], [15, 21], [22, 32]],
    "video_path": "original_videos\\subject_6_gopro_seg_1_12107-12139.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Starting rotary motion with the right hand while it is restrained.", "Release from the right hand"],
    "gt_time": [[0, 8], [9, 46], [47, 56]],
    "video_path": "original_videos\\subject_6_gopro_seg_1_12471-12527.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Widening a bag with the right hand", "Release from the right hand"],
    "gt_time": [[0, 10], [11, 29], [30, 40]],
    "video_path": "original_videos\\subject_6_gopro_seg_2_3665-3705.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Starting rotary motion with the right hand while it is restrained.", "Release from the right hand"],
    "gt_time": [[0, 99], [100, 141], [142, 150]],
    "video_path": "original_videos\\subject_6_gopro_seg_2_3793-3943.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 59], [60, 71], [72, 94], [95, 104]],
    "video_path": "original_videos\\subject_6_gopro_seg_2_4258-4362.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 10], [11, 33], [34, 414], [415, 451], [452, 462]],
    "video_path": "original_videos\\subject_6_gopro_seg_2_4367-4829.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Holding with the right hand", "Pouring with the right hand", "Holding with the right hand", "Pouring with the right hand", "Holding with the right hand", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 19], [20, 55], [56, 81], [82, 130], [131, 140], [141, 164], [165, 206], [207, 234], [235, 246]],
    "video_path": "original_videos\\subject_6_gopro_seg_2_4834-5080.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Bringing with the right hand", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 24], [25, 51], [52, 248], [249, 266], [267, 279]],
    "video_path": "original_videos\\subject_7_gopro_seg_1_16498-16777.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Rotary motion with the right hand until it cannot be rotated", "Release from the right hand"],
    "gt_time": [[0, 62], [63, 284], [285, 287]],
    "video_path": "original_videos\\subject_7_gopro_seg_1_17070-17357.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Starting rotary motion with the right hand while it is restrained.", "Release from the right hand"],
    "gt_time": [[0, 10], [11, 36], [37, 49]],
    "video_path": "original_videos\\subject_7_gopro_seg_1_17625-17674.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Release from the right hand"],
    "gt_time": [[0, 10], [11, 24], [25, 36]],
    "video_path": "original_videos\\subject_7_gopro_seg_2_12265-12301.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Cracking an egg with the right hand", "Pouring with the right hand", "Striking something with the right hand or an object held in the right hand.", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 10], [11, 50], [51, 153], [154, 204], [205, 252], [253, 268]],
    "video_path": "original_videos\\subject_8_gopro_seg_1_3685-3953.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 17], [18, 41], [42, 60], [61, 69]],
    "video_path": "original_videos\\subject_8_gopro_seg_1_5019-5088.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 0], [1, 29], [30, 84], [85, 95]],
    "video_path": "original_videos\\subject_8_gopro_seg_2_9473-9568.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Putting on with the right hand", "Release from the right hand"],
    "gt_time": [[0, 22], [23, 45], [46, 149], [150, 159]],
    "video_path": "original_videos\\subject_8_gopro_seg_2_9577-9736.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Release from the right hand"],
    "gt_time": [[0, 16], [17, 29], [30, 40]],
    "video_path": "original_videos\\subject_8_gopro_seg_2_10056-10096.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Release from the right hand"],
    "gt_time": [[0, 29], [30, 43], [44, 54]],
    "video_path": "original_videos\\subject_8_gopro_seg_2_10097-10151.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.", "Picking with the right hand", "Release from the right hand"],
    "gt_time": [[0, 10], [11, 397], [398, 431], [432, 458]],
    "video_path": "original_videos\\subject_8_gopro_seg_2_10248-10706.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Release from the right hand"],
    "gt_time": [[0, 26], [27, 36]],
    "video_path": "original_videos\\subject_9_gopro_seg_1_1884-1920.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Picking with the right hand", "Bringing with the right hand", "Pouring with the right hand", "Putting on with the right hand", "Rotary motion with the right hand until it cannot be rotated", "Release from the right hand"],
    "gt_time": [[0, 15], [16, 28], [29, 50], [51, 185], [186, 209], [210, 233], [234, 251]],
    "video_path": "original_videos\\subject_9_gopro_seg_1_2324-2575.mp4"
  },
  {
    "action": ["Grasp with the right hand", "Release from the right hand"],
    "gt_time": [[0, 12], [13, 28]],
    "video_path": "original_videos\\subject_9_gopro_seg_1_3469-3497.mp4"
  }
]
--------------------------------------------------------------------------------
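Note on the schema: each record in `label_data_gt_right.json` pairs an ordered list of `action` labels with one inclusive `[start, end]` pair per label in `gt_time`, plus a `video_path` to the clipped video. The bounds appear to be frame indices relative to the start of the clip (in every record, the final end value matches the frame span encoded in the clip filename, e.g. `15854-15879` spans 25 frames and the last segment ends at 25), and consecutive segments tile the clip with no gaps. Below is a minimal sketch, assuming Python 3, of how one might load and sanity-check the file; `load_gt` is a hypothetical helper, not part of this repository.

```python
import json

def load_gt(path="label_data_gt_right.json"):
    """Load the ground-truth records and sanity-check their schema."""
    with open(path, encoding="utf-8") as f:
        records = json.load(f)
    for rec in records:
        # One [start, end] frame pair per action label.
        assert len(rec["action"]) == len(rec["gt_time"]), rec["video_path"]
        # Segments tile the clip: each starts one frame after the previous ends.
        for (_, prev_end), (start, _) in zip(rec["gt_time"], rec["gt_time"][1:]):
            assert start == prev_end + 1, rec["video_path"]
        # Paths are stored with Windows separators; normalise for POSIX use.
        rec["video_path"] = rec["video_path"].replace("\\", "/")
    return records

if __name__ == "__main__":
    records = load_gt()
    print(f"{len(records)} annotated clips")
    first = records[0]
    for label, (start, end) in zip(first["action"], first["gt_time"]):
        print(f"  frames {start:>4}-{end:<4} {label}")
```

Records loaded this way can be compared segment-by-segment against predicted intervals, e.g. when computing MoF/IoU/F1 with `compute_mof_iou_f1.py`.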