├── finegrained-breakfast-dataset
│   ├── .gitignore
│   ├── original_videos
│   │   └── original_videos.txt
│   ├── clip_original_videos.py
│   ├── README.md
│   ├── compute_mof_iou_f1.py
│   └── label_data_gt_right.json
├── requirements.txt
├── src
│   └── pipeline.jpg
├── auth.env
├── docs
│   ├── src
│   │   ├── arxiv.png
│   │   ├── table.jpg
│   │   ├── pipeline.jpg
│   │   ├── github-mark.png
│   │   ├── top-level-schema.jpg
│   │   └── qualitative_results.jpg
│   └── index.html
├── sample_video
│   └── sample.mp4
├── results
│   ├── Grasping_the_can
│   │   ├── grid_image_sample.png
│   │   └── Grasping_the_can._segment_0.5_1.4.mp4
│   ├── Moving_the_can_upwards
│   │   ├── grid_image_sample.png
│   │   └── Moving_the_can_upwards_segment_2.1_4.9.mp4
│   └── Releasing_the_can_placed_on_the_shelf
│       ├── grid_image_sample.png
│       └── Releasing_the_can_placed_on_the_shelf_segment_4.5_4.9.mp4
├── CODE_OF_CONDUCT.md
├── LICENSE
├── SUPPORT.md
├── breakfast-dataset
│   ├── README.md
│   └── compute_mof_iou_f1.py
├── thumos14-dataset
│   └── README.md
├── SECURITY.md
├── README.md
├── .gitignore
└── example.py
/finegrained-breakfast-dataset/.gitignore:
--------------------------------------------------------------------------------
1 | out/
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | openai
2 | opencv-python
3 |
--------------------------------------------------------------------------------
/src/pipeline.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/src/pipeline.jpg
--------------------------------------------------------------------------------
/auth.env:
--------------------------------------------------------------------------------
1 | AZURE_OPENAI_ENDPOINT=
2 | AZURE_OPENAI_API_KEY=
3 | AZURE_OPENAI_DEPLOYMENT_NAME=
4 | OPENAI_API_KEY=
--------------------------------------------------------------------------------
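The repository's `example.py` (not included in this dump) consumes these credentials. As a rough sketch of how they could be loaded and turned into an API client (the parsing helper, the `api_version` string, and the default model name below are assumptions, not the repository's actual logic):

```python
import os

def load_env(path="auth.env"):
    """Read simple KEY=VALUE lines into os.environ, skipping blank values."""
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition("=")
            if key and value:
                os.environ[key] = value

load_env()

if os.environ.get("AZURE_OPENAI_API_KEY"):
    from openai import AzureOpenAI
    client = AzureOpenAI(
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
        api_key=os.environ["AZURE_OPENAI_API_KEY"],
        api_version="2024-02-01",  # assumed; use a version your deployment supports
    )
    model = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"]  # deployment name acts as the model id
else:
    from openai import OpenAI
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    model = "gpt-4o"  # hypothetical default; example.py may choose differently
```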
/docs/src/arxiv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/docs/src/arxiv.png
--------------------------------------------------------------------------------
/docs/src/table.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/docs/src/table.jpg
--------------------------------------------------------------------------------
/docs/src/pipeline.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/docs/src/pipeline.jpg
--------------------------------------------------------------------------------
/docs/src/github-mark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/docs/src/github-mark.png
--------------------------------------------------------------------------------
/sample_video/sample.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/sample_video/sample.mp4
--------------------------------------------------------------------------------
/docs/src/top-level-schema.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/docs/src/top-level-schema.jpg
--------------------------------------------------------------------------------
/docs/src/qualitative_results.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/docs/src/qualitative_results.jpg
--------------------------------------------------------------------------------
/results/Grasping_the_can/grid_image_sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/results/Grasping_the_can/grid_image_sample.png
--------------------------------------------------------------------------------
/results/Moving_the_can_upwards/grid_image_sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/results/Moving_the_can_upwards/grid_image_sample.png
--------------------------------------------------------------------------------
/results/Grasping_the_can/Grasping_the_can._segment_0.5_1.4.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/results/Grasping_the_can/Grasping_the_can._segment_0.5_1.4.mp4
--------------------------------------------------------------------------------
/results/Releasing_the_can_placed_on_the_shelf/grid_image_sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/results/Releasing_the_can_placed_on_the_shelf/grid_image_sample.png
--------------------------------------------------------------------------------
/results/Moving_the_can_upwards/Moving_the_can_upwards_segment_2.1_4.9.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/results/Moving_the_can_upwards/Moving_the_can_upwards_segment_2.1_4.9.mp4
--------------------------------------------------------------------------------
/results/Releasing_the_can_placed_on_the_shelf/Releasing_the_can_placed_on_the_shelf_segment_4.5_4.9.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/VLM-Video-Action-Localization/HEAD/results/Releasing_the_can_placed_on_the_shelf/Releasing_the_can_placed_on_the_shelf_segment_4.5_4.9.mp4
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 |
--------------------------------------------------------------------------------
/finegrained-breakfast-dataset/original_videos/original_videos.txt:
--------------------------------------------------------------------------------
1 | subject_10_gopro_seg_1.mp4
2 | subject_10_gopro_seg_2.mp4
3 | subject_11_gopro_seg_1.mp4
4 | subject_11_gopro_seg_2.mp4
5 | subject_12_gopro_seg_1.mp4
6 | subject_12_gopro_seg_2.mp4
7 | subject_13_gopro_seg_1.mp4
8 | subject_1_gopro_seg_1.mp4
9 | subject_1_gopro_seg_2.mp4
10 | subject_2_d_gopro_seg_1.mp4
11 | subject_2_d_gopro_seg_2.mp4
12 | subject_3_o_gopro_seg_1.mp4
13 | subject_3_o_gopro_seg_2.mp4
14 | subject_4_gopro_seg_1.mp4
15 | subject_4_gopro_seg_2.mp4
16 | subject_5_gopro_seg_1.mp4
17 | subject_5_gopro_seg_2.mp4
18 | subject_6_gopro_seg_1.mp4
19 | subject_6_gopro_seg_2.mp4
20 | subject_7_gopro_seg_1.mp4
21 | subject_7_gopro_seg_2.mp4
22 | subject_8_gopro_seg_1.mp4
23 | subject_8_gopro_seg_2.mp4
24 | subject_9_gopro_seg_1.mp4
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
1 | # TODO: The maintainer of this repo has not yet edited this file
2 |
3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
4 |
5 | - **No CSS support:** Fill out this template with information about how to file issues and get help.
6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
8 |
9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10 |
11 | # Support
12 |
13 | ## How to file issues and get help
14 |
15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing
16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or
17 | feature request as a new Issue.
18 |
19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22 |
23 | ## Microsoft Support Policy
24 |
25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
26 |
--------------------------------------------------------------------------------
/breakfast-dataset/README.md:
--------------------------------------------------------------------------------
1 | # Breakfast dataset
2 |
3 | This folder provides resources for evaluating action label predictions on videos from the Breakfast dataset. It includes ground-truth annotations and an evaluation script.
4 |
5 | This dataset is provided as supplementary material for the paper:
6 |
7 | > **Open-vocabulary action localization with iterative visual prompting**
8 | > *Naoki Wake, Atsushi Kanehira, Kazuhiro Sasabuchi, Jun Takamatsu, Katsushi Ikeuchi (2025), [IEEE Access, 13, 56908-56917](https://ieeexplore.ieee.org/abstract/document/10942370)*
9 | >
10 | > ```bibtex
11 | >@article{wake2025open,
12 | > author={Wake, Naoki and Kanehira, Atsushi and Sasabuchi, Kazuhiro and Takamatsu, Jun and Ikeuchi, Katsushi},
13 | > journal={IEEE Access},
14 | > title={Open-vocabulary action localization with iterative visual prompting},
15 | > year={2025},
16 | > volume={13},
17 | > number={},
18 | > pages={56908--56917},
19 | > doi={10.1109/ACCESS.2025.3555167}}
20 | > ```
21 |
22 | The original data is derived from the paper below:
23 |
24 | > **The language of actions: Recovering the syntax and semantics of goal-directed human activities**
25 | > *Hilde Kuehne, Ali Arslan, and Thomas Serre (2014), CVPR, 780--787*
26 | >
27 | > ```bibtex
28 | >@inproceedings{kuehne2014language,
29 | > title={The language of actions: Recovering the syntax and semantics of goal-directed human activities},
30 | > author={Kuehne, Hilde and Arslan, Ali and Serre, Thomas},
31 | > booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
32 | > pages={780--787},
33 | > year={2014}
34 | >}
35 | > ```
36 |
37 | ## Directory and File Structure
38 |
39 | - **label_data_gt_breakfast.json**
40 |   This JSON file holds the ground-truth annotations for the videos. Each entry in the JSON contains the following fields (an illustrative example entry follows this file):
41 | - **action**: A sequence of action labels that occur in the video.
42 | - **gt_time**: The frame index annotations corresponding to each action label (FPS=15.0).
43 | - **video_path**: The relative path to the corresponding video file.
44 |
45 | - **label_data_estimate_baseline_breakfast.json**
46 |   This is an example file that contains estimated action labels. It is used as input to the evaluation script.
47 |
48 | - **compute_mof_iou_f1.py**
49 | This evaluation script computes performance metrics (e.g., MOF, IoU, and F1 score) by comparing predicted action labels with the ground truth.
50 | ```bash
51 |   python compute_mof_iou_f1.py --file label_data_estimate_baseline_breakfast.json
52 | ```
--------------------------------------------------------------------------------
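For reference, a single entry in `label_data_gt_breakfast.json` would have the shape sketched below, following the field descriptions above (the action names, frame ranges, and file path are illustrative, not taken from the actual annotations):

```json
[
  {
    "action": ["pour milk", "stir milk"],
    "gt_time": [[1, 140], [141, 300]],
    "video_path": "videos/P03_cam01_milk.avi"
  }
]
```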
/thumos14-dataset/README.md:
--------------------------------------------------------------------------------
1 | # THUMOS14 dataset
2 | 
3 | This folder provides resources for evaluating action label predictions on videos from the THUMOS14 dataset. Most of the necessary code, including the evaluation script and ground truth labels, needs to be downloaded from the official [THUMOS14 page](https://www.crcv.ucf.edu/THUMOS14/).
4 |
5 | This dataset is provided as supplementary material for the paper:
6 |
7 | > **Open-vocabulary action localization with iterative visual prompting**
8 | > *Naoki Wake, Atsushi Kanehira, Kazuhiro Sasabuchi, Jun Takamatsu, Katsushi Ikeuchi (2025), [IEEE Access, 13, 56908-56917](https://ieeexplore.ieee.org/abstract/document/10942370)*
9 | >
10 | > ```bibtex
11 | >@article{wake2025open,
12 | > author={Wake, Naoki and Kanehira, Atsushi and Sasabuchi, Kazuhiro and Takamatsu, Jun and Ikeuchi, Katsushi},
13 | > journal={IEEE Access},
14 | > title={Open-vocabulary action localization with iterative visual prompting},
15 | > year={2025},
16 | > volume={13},
17 | > number={},
18 | > pages={56908--56917},
19 | > doi={10.1109/ACCESS.2025.3555167}}
20 | > ```
21 |
22 | The following is the citation for the THUMOS challenge, taken from the official [THUMOS14 page](https://www.crcv.ucf.edu/THUMOS14/):
23 | > ```bibtex
24 | >@misc{THUMOS14,
25 | > author = "Jiang, Y.-G. and Liu, J. and Roshan Zamir, A. and Toderici, G. and Laptev, I. and Shah, M. and Sukthankar, R.",
26 | > title = "{THUMOS} Challenge: Action Recognition with a Large Number of Classes",
27 | > howpublished = "\url{http://crcv.ucf.edu/THUMOS14/}",
28 | > Year = {2014}}
29 | > ```
30 |
31 | ## Directory and File Structure
32 |
33 | - **label_data_estimate_thumos14.txt**
34 |   This is an example file that contains estimated action labels. It is used as input to the evaluation script (see below). For details on the file format, please refer to the [THUMOS14 challenge documentation](https://www.crcv.ucf.edu/THUMOS14/THUMOS14_Evaluation.pdf); a sketch of the expected row format follows this file.
35 |
36 | ## Usage Instructions
37 |
38 | 1. **Download the THUMOS14 Evaluation Toolkit**
39 | - As described in the [THUMOS14 challenge documentation](https://www.crcv.ucf.edu/THUMOS14/THUMOS14_Evaluation.pdf), download the evaluation toolkit (see "Section 4 Development kit").
40 | - Unzip the downloaded file (`THUMOS14_evalkit_20140818.zip`).
41 | 2. **Place the Label Data File**
42 | - Move `label_data_estimate_thumos14.txt` into the `THUMOS14_evalkit_20140818/TH14evalkit/results` directory.
43 | - Install MATLAB or Octave, and from within the `THUMOS14_evalkit_20140818/TH14evalkit` directory, run the following command to compute the evaluation metrics:
44 |
45 | ```matlab
46 | [pr_all, ap_all, map] = TH14evaldet('results/label_data_estimate_thumos14.txt', 'groundtruth', 'test')
47 | ```
--------------------------------------------------------------------------------
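For orientation, the THUMOS14 temporal detection format is plain text with one detection per line. Based on the linked evaluation documentation it typically looks like the rows below (video id, start time in seconds, end time in seconds, class index, confidence), though the exact conventions should be verified against the official PDF:

```text
video_test_0000004 12.5 21.9 21 0.87
video_test_0000004 32.0 38.4 21 0.64
video_test_0000062  5.0  9.3 40 0.55
```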
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
40 |
41 |
42 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # VLM-Video-Action-Localization
2 | This repository provides sample code for the paper [Open-vocabulary action localization with iterative visual prompting (IEEE Access)](https://ieeexplore.ieee.org/abstract/document/10942370), authored by the [Applied Robotics Research](https://www.microsoft.com/en-us/research/group/applied-robotics-research/) team.
3 |
4 | ### Overview of the pipeline:
5 | 
6 |
7 | ## How to use
8 | We have confirmed that the sample code works with Python 3.12.1.
9 | 
10 | Modify [auth.env](./auth.env) to set the following credentials:
11 |
12 | ### If you use Azure OpenAI
13 | - AZURE_OPENAI_DEPLOYMENT_NAME
14 | - AZURE_OPENAI_ENDPOINT
15 | - AZURE_OPENAI_API_KEY
16 | ### If you use OpenAI
17 | - OPENAI_API_KEY
18 |
19 | ### Install dependencies
20 | ```bash
21 | pip install -r requirements.txt
22 | ```
23 |
24 | ### Run the sample code
25 | ```bash
26 | python example.py --credentials auth.env --video sample_video/sample.mp4 --grid 3 --action "Grasping the can"
27 | ```
28 | The `--grid N` option controls how many frames are sampled per iteration: NxN frames are extracted and tiled into a single image as an NxN grid (a minimal sketch of this tiling follows this file).
29 | ## Bibliography
30 | ```
31 | @article{wake2025open,
32 | author={Wake, Naoki and Kanehira, Atsushi and Sasabuchi, Kazuhiro and Takamatsu, Jun and Ikeuchi, Katsushi},
33 | journal={IEEE Access},
34 | title={Open-Vocabulary Action Localization With Iterative Visual Prompting},
35 | year={2025},
36 | volume={13},
37 | number={},
38 | pages={56908--56917},
39 | doi={10.1109/ACCESS.2025.3555167}}
40 | ```
41 |
42 | ## Contributing
43 |
44 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
45 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
46 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
47 |
48 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
49 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
50 | provided by the bot. You will only need to do this once across all repos using our CLA.
51 |
52 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
53 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
54 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
55 |
56 | ## Trademarks
57 |
58 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
59 | trademarks or logos is subject to and must follow
60 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
61 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
62 | Any use of third-party trademarks or logos is subject to those third parties' policies.
63 |
--------------------------------------------------------------------------------
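The NxN grid tiling that `--grid` refers to can be sketched as follows. This is a minimal reimplementation for illustration, not the code in `example.py`; the tile size, label styling, and function name are assumptions:

```python
import cv2
import numpy as np

def make_grid_image(video_path, n, start_sec, end_sec, tile_size=(320, 180)):
    """Sample n*n frames evenly from [start_sec, end_sec], label each with its
    index, and tile them into a single n x n grid image."""
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    times = np.linspace(start_sec, end_sec, n * n)
    tiles = []
    for i, t in enumerate(times):
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(t * fps))
        ok, frame = cap.read()
        if not ok:
            # Fall back to a black tile if the frame cannot be read.
            frame = np.zeros((tile_size[1], tile_size[0], 3), np.uint8)
        frame = cv2.resize(frame, tile_size)
        # Overlay the frame index so the VLM can refer to frames by number.
        cv2.putText(frame, str(i), (10, 30), cv2.FONT_HERSHEY_SIMPLEX,
                    1.0, (0, 255, 0), 2)
        tiles.append(frame)
    cap.release()
    rows = [cv2.hconcat(tiles[r * n:(r + 1) * n]) for r in range(n)]
    return cv2.vconcat(rows)

# Example: a 3x3 grid over the first five seconds of the sample video.
grid = make_grid_image("sample_video/sample.mp4", 3, 0.0, 5.0)
cv2.imwrite("grid_image_sample.png", grid)
```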
/finegrained-breakfast-dataset/clip_original_videos.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import re
4 | import subprocess
5 |
6 | # Configuration
7 | json_file = "label_data_gt_right.json" # Path to the JSON file
8 | frame_rate = 30 # Frame rate of the videos
9 |
10 | # Read the JSON file
11 | with open(json_file, "r", encoding="utf-8") as f:
12 | data = json.load(f)
13 |
14 | # Process each entry in the JSON file
15 | for entry in data:
16 | # Get the video_path field (expected format, e.g., "original_videos/subject_1_gopro_seg_1_2162-2284.mp4")
17 | video_path_field = entry.get("video_path")
18 | if not video_path_field:
19 | print("Missing video_path field. Skipping entry.")
20 | continue
21 |
22 | # Normalize path separators (in case backslashes are used)
23 | video_path_field = video_path_field.replace("\\", "/")
24 |
25 | # Split the video_path into directory and filename
26 | directory, filename = os.path.split(video_path_field)
27 | basename, ext = os.path.splitext(filename)
28 |
29 | # Use regex to extract segment info from the end of the basename.
30 | # Expected pattern: an underscore followed by two numbers separated by a dash, e.g., "_2162-2284"
31 | match = re.search(r'_(\d+)-(\d+)$', basename)
32 | if not match:
33 | print(f"Segment info not found in filename: {filename}. Skipping entry.")
34 | continue
35 |
36 | segment_start_str, segment_end_str = match.groups()
37 | try:
38 | segment_start_frame = int(segment_start_str)
39 | segment_end_frame = int(segment_end_str)
40 | except ValueError:
41 | print(f"Invalid segment frame numbers in filename: {filename}. Skipping entry.")
42 | continue
43 |
44 | # Calculate the duration in frames and verify the range is valid
45 | duration_frames = segment_end_frame - segment_start_frame
46 | if duration_frames <= 0:
47 | print(f"Invalid frame range in filename: {filename}. Skipping entry.")
48 | continue
49 |
50 | # Convert frame numbers to seconds for FFmpeg
51 | start_time_sec = segment_start_frame / frame_rate
52 | duration_sec = duration_frames / frame_rate
53 |
54 | # Determine the original video filename by removing the segment info from the basename.
55 | # For example, if basename is "subject_1_gopro_seg_1_2162-2284", the original basename will be "subject_1_gopro_seg_1".
56 | original_basename = basename[:match.start()] # Everything before the segment info
57 | original_filename = original_basename + ext
58 | original_video_path = os.path.join(directory, original_filename)
59 |
60 | if not os.path.exists(original_video_path):
61 | print(f"Original video file not found: {original_video_path}. Skipping entry.")
62 | continue
63 |
64 | # Use FFmpeg to extract the clip from the original video.
65 | # The clip starts at 'start_time_sec' (in seconds) and lasts for 'duration_sec' seconds.
66 | # The output file will be saved with the same name as specified in the video_path field.
67 | output_video_path = video_path_field
68 | ffmpeg_cmd = [
69 | "ffmpeg",
70 | "-i", original_video_path,
71 | "-ss", str(start_time_sec),
72 | "-t", str(duration_sec),
73 | "-c:v", "libx264",
74 | "-crf", "23",
75 | "-preset", "fast",
76 | "-c:a", "aac",
77 | "-b:a", "128k",
78 | "-y", output_video_path
79 | ]
80 |
81 | print(f"Extracting clip: {output_video_path}")
82 |     result = subprocess.run(ffmpeg_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
83 |     if result.returncode != 0:
84 |         print(f"FFmpeg failed for {output_video_path} (exit code {result.returncode}).")
85 | 
86 | print("All extractions completed!")
87 | 
--------------------------------------------------------------------------------
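As a sanity check after running the script, the frame count of each extracted clip can be compared against the range encoded in its filename. A hedged sketch (re-encoding can shift counts by a frame or two, hence the tolerance):

```python
import glob
import os
import re
import cv2

for clip in glob.glob("original_videos/*_*-*.mp4"):
    m = re.search(r"_(\d+)-(\d+)\.mp4$", clip)
    if not m:
        continue  # original videos without a frame-range suffix are skipped
    expected = int(m.group(2)) - int(m.group(1))
    cap = cv2.VideoCapture(clip)
    actual = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()
    if abs(actual - expected) > 2:
        print(f"{os.path.basename(clip)}: expected ~{expected} frames, got {actual}")
```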
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
--------------------------------------------------------------------------------
/finegrained-breakfast-dataset/README.md:
--------------------------------------------------------------------------------
1 | # Fine-grained Breakfast dataset
2 |
3 | This folder provides resources for evaluating action label predictions on videos from the Fine-grained Breakfast dataset. It includes ground-truth annotations and an evaluation script.
4 |
5 | This dataset is provided as supplementary material for the paper:
6 |
7 | > **Open-vocabulary action localization with iterative visual prompting**
8 | > *Naoki Wake, Atsushi Kanehira, Kazuhiro Sasabuchi, Jun Takamatsu, Katsushi Ikeuchi (2025), [IEEE Access, 13, 56908-56917](https://ieeexplore.ieee.org/abstract/document/10942370)*
9 | >
10 | > ```bibtex
11 | >@article{wake2025open,
12 | > author={Wake, Naoki and Kanehira, Atsushi and Sasabuchi, Kazuhiro and Takamatsu, Jun and Ikeuchi, Katsushi},
13 | > journal={IEEE Access},
14 | > title={Open-vocabulary action localization with iterative visual prompting},
15 | > year={2025},
16 | > volume={13},
17 | > number={},
18 | > pages={56908--56917},
19 | > doi={10.1109/ACCESS.2025.3555167}}
20 | > ```
21 |
22 | The original data is derived from the dataset described below. We have manually annotated a subset of these videos:
23 |
24 | > **Human grasping database for activities of daily living with depth, color and kinematic data streams**
25 | > *Artur Saudabayev, Zhanibek Rysbek, Raykhan Khassenova, Huseyin Atakan Varol (2018), Scientific Data, 5(1), 1–13*
26 | >
27 | > ```bibtex
28 | > @article{saudabayev2018human,
29 | > title={Human grasping database for activities of daily living with depth, color and kinematic data streams},
30 | > author={Saudabayev, Artur and Rysbek, Zhanibek and Khassenova, Raykhan and Varol, Huseyin Atakan},
31 | > journal={Scientific data},
32 | > volume={5},
33 | > number={1},
34 | > pages={1--13},
35 | > year={2018},
36 | > publisher={Nature Publishing Group}
37 | > }
38 | > ```
39 |
40 | ## Directory and File Structure
41 |
42 | - **original_videos**
43 | Download the original videos from `Human grasping database for activities of daily living with depth, color and kinematic data streams` and place them in this folder.
44 |
45 | - **label_data_gt_right.json**
46 | This JSON file holds the ground-truth annotations for the videos. Each entry in the JSON contains:
47 | - **action**: A sequence of action labels that occur in the video.
48 | *Example*: `["Grasp with the right hand", "Picking with the right hand", ...]`
49 | - **gt_time**: The frame index annotations corresponding to each action label (FPS=30.0).
50 | *Example*: `[[0, 23], [24, 48], ...]`
51 | - **video_path**: The relative path to the corresponding video file.
52 | *Example*: `"original_videos/subject_9_gopro_seg_1_2324-2575.mp4"`
53 | **Note**: This file name is constructed from the original video name with the appended frame range. Since this repository does not provide the original videos, you need to download the original dataset, extract the clips corresponding to the specified frame numbers, and place them in the `original_videos` folder. We provide the script `clip_original_videos.py` to extract these clips. The list of original video files is provided in `original_videos/original_videos.txt`.
54 |
55 | - **label_data_estimate_baseline.json**
56 |   This is an example file that contains estimated action labels. It is used as input to the evaluation script (an illustrative entry is sketched after this file).
57 |
58 | - **compute_mof_iou_f1.py**
59 | This evaluation script computes performance metrics (e.g., MOF, IoU, and F1 score) by comparing predicted action labels with the ground truth.
60 | ```bash
61 | python compute_mof_iou_f1.py --file label_data_estimate_baseline.json
62 | ```
63 |
64 | - **clip_original_videos.py**
65 | This script extracts video clips from the original videos based on the frame indices specified in `label_data_gt_right.json`. Running this script will generate the video dataset with filenames as indicated in the JSON annotations.
66 |
67 | ## Usage Instructions
68 |
69 | 1. **Place the Video Files**
70 | - Download the original videos from the Fine-grained Breakfast dataset.
71 | - Place the downloaded video files in the `original_videos` folder. Refer to `original_videos/original_videos.txt` for the list of required files.
72 |
73 | 2. **Generate the Video Dataset**
74 |    After placing the original videos in the `original_videos` folder, run the `clip_original_videos.py` script to extract the annotated clips. This script uses the frame index annotations provided in `label_data_gt_right.json` to cut the clips from the original videos and save them using the specified naming convention. Run the script with the following command; note that it requires FFmpeg to be installed and available on your PATH.
75 | ```bash
76 | python clip_original_videos.py
77 | ```
--------------------------------------------------------------------------------
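The estimate-file format is not spelled out above, but judging from `compute_mof_iou_f1.py`, each entry carries the ground-truth fields plus `start_times` and `completed_times` dictionaries keyed by action, with values in seconds. A hypothetical entry (illustrative values only, inferred from the evaluation code rather than from the actual file):

```json
[
  {
    "action": ["Grasp with the right hand", "Picking with the right hand"],
    "gt_time": [[0, 23], [24, 48]],
    "video_path": "original_videos/subject_9_gopro_seg_1_2324-2575.mp4",
    "start_times": {"Grasp with the right hand": 0.0, "Picking with the right hand": 0.8},
    "completed_times": {"Grasp with the right hand": 0.8, "Picking with the right hand": 1.6}
  }
]
```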
/docs/index.html:
--------------------------------------------------------------------------------
Open-vocabulary Temporal Action Localization using VLMs

Video action localization aims to find the timings of a specific action in a long video. Although existing learning-based approaches have been successful, they require annotated videos, which come at a considerable labor cost. This paper proposes a learning-free, open-vocabulary approach based on emerging off-the-shelf vision-language models (VLMs). The challenge stems from the fact that VLMs are neither designed to process long videos nor tailored for finding actions. We overcome these problems by extending an iterative visual prompting technique. Specifically, we sample video frames into a concatenated image with frame index labels, making a VLM guess the frame that is closest to the start/end of the action. Iterating this process while narrowing the sampling time window identifies the specific start and end frames of an action. We demonstrate that this sampling technique yields reasonable results, illustrating a practical extension of VLMs for understanding videos.

Pipeline

The proposed pipeline for open-vocabulary video action localization using a VLM consists of the following steps: (a) Frames are sampled at regular intervals from a time window, covering the entire video in the first iteration. (b) The sampled frames are tiled into a single image with annotations indicating their time order. (c) This image is fed into a VLM to identify the frame closest to a specific timing of an action (e.g., the start of the action). (d) The sampling window is updated by centering on the selected frame with a narrower sampling interval. Bottom panel (1): for general action localization, the start time of the action in the video is determined by iterating steps (a) to (d). Bottom panel (2): by estimating the end time of the action in the same manner, the action is localized in the video.

Qualitative Results

We qualitatively checked our proposed pipeline using a cooking-preparation video that we recorded in-house. This 10-minute first-person video included actions such as taking out, washing, and cutting vegetables. The figure below shows examples of the identified video segments for the actions "cutting vegetables," "washing vegetables," and "turning on a faucet," demonstrating that reasonable outputs were obtained.

Quantitative Results

The table below compares our proposed method with an existing method [1] on the Breakfast Dataset [2]. While our method does not surpass the latest model-based approaches, it demonstrates the feasibility of a learning-free approach. Importantly, this method offers significant advantages: it eliminates the need for data collection or training and can extract actions specified by open-vocabulary free-text queries, thereby enhancing its adaptability to diverse applications such as video annotation and video editing.

[1] Elena Bueno-Benito, Biel Tura Vecino, and Mariella Dimiccoli. "Leveraging triplet loss for unsupervised action segmentation." In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pages 4922–4930, 2023.

[2] Hilde Kuehne, Ali Arslan, and Thomas Serre. "The language of actions: Recovering the syntax and semantics of goal-directed human activities." In Proceedings of the IEEE conference on computer vision and pattern recognition, pages 780–787, 2014.
--------------------------------------------------------------------------------
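The iterative narrowing described in the Pipeline section can be summarized with a short sketch. This is pseudocode-style Python, not the repository's implementation: `make_grid_image` refers to the tiling sketch shown after the top-level README above, and `ask_vlm_for_frame` is a placeholder for the actual VLM query:

```python
import numpy as np

def localize_boundary(video_path, action, boundary, window, n=3, iterations=3):
    """Iteratively narrow the sampling window around the start or end of an action.

    `boundary` is "start" or "end"; `ask_vlm_for_frame` stands in for a call that
    shows the VLM an n x n grid image and returns the chosen frame index.
    """
    start, end = window
    for _ in range(iterations):
        times = np.linspace(start, end, n * n)             # sampled timestamps
        grid = make_grid_image(video_path, n, start, end)  # tiled, index-labeled image
        idx = ask_vlm_for_frame(grid, action, boundary)    # VLM picks a frame index
        center = times[idx]
        step = (end - start) / (n * n - 1)                 # one sampling interval
        start, end = max(start, center - step), min(end, center + step)
    return (start + end) / 2.0
```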
/breakfast-dataset/compute_mof_iou_f1.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import cv2
5 | import matplotlib.pyplot as plt
6 |
7 |
8 | def parse_arguments():
9 | """Parse command line arguments."""
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument("--file", help="Input JSON file containing estimated labels", required=True)
12 | parser.add_argument("--outdir", help="Output directory", default="out/visualize")
13 | return parser.parse_args()
14 |
15 |
16 | def compute_tiou(pred_interval, gt_interval):
17 | """
18 | Compute the temporal Intersection over Union (tIoU) between two intervals.
19 |
20 | Args:
21 | pred_interval (tuple): (start_frame, end_frame) of the prediction.
22 | gt_interval (tuple): (start_frame, end_frame) of the ground truth.
23 |
24 | Returns:
25 | float: The tIoU value.
26 | """
27 | intersection = max(0, min(pred_interval[1], gt_interval[1]) - max(pred_interval[0], gt_interval[0]))
28 | union = max(pred_interval[1], gt_interval[1]) - min(pred_interval[0], gt_interval[0])
29 | return intersection / union if union > 0 else 0
30 |
31 |
32 | def compute_map(pred_intervals, gt_intervals, tiou_thresholds):
33 | """
34 | Compute the mean Average Precision (mAP) over a set of tIoU thresholds.
35 |
36 | Args:
37 | pred_intervals (list of tuple): List of predicted intervals.
38 | gt_intervals (list of tuple): List of ground truth intervals.
39 | tiou_thresholds (list of float): List of tIoU thresholds.
40 |
41 | Returns:
42 | float: The computed mAP value.
43 | """
44 | assert len(pred_intervals) == len(gt_intervals)
45 | ap_values = []
46 |
47 | for threshold in tiou_thresholds:
48 | matches = []
49 | # Evaluate each prediction
50 | for pred in pred_intervals:
51 | match_found = False
52 | for gt in gt_intervals:
53 | tiou = compute_tiou(pred, gt)
54 | if tiou >= threshold:
55 | matches.append((1, tiou)) # True Positive
56 | match_found = True
57 | break
58 | if not match_found:
59 | matches.append((0, 0)) # False Positive
60 |
61 | # Sort by tIoU (descending order)
62 | matches.sort(key=lambda x: x[1], reverse=True)
63 | tp_cum, fp_cum = 0, 0
64 | precisions = []
65 | recalls = []
66 |
67 | for match, _ in matches:
68 | if match == 1:
69 | tp_cum += 1
70 | else:
71 | fp_cum += 1
72 | precision = tp_cum / (tp_cum + fp_cum)
73 | recall = tp_cum / len(gt_intervals)
74 | precisions.append(precision)
75 | recalls.append(recall)
76 |
77 | # Compute AP using a simple approximation (area under the precision-recall curve)
78 | ap = 0.0
79 | for i in range(1, len(recalls)):
80 | ap += (recalls[i] - recalls[i - 1]) * precisions[i]
81 | ap_values.append(ap)
82 |
83 | return sum(ap_values) / len(ap_values) if ap_values else 0
84 |
85 |
86 | def time_to_frame(time_in_seconds, fps):
87 | """Convert time in seconds to frame number based on fps."""
88 | return int(round(time_in_seconds * fps))
89 |
90 |
91 | def create_label_array(total_frames, intervals):
92 | """
93 | Create a label array of length total_frames from a list of intervals.
94 |
95 | Each interval is assigned a unique label (based on its index). Every frame
96 | in the interval (inclusive) is assigned that label.
97 |
98 | Args:
99 | total_frames (int): Total number of frames.
100 | intervals (list of tuple): List of intervals (start_frame, end_frame).
101 |
102 | Returns:
103 | list: An array of labels for each frame.
104 | """
105 | labels = [-1] * total_frames
106 | for idx, (start, end) in enumerate(intervals):
107 | for frame in range(start, end + 1):
108 | labels[frame] = idx
109 | return labels
110 |
111 |
112 | def compute_metrics(video_data, fps):
113 | """
114 | Compute various evaluation metrics (MoF, IoU per class, mean IoU, and F1 per class)
115 | for a single video's predictions.
116 |
117 | Assumes that ground truth time intervals are 1-indexed and converts them to 0-indexed.
118 |
119 | Args:
120 | video_data (dict): Dictionary containing ground truth and predicted data.
121 | fps (float): Frames per second of the video.
122 |
123 | Returns:
124 | tuple: (MoF, IoU per class, mean IoU, F1 per class, mean F1)
125 | """
126 | gt_actions = video_data['action']
127 | gt_intervals = video_data['gt_time']
128 | pred_start_times = video_data['start_times']
129 | pred_end_times = video_data['completed_times']
130 |
131 | # Convert ground truth intervals from 1-indexed to 0-indexed
132 | gt_intervals = [(start - 1, end - 1) for start, end in gt_intervals]
133 | total_frames = gt_intervals[-1][1] + 1
134 |
135 | # Create ground truth label array
136 | label_gt = create_label_array(total_frames, gt_intervals)
137 |
138 | # Create predicted label array (initialized with -1)
139 | label_pred = [-1] * total_frames
140 | pred_keys = list(pred_start_times.keys())
141 | for idx, key in enumerate(pred_keys):
142 | start_time = pred_start_times[key]
143 | end_time = pred_end_times[key]
144 | start_frame = time_to_frame(start_time, fps)
145 | end_frame = time_to_frame(end_time, fps)
146 | for frame in range(start_frame, end_frame):
147 | if frame < total_frames:
148 | label_pred[frame] = idx
149 |
150 | # Fill any leading -1 values with 0
151 | for i in range(total_frames):
152 | if label_pred[i] == -1:
153 | label_pred[i] = 0
154 | else:
155 | break
156 |
157 | # Fill trailing -1 values with the last action's index
158 | last_index = len(pred_keys) - 1
159 | for i in range(total_frames - 1, -1, -1):
160 | if label_pred[i] == -1:
161 | label_pred[i] = last_index
162 | else:
163 | break
164 |
165 | # Ensure no -1 values remain
166 | if -1 in label_gt or -1 in label_pred:
167 | raise ValueError("Label array contains unassigned frames.")
168 |
169 | # Calculate Mean over Frames (MoF)
170 | correct_frames = sum(1 for gt, pred in zip(label_gt, label_pred) if gt == pred)
171 | mof = correct_frames / total_frames if total_frames > 0 else 0
172 |
173 | # Calculate IoU and F1 per action class
174 | iou_per_class = {}
175 | f1_per_class = {}
176 | for idx, action in enumerate(gt_actions):
177 | gt_count = sum(1 for label in label_gt if label == idx)
178 | pred_count = sum(1 for label in label_pred if label == idx)
179 | intersection = sum(1 for gt, pred in zip(label_gt, label_pred) if gt == pred == idx)
180 | union = gt_count + pred_count - intersection
181 | iou = intersection / union if union > 0 else 0
182 | iou_per_class[action] = iou
183 |
184 | tp = intersection
185 | fp = pred_count - intersection
186 | fn = gt_count - intersection
187 | precision = tp / (tp + fp) if (tp + fp) > 0 else 0
188 | recall = tp / (tp + fn) if (tp + fn) > 0 else 0
189 | f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
190 | f1_per_class[action] = f1
191 |
192 | mean_iou = sum(iou_per_class.values()) / len(iou_per_class) if iou_per_class else 0
193 | mean_f1 = sum(f1_per_class.values()) / len(f1_per_class) if f1_per_class else 0
194 |
195 | return mof, iou_per_class, mean_iou, f1_per_class, mean_f1
196 |
197 |
198 | def process_videos(label_data_estimates, tiou_thresholds):
199 | """
200 | Process each video's data, compute evaluation metrics, and collect statistics.
201 |
202 | Args:
203 | label_data_estimates (list): List of video annotation dictionaries.
204 | tiou_thresholds (list): List of tIoU thresholds for mAP calculation.
205 |
206 | Returns:
207 | dict: A dictionary with per-video metrics.
208 | dict: A dictionary containing lists of overall metrics for plotting.
209 | """
210 | mof_list = []
211 | miou_list = []
212 | mf1_list = []
213 | map_list = []
214 | action_steps = []
215 | action_frames = []
216 | results = {}
217 |
218 | for video_entry in label_data_estimates:
219 | video_path = video_entry['video_path']
220 |
221 | fps = 15.0
222 |
223 | # Skip entries where start_times is a string (invalid data)
224 | if isinstance(video_entry.get('start_times'), str):
225 | print("Skipping video:", video_path)
226 | continue
227 |
228 | mof, iou_per_class, mean_iou, f1_per_class, mean_f1 = compute_metrics(video_entry, fps)
229 | mof_list.append(mof)
230 | miou_list.append(mean_iou)
231 | mf1_list.append(mean_f1)
232 | results[video_path] = {"MoF": mof, "mIoU": mean_iou, "mF1": mean_f1}
233 |
234 | # Compute durations for predicted actions
235 | pred_start_times = video_entry['start_times']
236 | pred_end_times = video_entry['completed_times']
237 | durations = [pred_end_times[key] - pred_start_times[key] for key in pred_start_times.keys()]
238 | results[video_path]["duration"] = durations
239 |
240 | # Convert predicted intervals to frames and compute mAP
241 | pred_intervals = [
242 | (
243 | time_to_frame(pred_start_times[key], fps),
244 | time_to_frame(pred_end_times[key], fps)
245 | )
246 | for key in pred_start_times.keys()
247 | ]
248 | gt_intervals = video_entry['gt_time']
249 | map_value = compute_map(pred_intervals, gt_intervals, tiou_thresholds)
250 | map_list.append(map_value)
251 |
252 | action_steps.append(len(video_entry['action']))
253 |
254 | # Compute total frames from ground truth (adjust for 0-index)
255 | gt_intervals_zero_indexed = [(start - 1, end - 1) for start, end in video_entry['gt_time']]
256 | total_frames = gt_intervals_zero_indexed[-1][1] + 1
257 | action_frames.append(total_frames)
258 |
259 | metrics = {
260 | "MoF": mof_list,
261 | "mIoU": miou_list,
262 | "mF1": mf1_list,
263 | "mAP": map_list,
264 | "action_steps": action_steps,
265 | "action_frames": action_frames
266 | }
267 | return results, metrics
268 |
269 |
270 | def plot_metrics(metrics, output_path):
271 | """
272 | Generate scatter plots for the evaluation metrics and save the figure.
273 |
274 | Args:
275 | metrics (dict): Dictionary containing lists of metrics.
276 | output_path (str): Path to save the output plot image.
277 | """
278 | plt.figure(figsize=(12, 6))
279 |
280 | # Plot metrics against action steps
281 | ax1 = plt.subplot(2, 4, 1)
282 | plt.scatter(metrics["action_steps"], metrics["MoF"], alpha=0.6)
283 | plt.xlabel('Action Length (steps)')
284 | plt.ylabel('MoF')
285 | plt.ylim(0, 1)
286 | plt.title('Action Length vs MoF')
287 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
288 |
289 | ax2 = plt.subplot(2, 4, 2)
290 | plt.scatter(metrics["action_steps"], metrics["mIoU"], alpha=0.6)
291 | plt.xlabel('Action Length (steps)')
292 | plt.ylabel('mIoU')
293 | plt.ylim(0, 1)
294 | plt.title('Action Length vs mIoU')
295 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
296 |
297 | ax3 = plt.subplot(2, 4, 3)
298 | plt.scatter(metrics["action_steps"], metrics["mF1"], alpha=0.6)
299 | plt.xlabel('Action Length (steps)')
300 | plt.ylabel('mF1')
301 | plt.ylim(0, 1)
302 | plt.title('Action Length vs mF1')
303 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
304 |
305 | ax4 = plt.subplot(2, 4, 4)
306 | plt.scatter(metrics["action_steps"], metrics["mAP"], alpha=0.6)
307 | plt.xlabel('Action Length (steps)')
308 | plt.ylabel('mAP')
309 | plt.ylim(0, 1)
310 | plt.title('Action Length vs mAP')
311 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
312 |
313 | # Plot metrics against action frames
314 | ax5 = plt.subplot(2, 4, 5)
315 | plt.scatter(metrics["action_frames"], metrics["MoF"], alpha=0.6)
316 | plt.xlabel('Action Length (frames)')
317 | plt.ylabel('MoF')
318 | plt.ylim(0, 1)
319 | plt.title('Frames vs MoF')
320 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
321 |
322 | ax6 = plt.subplot(2, 4, 6)
323 | plt.scatter(metrics["action_frames"], metrics["mIoU"], alpha=0.6)
324 | plt.xlabel('Action Length (frames)')
325 | plt.ylabel('mIoU')
326 | plt.ylim(0, 1)
327 | plt.title('Frames vs mIoU')
328 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
329 |
330 | ax7 = plt.subplot(2, 4, 7)
331 | plt.scatter(metrics["action_frames"], metrics["mF1"], alpha=0.6)
332 | plt.xlabel('Action Length (frames)')
333 | plt.ylabel('mF1')
334 | plt.ylim(0, 1)
335 | plt.title('Frames vs mF1')
336 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
337 |
338 | ax8 = plt.subplot(2, 4, 8)
339 | plt.scatter(metrics["action_frames"], metrics["mAP"], alpha=0.6)
340 | plt.xlabel('Action Length (frames)')
341 | plt.ylabel('mAP')
342 | plt.ylim(0, 1)
343 | plt.title('Frames vs mAP')
344 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
345 |
346 | plt.tight_layout()
347 | plt.savefig(output_path)
348 | plt.close()
349 |
350 |
351 | def main():
352 | args = parse_arguments()
353 | tiou_thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]
354 | input_filename = args.file
355 |
356 | # Load estimated label data from JSON
357 | with open(input_filename, "r") as f:
358 | label_data_estimates = json.load(f)
359 |
360 | # Process each video entry to compute metrics
361 | results, metrics = process_videos(label_data_estimates, tiou_thresholds)
362 |
363 | # Prepare output directories
364 | out_dir = args.outdir
365 | base_filename = os.path.splitext(os.path.basename(input_filename))[0]
366 | parent_dir = os.path.basename(os.path.dirname(input_filename))
367 | output_dir = os.path.join(out_dir, parent_dir)
368 | os.makedirs(output_dir, exist_ok=True)
369 | plot_output_path = os.path.join(output_dir, base_filename + ".png")
370 |
371 | # Plot and save the evaluation metrics
372 | plot_metrics(metrics, plot_output_path)
373 |
374 | # Print mean metric values
375 | mean_mof = sum(metrics["MoF"]) / len(metrics["MoF"]) if metrics["MoF"] else 0
376 | mean_miou = sum(metrics["mIoU"]) / len(metrics["mIoU"]) if metrics["mIoU"] else 0
377 | mean_mf1 = sum(metrics["mF1"]) / len(metrics["mF1"]) if metrics["mF1"] else 0
378 | mean_map = sum(metrics["mAP"]) / len(metrics["mAP"]) if metrics["mAP"] else 0
379 |
380 | print("Mean MoF: {:.4f}".format(mean_mof))
381 | print("Mean mIoU: {:.4f}".format(mean_miou))
382 | print("Mean mF1: {:.4f}".format(mean_mf1))
383 | print("Mean mAP: {:.4f}".format(mean_map))
384 | print("Processed videos:", len(metrics["MoF"]))
385 |
386 | # Save detailed results as JSON
387 | results_output_path = os.path.join(output_dir, base_filename + ".json")
388 | with open(results_output_path, "w") as f:
389 | json.dump(results, f, indent=4)
390 |
391 |
392 | if __name__ == "__main__":
393 | main()
394 |
--------------------------------------------------------------------------------
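As a quick sanity check of the interval metric above, `compute_tiou` behaves as expected on overlapping and disjoint intervals (run from within the `breakfast-dataset` folder; importing the module also pulls in its cv2 and matplotlib dependencies):

```python
from compute_mof_iou_f1 import compute_tiou

# Intervals (2, 6) and (4, 8) overlap on [4, 6]: intersection = 2, union = 6.
assert abs(compute_tiou((2, 6), (4, 8)) - 2 / 6) < 1e-9

# Disjoint intervals have zero intersection, hence tIoU = 0.
assert compute_tiou((0, 2), (5, 9)) == 0.0
```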
/finegrained-breakfast-dataset/compute_mof_iou_f1.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import cv2
5 | import matplotlib.pyplot as plt
6 |
7 |
8 | def parse_arguments():
9 | """Parse command line arguments."""
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument("--file", help="Input JSON file containing estimated labels", required=True)
12 | parser.add_argument("--outdir", help="Output directory", default="out/visualize")
13 | return parser.parse_args()
14 |
15 |
16 | def compute_tiou(pred_interval, gt_interval):
17 | """
18 | Compute the temporal Intersection over Union (tIoU) between two intervals.
19 |
20 | Args:
21 | pred_interval (tuple): (start_frame, end_frame) of the prediction.
22 | gt_interval (tuple): (start_frame, end_frame) of the ground truth.
23 |
24 | Returns:
25 | float: The tIoU value.
26 | """
27 | intersection = max(0, min(pred_interval[1], gt_interval[1]) - max(pred_interval[0], gt_interval[0]))
28 | union = max(pred_interval[1], gt_interval[1]) - min(pred_interval[0], gt_interval[0])
29 | return intersection / union if union > 0 else 0
30 |
31 |
32 | def compute_map(pred_intervals, gt_intervals, tiou_thresholds):
33 | """
34 | Compute the mean Average Precision (mAP) over a set of tIoU thresholds.
35 |
36 | Args:
37 | pred_intervals (list of tuple): List of predicted intervals.
38 | gt_intervals (list of tuple): List of ground truth intervals.
39 | tiou_thresholds (list of float): List of tIoU thresholds.
40 |
41 | Returns:
42 | float: The computed mAP value.
43 | """
44 | assert len(pred_intervals) == len(gt_intervals)
45 | ap_values = []
46 |
47 | for threshold in tiou_thresholds:
48 | matches = []
49 | # Evaluate each prediction
50 | for pred in pred_intervals:
51 | match_found = False
52 | for gt in gt_intervals:
53 | tiou = compute_tiou(pred, gt)
54 | if tiou >= threshold:
55 | matches.append((1, tiou)) # True Positive
56 | match_found = True
57 | break
58 | if not match_found:
59 | matches.append((0, 0)) # False Positive
60 |
61 | # Sort by tIoU (descending order)
62 | matches.sort(key=lambda x: x[1], reverse=True)
63 | tp_cum, fp_cum = 0, 0
64 | precisions = []
65 | recalls = []
66 |
67 | for match, _ in matches:
68 | if match == 1:
69 | tp_cum += 1
70 | else:
71 | fp_cum += 1
72 | precision = tp_cum / (tp_cum + fp_cum)
73 | recall = tp_cum / len(gt_intervals)
74 | precisions.append(precision)
75 | recalls.append(recall)
76 |
77 | # Compute AP using a simple approximation (area under the precision-recall curve)
78 | ap = 0.0
79 | for i in range(1, len(recalls)):
80 | ap += (recalls[i] - recalls[i - 1]) * precisions[i]
81 | ap_values.append(ap)
82 |
83 | return sum(ap_values) / len(ap_values) if ap_values else 0
84 |
85 |
86 | def time_to_frame(time_in_seconds, fps):
87 | """Convert time in seconds to frame number based on fps."""
88 | return int(round(time_in_seconds * fps))
89 |
90 |
91 | def create_label_array(total_frames, intervals):
92 | """
93 | Create a label array of length total_frames from a list of intervals.
94 |
95 | Each interval is assigned a unique label (based on its index). Every frame
96 | in the interval (inclusive) is assigned that label.
97 |
98 | Args:
99 | total_frames (int): Total number of frames.
100 | intervals (list of tuple): List of intervals (start_frame, end_frame).
101 |
102 | Returns:
103 | list: An array of labels for each frame.
104 | """
105 | labels = [-1] * total_frames
106 | for idx, (start, end) in enumerate(intervals):
107 | for frame in range(start, end + 1):
108 | labels[frame] = idx
109 | return labels
110 |
111 |
112 | def compute_metrics(video_data, fps):
113 | """
114 |     Compute evaluation metrics (MoF, per-class IoU, mean IoU, per-class F1,
115 |     and mean F1) for a single video's predictions.
116 |
117 | Assumes that ground truth time intervals are 1-indexed and converts them to 0-indexed.
118 |
119 | Args:
120 | video_data (dict): Dictionary containing ground truth and predicted data.
121 | fps (float): Frames per second of the video.
122 |
123 | Returns:
124 | tuple: (MoF, IoU per class, mean IoU, F1 per class, mean F1)
125 | """
126 | gt_actions = video_data['action']
127 | gt_intervals = video_data['gt_time']
128 | pred_start_times = video_data['start_times']
129 | pred_end_times = video_data['completed_times']
130 |
131 | # Convert ground truth intervals from 1-indexed to 0-indexed
132 | gt_intervals = [(start - 1, end - 1) for start, end in gt_intervals]
133 | total_frames = gt_intervals[-1][1] + 1
134 |
135 | # Create ground truth label array
136 | label_gt = create_label_array(total_frames, gt_intervals)
137 |
138 | # Create predicted label array (initialized with -1)
139 | label_pred = [-1] * total_frames
140 | pred_keys = list(pred_start_times.keys())
141 | for idx, key in enumerate(pred_keys):
142 | start_time = pred_start_times[key]
143 | end_time = pred_end_times[key]
144 | start_frame = time_to_frame(start_time, fps)
145 | end_frame = time_to_frame(end_time, fps)
146 | for frame in range(start_frame, end_frame):
147 | if frame < total_frames:
148 | label_pred[frame] = idx
149 |
150 | # Fill any leading -1 values with 0
151 | for i in range(total_frames):
152 | if label_pred[i] == -1:
153 | label_pred[i] = 0
154 | else:
155 | break
156 |
157 | # Fill trailing -1 values with the last action's index
158 | last_index = len(pred_keys) - 1
159 | for i in range(total_frames - 1, -1, -1):
160 | if label_pred[i] == -1:
161 | label_pred[i] = last_index
162 | else:
163 | break
164 |
165 | # Ensure no -1 values remain
166 | if -1 in label_gt or -1 in label_pred:
167 | raise ValueError("Label array contains unassigned frames.")
168 |
169 | # Calculate Mean over Frames (MoF)
170 | correct_frames = sum(1 for gt, pred in zip(label_gt, label_pred) if gt == pred)
171 | mof = correct_frames / total_frames if total_frames > 0 else 0
172 |
173 | # Calculate IoU and F1 per action class
174 | iou_per_class = {}
175 | f1_per_class = {}
176 | for idx, action in enumerate(gt_actions):
177 | gt_count = sum(1 for label in label_gt if label == idx)
178 | pred_count = sum(1 for label in label_pred if label == idx)
179 | intersection = sum(1 for gt, pred in zip(label_gt, label_pred) if gt == pred == idx)
180 | union = gt_count + pred_count - intersection
181 | iou = intersection / union if union > 0 else 0
182 | iou_per_class[action] = iou
183 |
184 | tp = intersection
185 | fp = pred_count - intersection
186 | fn = gt_count - intersection
187 | precision = tp / (tp + fp) if (tp + fp) > 0 else 0
188 | recall = tp / (tp + fn) if (tp + fn) > 0 else 0
189 | f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
190 | f1_per_class[action] = f1
191 |
192 | mean_iou = sum(iou_per_class.values()) / len(iou_per_class) if iou_per_class else 0
193 | mean_f1 = sum(f1_per_class.values()) / len(f1_per_class) if f1_per_class else 0
194 |
195 | return mof, iou_per_class, mean_iou, f1_per_class, mean_f1
196 |
197 |
198 | def process_videos(label_data_estimates, tiou_thresholds):
199 | """
200 | Process each video's data, compute evaluation metrics, and collect statistics.
201 |
202 | Args:
203 | label_data_estimates (list): List of video annotation dictionaries.
204 | tiou_thresholds (list): List of tIoU thresholds for mAP calculation.
205 |
206 | Returns:
207 | dict: A dictionary with per-video metrics.
208 | dict: A dictionary containing lists of overall metrics for plotting.
209 | """
210 | mof_list = []
211 | miou_list = []
212 | mf1_list = []
213 | map_list = []
214 | action_steps = []
215 | action_frames = []
216 | results = {}
217 |
218 | for video_entry in label_data_estimates:
219 | video_path = video_entry['video_path']
220 |
221 | fps = 30.0
222 |
223 | # Skip entries where start_times is a string (invalid data)
224 | if isinstance(video_entry.get('start_times'), str):
225 | print("Skipping video:", video_path)
226 | continue
227 |
228 | mof, iou_per_class, mean_iou, f1_per_class, mean_f1 = compute_metrics(video_entry, fps)
229 | mof_list.append(mof)
230 | miou_list.append(mean_iou)
231 | mf1_list.append(mean_f1)
232 | results[video_path] = {"MoF": mof, "mIoU": mean_iou, "mF1": mean_f1}
233 |
234 | # Compute durations for predicted actions
235 | pred_start_times = video_entry['start_times']
236 | pred_end_times = video_entry['completed_times']
237 | durations = [pred_end_times[key] - pred_start_times[key] for key in pred_start_times.keys()]
238 | results[video_path]["duration"] = durations
239 |
240 | # Convert predicted intervals to frames and compute mAP
241 | pred_intervals = [
242 | (
243 | time_to_frame(pred_start_times[key], fps),
244 | time_to_frame(pred_end_times[key], fps)
245 | )
246 | for key in pred_start_times.keys()
247 | ]
248 | gt_intervals = video_entry['gt_time']
249 | map_value = compute_map(pred_intervals, gt_intervals, tiou_thresholds)
250 | map_list.append(map_value)
251 |
252 | action_steps.append(len(video_entry['action']))
253 |
254 | # Compute total frames from ground truth (adjust for 0-index)
255 | gt_intervals_zero_indexed = [(start - 1, end - 1) for start, end in video_entry['gt_time']]
256 | total_frames = gt_intervals_zero_indexed[-1][1] + 1
257 | action_frames.append(total_frames)
258 |
259 | metrics = {
260 | "MoF": mof_list,
261 | "mIoU": miou_list,
262 | "mF1": mf1_list,
263 | "mAP": map_list,
264 | "action_steps": action_steps,
265 | "action_frames": action_frames
266 | }
267 | return results, metrics
268 |
269 |
270 | def plot_metrics(metrics, output_path):
271 | """
272 | Generate scatter plots for the evaluation metrics and save the figure.
273 |
274 | Args:
275 | metrics (dict): Dictionary containing lists of metrics.
276 | output_path (str): Path to save the output plot image.
277 | """
278 | plt.figure(figsize=(12, 6))
279 |
280 | # Plot metrics against action steps
281 | ax1 = plt.subplot(2, 4, 1)
282 | plt.scatter(metrics["action_steps"], metrics["MoF"], alpha=0.6)
283 | plt.xlabel('Action Length (steps)')
284 | plt.ylabel('MoF')
285 | plt.ylim(0, 1)
286 | plt.title('Action Length vs MoF')
287 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
288 |
289 | ax2 = plt.subplot(2, 4, 2)
290 | plt.scatter(metrics["action_steps"], metrics["mIoU"], alpha=0.6)
291 | plt.xlabel('Action Length (steps)')
292 | plt.ylabel('mIoU')
293 | plt.ylim(0, 1)
294 | plt.title('Action Length vs mIoU')
295 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
296 |
297 | ax3 = plt.subplot(2, 4, 3)
298 | plt.scatter(metrics["action_steps"], metrics["mF1"], alpha=0.6)
299 | plt.xlabel('Action Length (steps)')
300 | plt.ylabel('mF1')
301 | plt.ylim(0, 1)
302 | plt.title('Action Length vs mF1')
303 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
304 |
305 | ax4 = plt.subplot(2, 4, 4)
306 | plt.scatter(metrics["action_steps"], metrics["mAP"], alpha=0.6)
307 | plt.xlabel('Action Length (steps)')
308 | plt.ylabel('mAP')
309 | plt.ylim(0, 1)
310 | plt.title('Action Length vs mAP')
311 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
312 |
313 | # Plot metrics against action frames
314 | ax5 = plt.subplot(2, 4, 5)
315 | plt.scatter(metrics["action_frames"], metrics["MoF"], alpha=0.6)
316 | plt.xlabel('Action Length (frames)')
317 | plt.ylabel('MoF')
318 | plt.ylim(0, 1)
319 | plt.title('Frames vs MoF')
320 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
321 |
322 | ax6 = plt.subplot(2, 4, 6)
323 | plt.scatter(metrics["action_frames"], metrics["mIoU"], alpha=0.6)
324 | plt.xlabel('Action Length (frames)')
325 | plt.ylabel('mIoU')
326 | plt.ylim(0, 1)
327 | plt.title('Frames vs mIoU')
328 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
329 |
330 | ax7 = plt.subplot(2, 4, 7)
331 | plt.scatter(metrics["action_frames"], metrics["mF1"], alpha=0.6)
332 | plt.xlabel('Action Length (frames)')
333 | plt.ylabel('mF1')
334 | plt.ylim(0, 1)
335 | plt.title('Frames vs mF1')
336 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
337 |
338 | ax8 = plt.subplot(2, 4, 8)
339 | plt.scatter(metrics["action_frames"], metrics["mAP"], alpha=0.6)
340 | plt.xlabel('Action Length (frames)')
341 | plt.ylabel('mAP')
342 | plt.ylim(0, 1)
343 | plt.title('Frames vs mAP')
344 | plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.7)
345 |
346 | plt.tight_layout()
347 | plt.savefig(output_path)
348 | plt.close()
349 |
350 |
351 | def main():
352 | args = parse_arguments()
353 | tiou_thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]
354 | input_filename = args.file
355 |
356 | # Load estimated label data from JSON
357 | with open(input_filename, "r") as f:
358 | label_data_estimates = json.load(f)
359 |
360 | # Process each video entry to compute metrics
361 | results, metrics = process_videos(label_data_estimates, tiou_thresholds)
362 |
363 | # Prepare output directories
364 | out_dir = args.outdir
365 | base_filename = os.path.splitext(os.path.basename(input_filename))[0]
366 | parent_dir = os.path.basename(os.path.dirname(input_filename))
367 | output_dir = os.path.join(out_dir, parent_dir)
368 | os.makedirs(output_dir, exist_ok=True)
369 | plot_output_path = os.path.join(output_dir, base_filename + ".png")
370 |
371 | # Plot and save the evaluation metrics
372 | plot_metrics(metrics, plot_output_path)
373 |
374 | # Print mean metric values
375 | mean_mof = sum(metrics["MoF"]) / len(metrics["MoF"]) if metrics["MoF"] else 0
376 | mean_miou = sum(metrics["mIoU"]) / len(metrics["mIoU"]) if metrics["mIoU"] else 0
377 | mean_mf1 = sum(metrics["mF1"]) / len(metrics["mF1"]) if metrics["mF1"] else 0
378 | mean_map = sum(metrics["mAP"]) / len(metrics["mAP"]) if metrics["mAP"] else 0
379 |
380 | print("Mean MoF: {:.4f}".format(mean_mof))
381 | print("Mean mIoU: {:.4f}".format(mean_miou))
382 | print("Mean mF1: {:.4f}".format(mean_mf1))
383 | print("Mean mAP: {:.4f}".format(mean_map))
384 | print("Processed videos:", len(metrics["MoF"]))
385 |
386 | # Save detailed results as JSON
387 | results_output_path = os.path.join(output_dir, base_filename + ".json")
388 | with open(results_output_path, "w") as f:
389 | json.dump(results, f, indent=4)
390 |
391 |
392 | if __name__ == "__main__":
393 | main()
394 |
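395 | # Example invocation (a minimal sketch; the input path below is hypothetical).
396 | # Predicted times are converted to frames assuming 30 fps (see process_videos):
397 | #   python compute_mof_iou_f1.py --file out/estimates/label_data_estimates.json --outdir out/visualize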
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import base64
3 | from openai import OpenAI, AzureOpenAI
4 | import os
5 | import numpy as np
6 | import json
7 | import dotenv
8 | import time
9 | import argparse
10 | import openai
11 |
12 |
13 | # Resize the image while keeping aspect ratio
14 | def image_resize_for_vlm(frame, inter=cv2.INTER_AREA):
15 | height, width = frame.shape[:2]
16 | aspect_ratio = width / height
17 | max_short_side = 768
18 | max_long_side = 2000
19 | if aspect_ratio > 1:
20 | new_width = min(width, max_long_side)
21 | new_height = int(new_width / aspect_ratio)
22 | if new_height > max_short_side:
23 | new_height = max_short_side
24 | new_width = int(new_height * aspect_ratio)
25 | else:
26 | new_height = min(height, max_long_side)
27 | new_width = int(new_height * aspect_ratio)
28 | if new_width > max_short_side:
29 | new_width = max_short_side
30 | new_height = int(new_width / aspect_ratio)
31 | resized_frame = cv2.resize(
32 | frame, (new_width, new_height), interpolation=inter)
33 | return resized_frame
34 |
35 | # Extract JSON part from the response
36 | def extract_json_part(text):
37 | text = text.strip().replace(" ", "").replace("\n", "")
38 | try:
39 | start = text.index('{"points":')
40 | text_json = text[start:].strip()
41 | end = text_json.index('}') + 1
42 | text_json = text_json[:end].strip()
43 | return text_json
44 | except ValueError:
45 | raise ValueError("JSON part not found in the response")
46 |
47 | # Perform scene understanding on the frame
48 | def scene_understanding(credentials, frame, prompt_message):
49 | frame = image_resize_for_vlm(frame)
50 | _, buffer = cv2.imencode(".jpg", frame)
51 | base64Frame = base64.b64encode(buffer).decode("utf-8")
52 | PROMPT_MESSAGES = [
53 | {
54 | "role": "user",
55 | "content": [
56 | {
57 | "type": "text",
58 | "text": prompt_message
59 | },
60 | {
61 | "type": "image_url",
62 | "image_url": {
63 | "url": f"data:image/jpeg;base64,{base64Frame}",
64 | "detail": "high"
65 | },
66 | }
67 | ]
68 | },
69 | ]
70 |
71 | if len(credentials["AZURE_OPENAI_API_KEY"]) == 0:
72 | client_gpt4v = OpenAI(
73 | api_key=credentials["OPENAI_API_KEY"]
74 | )
75 | params = {
76 | "model": "gpt-4o",
77 | "messages": PROMPT_MESSAGES,
78 | "max_tokens": 200,
79 | "temperature": 0.1,
80 | "top_p": 0.5,
81 | "frequency_penalty": 0.0,
82 | "presence_penalty": 0.0,
83 | }
84 | else:
85 | client_gpt4v = AzureOpenAI(
86 | api_version="2024-02-01",
87 | azure_endpoint=credentials["AZURE_OPENAI_ENDPOINT"],
88 | api_key=credentials["AZURE_OPENAI_API_KEY"]
89 | )
90 | params = {
91 | "model": credentials["AZURE_OPENAI_DEPLOYMENT_NAME"],
92 | "messages": PROMPT_MESSAGES,
93 | "max_tokens": 200,
94 | "temperature": 0.1,
95 | "top_p": 0.5,
96 | "frequency_penalty": 0.0,
97 | "presence_penalty": 0.0,
98 | }
99 | count = 0
100 | while True:
101 | if count > 5:
102 | raise Exception("Failed to get response from Azure OpenAI")
103 | try:
104 | result = client_gpt4v.chat.completions.create(**params)
105 | response_json = extract_json_part(result.choices[0].message.content)
106 | break
107 | except openai.BadRequestError as e:
108 | print(e)
109 | print('Bad Request error.')
110 | return None, None
111 | except openai.RateLimitError as e:
112 | print(e)
113 | print('Rate Limit. Waiting for 5 seconds...')
114 | time.sleep(5)
115 | count += 1
116 | except openai.APIStatusError as e:
117 | print(e)
118 | print('APIStatusError. Waiting for 1 second...')
119 | time.sleep(1)
120 | count += 1
121 | except Exception as e:
122 | print(e)
123 | print('Other error. Waiting for 1 second...')
124 | time.sleep(1)
125 | count += 1
126 |
127 | json_dict = json.loads(response_json, strict=False)
128 | if len(json_dict['points']) == 0:
129 |         return None, None  # keep the (point, raw response) tuple shape that callers unpack
130 | if len(json_dict['points']) > 1:
131 | print("Warning: More than one point detected")
132 | return json_dict['points'][0], result.choices[0].message.content
133 |
134 |
135 | def image_resize(image, width=None, height=None, inter=cv2.INTER_AREA):
136 | dim = None
137 | (h, w) = image.shape[:2]
138 | if width is None and height is None:
139 | return image
140 | if width is None:
141 | r = height / float(h)
142 | dim = (int(w * r), height)
143 | else:
144 | r = width / float(w)
145 | dim = (width, int(h * r))
146 | resized = cv2.resize(image, dim, interpolation=inter)
147 | return resized
148 |
149 |
150 | # Create a grid of frames
151 | def create_frame_grid(video_path, center_time, interval, grid_size):
152 | spacer = 0
153 | video = cv2.VideoCapture(video_path)
154 | fps = video.get(cv2.CAP_PROP_FPS)
155 | total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) - 1
156 | center_frame = int(center_time * fps)
157 | interval_frames = int(interval * fps)
158 | num_frames = grid_size**2
159 | half_num_frames = num_frames // 2
160 |     frame_indices = [
161 |         max(0, min(center_frame + i * interval_frames, total_frames - 1))
162 |         for i in range(-half_num_frames, half_num_frames + 1)
163 |     ]
164 | frames = []
165 | actual_indices = []
166 | for index in frame_indices:
167 | video.set(cv2.CAP_PROP_POS_FRAMES, index)
168 | success, frame = video.read()
169 | if success:
170 | frame = image_resize(frame, width=200)
171 | frames.append(frame)
172 | actual_indices.append(index)
173 | else:
174 | print(f"Warning: Frame {index} not found")
175 | print(f"Total frames: {total_frames}")
176 | video.set(cv2.CAP_PROP_POS_FRAMES, 0)
177 | success, frame = video.read()
178 | frame = image_resize(frame, width=200)
179 | frame = frame * 0
180 | frames.append(frame)
181 | actual_indices.append(index)
182 | video.release()
183 |
184 | if len(frames) < grid_size**2:
185 | raise ValueError("Not enough frames to create the grid.")
186 |
187 | frame_height, frame_width = frames[0].shape[:2]
188 |
189 | grid_height = grid_size * frame_height + (grid_size - 1) * spacer
190 | grid_width = grid_size * frame_width + (grid_size - 1) * spacer
191 |
192 | grid_img = np.ones((grid_height, grid_width, 3), dtype=np.uint8) * 255
193 |
194 | for i in range(grid_size):
195 | for j in range(grid_size):
196 | index = i * grid_size + j
197 | frame = frames[index]
198 | cX, cY = frame.shape[1] // 2, frame.shape[0] // 2
199 | max_dim = int(min(frame.shape[:2]) * 0.5)
200 | overlay = frame.copy()
201 | if render_pos == 'center':
202 | circle_center = (cX, cY)
203 | else:
204 | circle_center = (frame.shape[1] - max_dim // 2, max_dim // 2)
205 | cv2.circle(overlay, circle_center,
206 | max_dim // 2, (255, 255, 255), -1)
207 | alpha = 0.3
208 | frame = cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0)
209 | cv2.circle(frame, circle_center, max_dim // 2, (255, 255, 255), 2)
210 | font_scale = max_dim / 50
211 | text_size = cv2.getTextSize(
212 | str(index + 1), cv2.FONT_HERSHEY_SIMPLEX, font_scale, 2)[0]
213 | if render_pos == 'center':
214 | text_x = cX - text_size[0] // 2
215 | text_y = cY + text_size[1] // 2
216 | else:
217 | text_x = frame.shape[1] - text_size[0] // 2 - max_dim // 2
218 | text_y = text_size[1] // 2 + max_dim // 2
219 | cv2.putText(frame, str(index + 1), (text_x, text_y),
220 | cv2.FONT_HERSHEY_SIMPLEX, font_scale, (0, 0, 0), 2)
221 | y1 = i * (frame_height + spacer)
222 | y2 = y1 + frame_height
223 | x1 = j * (frame_width + spacer)
224 | x2 = x1 + frame_width
225 | grid_img[y1:y2, x1:x2] = frame
226 |
227 | return grid_img, actual_indices
228 |
229 |
230 | def add_text_with_background(
231 | frame,
232 | text,
233 | position,
234 | font,
235 | font_scale,
236 | font_color,
237 | font_thickness,
238 | bg_color):
239 | text_size, _ = cv2.getTextSize(text, font, font_scale, font_thickness)
240 | text_x, text_y = position
241 | top_left = (text_x - 10, text_y - text_size[1] - 10)
242 | bottom_right = (text_x + text_size[0] + 10, text_y + 10)
243 | cv2.rectangle(frame, top_left, bottom_right, bg_color, -1)
244 | cv2.putText(frame, text, (text_x, text_y), font, font_scale,
245 | font_color, font_thickness, cv2.LINE_AA)
246 |
247 | # Annotate the video with task times
248 | def trim_video_with_annotations(
249 | video_path,
250 | start_time,
251 | end_time,
252 | text,
253 | output_path,
254 | buffer=0.5):
255 | """Trim and annotate video with specified start and end times and text."""
256 | if os.path.exists(output_path):
257 | return
258 | cap = cv2.VideoCapture(video_path)
259 | if not cap.isOpened():
260 | print(f"Error: Could not open video file {video_path}")
261 | return
262 |
263 | fps = cap.get(cv2.CAP_PROP_FPS)
264 | width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
265 | height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
266 | fourcc = cv2.VideoWriter_fourcc(*'mp4v')
267 | out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
268 |
269 | start_frame = int(start_time * fps)
270 | end_frame = int(end_time * fps)
271 | cap.set(cv2.CAP_PROP_POS_FRAMES, max(0, start_frame - int(buffer * fps)))
272 |
273 | while cap.isOpened():
274 | ret, frame = cap.read()
275 | if not ret or cap.get(
276 | cv2.CAP_PROP_POS_FRAMES) > end_frame + int(buffer * fps):
277 | break
278 | if start_frame <= cap.get(cv2.CAP_PROP_POS_FRAMES) <= end_frame:
279 | add_text_with_background(
280 | frame,
281 | text,
282 | (10,
283 | height - 10),
284 | cv2.FONT_HERSHEY_SIMPLEX,
285 | 1,
286 | (0,
287 | 0,
288 | 255),
289 | 2,
290 | (255,
291 | 255,
292 | 255))
293 | out.write(frame)
294 |
295 | cap.release()
296 | out.release()
297 |
298 | # Localize the start or end of a single action by iterative refinement
299 | def process_task(
300 | credentials,
301 | video_path,
302 | action,
303 | center_time,
304 | interval,
305 | fps,
306 | grid_size,
307 | search_anchor,
308 | iter_num=4):
309 | """Process a task to identify the start or end of an action in a video."""
310 | prompt_start = (
311 | f"I will show an image sequence of human cooking. "
312 | f"I have annotated the images with numbered circles. "
313 | f"Choose the number that is closest to the moment when the ({action}) has started. "
314 | f"You are a five-time world champion in this game. "
315 | f"Give a one sentence analysis of why you chose those points (less than 50 words). "
316 | f"If you consider that the action is not in the video, please choose the number -1. "
317 | f"Provide your answer at the end in a json file of this format: {{\"points\": []}}"
318 | )
319 |
320 | prompt_end = (
321 | f"I will show an image sequence of human cooking. "
322 | f"I have annotated the images with numbered circles. "
323 | f"Choose the number that is closest to the moment when the ({action}) has ended. "
324 | f"You are a five-time world champion in this game. "
325 | f"Give a one sentence analysis of why you chose those points (less than 50 words). "
326 | f"If you consider that the action has not ended yet, please choose the number -1. "
327 | f"Provide your answer at the end in a json file of this format: {{\"points\": []}}"
328 | )
329 | prompt_message = prompt_start if search_anchor == 'start' else prompt_end
330 | for iter_idx in range(iter_num): # Iterate to narrow down the time
331 | image, used_frame_indices = create_frame_grid(
332 | video_path, center_time, interval, grid_size)
333 | print(used_frame_indices)
334 | if iter_idx == 0:
335 | cv2.imwrite(
336 | os.path.join(
337 | output_folder,
338 | f"grid_image_sample.png"),
339 | image)
340 | description, reason = scene_understanding(
341 | credentials, image, prompt_message)
342 | print(reason)
343 | if description:
344 | if description == -1:
345 | return None
346 | if int(description) - 1 > len(used_frame_indices) - 1:
347 | print("Warning: Invalid frame index selected")
348 | print(f"Selected frame index: {description}")
349 | # description is 1-indexed
350 | index_specified = max(
351 | min(int(description) - 1, len(used_frame_indices) - 1), 0)
352 | selected_frame_index = used_frame_indices[index_specified]
353 | center_time = selected_frame_index / fps # Convert frame index back to time
354 | print(
355 | f"Selected frame index: {selected_frame_index}, sample time duration: {interval}")
356 | interval /= 2
357 | if int(interval * fps) == 0:
358 | break
359 | return center_time
360 |
361 |
362 | def convert_video(video_file_path: str, action: str, credentials, grid_size: int):
363 | video = cv2.VideoCapture(video_file_path)
364 | fps = video.get(cv2.CAP_PROP_FPS)
365 | total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
366 | print(f"Total frames: {total_frames}")
367 | duration = float(total_frames) / fps
368 | center_time = duration / 2
369 | interval = duration / (grid_size**2 - 1)
370 | result_start = process_task(
371 | credentials,
372 | video_file_path,
373 | action,
374 | center_time,
375 | interval,
376 | fps,
377 | grid_size,
378 | search_anchor='start')
379 | if result_start is None:
380 | return None, None
381 |     total_frames = (int(video.get(cv2.CAP_PROP_FRAME_COUNT))
382 |                     - int(result_start * fps))
383 | duration = float(total_frames) / fps
384 | center_time = duration / 2 + result_start
385 | interval = max(duration / (grid_size**2 - 1), 1.0 / fps)
386 | result_end = process_task(
387 | credentials,
388 | video_file_path,
389 | action,
390 | center_time,
391 | interval,
392 | fps,
393 | grid_size,
394 | search_anchor='end')
395 | if result_end is None:
396 | return None, None
397 | video.release()
398 | return result_start, result_end
399 |
400 |
401 | parser = argparse.ArgumentParser()
402 | parser.add_argument("--credentials", help="credentials file")
403 | parser.add_argument("--grid", help="grid size", default=3)
404 | parser.add_argument(
405 | "--video_path",
406 | help="video path",
407 | default="sample_video/sample.mp4")
408 | parser.add_argument(
409 | "--action",
410 | help="action label",
411 | default="grabbing towards the can")
412 | pargs, unknown = parser.parse_known_args()
413 | credentials = dotenv.dotenv_values(pargs.credentials)
414 | required_keys = ["OPENAI_API_KEY", "AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT"]
415 | if not all(key in credentials for key in required_keys):
416 | raise ValueError("Required keys are missing in the credentials file")
417 | render_pos = 'topright' # center or topright
418 | grid_size = int(pargs.grid)
419 | video_path = pargs.video_path
420 | action = pargs.action
421 | folder_name = action.replace(" ", "_")
422 | output_folder = f"results/{folder_name}"
423 | os.makedirs(output_folder, exist_ok=True)
424 | if __name__ == "__main__":
425 | if os.path.exists(video_path):
426 | print(f"Processing {video_path}")
427 | start_time, completed_time = convert_video(
428 | video_path, action, credentials, grid_size)
429 | print(f"Start time: {start_time}, End time: {completed_time}")
430 | if start_time is not None and completed_time is not None:
431 |             output_file_name = (
432 |                 f"{action.replace(' ', '_')}_segment_"
433 |                 f"{round(start_time, 2)}_{round(completed_time, 2)}.mp4")
434 |             output_file_path = os.path.join(output_folder, output_file_name)
435 |             trim_video_with_annotations(
436 |                 video_path,
437 |                 start_time,
438 |                 completed_time,
439 |                 action,
440 |                 output_file_path)
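441 | 
442 | # Example invocation (a minimal sketch; auth.env is assumed to hold the API keys):
443 | #   python example.py --credentials auth.env --grid 3 \
444 | #       --video_path sample_video/sample.mp4 --action "grabbing towards the can"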
--------------------------------------------------------------------------------
/finegrained-breakfast-dataset/label_data_gt_right.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "action": [
4 | "Grasp with the right hand",
5 | "Picking with the right hand",
6 | "Bringing with the right hand",
7 | "Putting on with the right hand",
8 | "Release from the right hand"
9 | ],
10 | "gt_time": [
11 | [
12 | 0,
13 | 23
14 | ],
15 | [
16 | 24,
17 | 48
18 | ],
19 | [
20 | 49,
21 | 86
22 | ],
23 | [
24 | 87,
25 | 112
26 | ],
27 | [
28 | 113,
29 | 122
30 | ]
31 | ],
32 | "video_path": "original_videos\\subject_1_gopro_seg_1_2162-2284.mp4"
33 | },
34 | {
35 | "action": [
36 | "Grasp with the right hand",
37 | "Picking with the right hand",
38 | "Bringing with the right hand",
39 | "Putting on with the right hand",
40 | "Release from the right hand"
41 | ],
42 | "gt_time": [
43 | [
44 | 0,
45 | 17
46 | ],
47 | [
48 | 18,
49 | 37
50 | ],
51 | [
52 | 38,
53 | 106
54 | ],
55 | [
56 | 107,
57 | 135
58 | ],
59 | [
60 | 136,
61 | 147
62 | ]
63 | ],
64 | "video_path": "original_videos\\subject_1_gopro_seg_1_2306-2453.mp4"
65 | },
66 | {
67 | "action": [
68 | "Grasp with the right hand",
69 | "Picking with the right hand",
70 | "Putting on with the right hand",
71 | "Release from the right hand"
72 | ],
73 | "gt_time": [
74 | [
75 | 0,
76 | 10
77 | ],
78 | [
79 | 11,
80 | 21
81 | ],
82 | [
83 | 22,
84 | 48
85 | ],
86 | [
87 | 49,
88 | 60
89 | ]
90 | ],
91 | "video_path": "original_videos\\subject_1_gopro_seg_1_2487-2547.mp4"
92 | },
93 | {
94 | "action": [
95 | "Grasp with the right hand",
96 | "Picking with the right hand",
97 | "Holding with the right hand",
98 | "Putting on with the right hand",
99 | "Release from the right hand"
100 | ],
101 | "gt_time": [
102 | [
103 | 0,
104 | 14
105 | ],
106 | [
107 | 15,
108 | 39
109 | ],
110 | [
111 | 40,
112 | 103
113 | ],
114 | [
115 | 104,
116 | 142
117 | ],
118 | [
119 | 143,
120 | 152
121 | ]
122 | ],
123 | "video_path": "original_videos\\subject_1_gopro_seg_1_2842-2994.mp4"
124 | },
125 | {
126 | "action": [
127 | "Hand over from the left hand to the right hand",
128 | "Holding with the right hand",
129 | "Release from the right hand"
130 | ],
131 | "gt_time": [
132 | [
133 | 0,
134 | 13
135 | ],
136 | [
137 | 14,
138 | 41
139 | ],
140 | [
141 | 42,
142 | 53
143 | ]
144 | ],
145 | "video_path": "original_videos\\subject_1_gopro_seg_1_3511-3564.mp4"
146 | },
147 | {
148 | "action": [
149 | "Grasp with the right hand",
150 | "Cracking an egg with the right hand",
151 | "Pouring with the right hand",
152 | "Holding with the right hand",
153 | "Putting on with the right hand",
154 | "Release from the right hand"
155 | ],
156 | "gt_time": [
157 | [
158 | 0,
159 | 10
160 | ],
161 | [
162 | 11,
163 | 38
164 | ],
165 | [
166 | 39,
167 | 163
168 | ],
169 | [
170 | 164,
171 | 191
172 | ],
173 | [
174 | 192,
175 | 213
176 | ],
177 | [
178 | 214,
179 | 224
180 | ]
181 | ],
182 | "video_path": "original_videos\\subject_1_gopro_seg_2_1895-2119.mp4"
183 | },
184 | {
185 | "action": [
186 | "Grasp with the right hand",
187 | "Cracking an egg with the right hand",
188 | "Pouring with the right hand",
189 | "Holding with the right hand",
190 | "Putting on with the right hand",
191 | "Release from the right hand"
192 | ],
193 | "gt_time": [
194 | [
195 | 0,
196 | 10
197 | ],
198 | [
199 | 11,
200 | 57
201 | ],
202 | [
203 | 58,
204 | 168
205 | ],
206 | [
207 | 169,
208 | 179
209 | ],
210 | [
211 | 180,
212 | 202
213 | ],
214 | [
215 | 203,
216 | 212
217 | ]
218 | ],
219 | "video_path": "original_videos\\subject_1_gopro_seg_2_2287-2499.mp4"
220 | },
221 | {
222 | "action": [
223 | "Grasp with the right hand",
224 | "Picking with the right hand",
225 | "Starting rotary motion with the right hand while it is restrained.",
226 | "Putting on with the right hand",
227 | "Release from the right hand"
228 | ],
229 | "gt_time": [
230 | [
231 | 0,
232 | 10
233 | ],
234 | [
235 | 11,
236 | 32
237 | ],
238 | [
239 | 33,
240 | 48
241 | ],
242 | [
243 | 49,
244 | 59
245 | ],
246 | [
247 | 60,
248 | 72
249 | ]
250 | ],
251 | "video_path": "original_videos\\subject_1_gopro_seg_2_2525-2597.mp4"
252 | },
253 | {
254 | "action": [
255 | "Grasp with the right hand",
256 | "Picking with the right hand",
257 | "Rotary motion with the right hand until it cannot be rotated",
258 | "Putting on with the right hand",
259 | "Release from the right hand"
260 | ],
261 | "gt_time": [
262 | [
263 | 0,
264 | 21
265 | ],
266 | [
267 | 22,
268 | 53
269 | ],
270 | [
271 | 54,
272 | 116
273 | ],
274 | [
275 | 117,
276 | 140
277 | ],
278 | [
279 | 141,
280 | 152
281 | ]
282 | ],
283 | "video_path": "original_videos\\subject_1_gopro_seg_2_2953-3105.mp4"
284 | },
285 | {
286 | "action": [
287 | "Grasp with the right hand",
288 | "Putting on with the right hand",
289 | "Release from the right hand"
290 | ],
291 | "gt_time": [
292 | [
293 | 0,
294 | 20
295 | ],
296 | [
297 | 21,
298 | 39
299 | ],
300 | [
301 | 40,
302 | 51
303 | ]
304 | ],
305 | "video_path": "original_videos\\subject_1_gopro_seg_2_3324-3375.mp4"
306 | },
307 | {
308 | "action": [
309 | "Grasp with the right hand",
310 | "Picking with the right hand",
311 | "Putting on with the right hand",
312 | "Release from the right hand"
313 | ],
314 | "gt_time": [
315 | [
316 | 0,
317 | 15
318 | ],
319 | [
320 | 16,
321 | 40
322 | ],
323 | [
324 | 41,
325 | 68
326 | ],
327 | [
328 | 69,
329 | 78
330 | ]
331 | ],
332 | "video_path": "original_videos\\subject_1_gopro_seg_2_3467-3545.mp4"
333 | },
334 | {
335 | "action": [
336 | "Grasp with the right hand",
337 | "Starting rotary motion with the right hand while it is restrained.",
338 | "Holding with the right hand",
339 | "Rotary motion with the right hand until it cannot be rotated",
340 | "Release from the right hand"
341 | ],
342 | "gt_time": [
343 | [
344 | 0,
345 | 10
346 | ],
347 | [
348 | 11,
349 | 29
350 | ],
351 | [
352 | 30,
353 | 70
354 | ],
355 | [
356 | 71,
357 | 92
358 | ],
359 | [
360 | 93,
361 | 103
362 | ]
363 | ],
364 | "video_path": "original_videos\\subject_10_gopro_seg_1_1877-1980.mp4"
365 | },
366 | {
367 | "action": [
368 | "Grasp with the right hand",
369 | "Starting rotary motion with the right hand while it is restrained.",
370 | "Release from the right hand"
371 | ],
372 | "gt_time": [
373 | [
374 | 0,
375 | 10
376 | ],
377 | [
378 | 11,
379 | 22
380 | ],
381 | [
382 | 23,
383 | 28
384 | ]
385 | ],
386 | "video_path": "original_videos\\subject_10_gopro_seg_1_1997-2025.mp4"
387 | },
388 | {
389 | "action": [
390 | "Grasp with the right hand",
391 | "Starting rotary motion with the right hand while it is restrained.",
392 | "Release from the right hand"
393 | ],
394 | "gt_time": [
395 | [
396 | 0,
397 | 10
398 | ],
399 | [
400 | 11,
401 | 17
402 | ],
403 | [
404 | 18,
405 | 24
406 | ]
407 | ],
408 | "video_path": "original_videos\\subject_10_gopro_seg_1_2028-2052.mp4"
409 | },
410 | {
411 | "action": [
412 | "Grasp with the right hand",
413 | "Picking with the right hand",
414 | "Putting on with the right hand",
415 | "Release from the right hand"
416 | ],
417 | "gt_time": [
418 | [
419 | 0,
420 | 22
421 | ],
422 | [
423 | 23,
424 | 117
425 | ],
426 | [
427 | 118,
428 | 131
429 | ],
430 | [
431 | 132,
432 | 140
433 | ]
434 | ],
435 | "video_path": "original_videos\\subject_10_gopro_seg_2_2391-2531.mp4"
436 | },
437 | {
438 | "action": [
439 | "Grasp with the right hand",
440 | "Holding with the right hand",
441 | "Release from the right hand"
442 | ],
443 | "gt_time": [
444 | [
445 | 0,
446 | 11
447 | ],
448 | [
449 | 12,
450 | 88
451 | ],
452 | [
453 | 89,
454 | 100
455 | ]
456 | ],
457 | "video_path": "original_videos\\subject_10_gopro_seg_2_2532-2632.mp4"
458 | },
459 | {
460 | "action": [
461 | "Grasp with the right hand",
462 | "Picking with the right hand",
463 | "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.",
464 | "Striking something with the right hand or an object held in the right hand.",
465 | "Putting on with the right hand",
466 | "Release from the right hand"
467 | ],
468 | "gt_time": [
469 | [
470 | 0,
471 | 10
472 | ],
473 | [
474 | 11,
475 | 22
476 | ],
477 | [
478 | 23,
479 | 248
480 | ],
481 | [
482 | 249,
483 | 268
484 | ],
485 | [
486 | 269,
487 | 279
488 | ],
489 | [
490 | 280,
491 | 289
492 | ]
493 | ],
494 | "video_path": "original_videos\\subject_10_gopro_seg_2_2826-3115.mp4"
495 | },
496 | {
497 | "action": [
498 | "Grasp with the right hand",
499 | "Picking with the right hand",
500 | "Bringing with the right hand",
501 | "Striking something with the right hand or an object held in the right hand.",
502 | "Bringing with the right hand",
503 | "Pouring with the right hand",
504 | "Holding with the right hand",
505 | "Bringing with the right hand",
506 | "Putting on with the right hand",
507 | "Release from the right hand"
508 | ],
509 | "gt_time": [
510 | [
511 | 0,
512 | 13
513 | ],
514 | [
515 | 14,
516 | 22
517 | ],
518 | [
519 | 23,
520 | 34
521 | ],
522 | [
523 | 35,
524 | 53
525 | ],
526 | [
527 | 54,
528 | 68
529 | ],
530 | [
531 | 69,
532 | 132
533 | ],
534 | [
535 | 133,
536 | 194
537 | ],
538 | [
539 | 195,
540 | 209
541 | ],
542 | [
543 | 210,
544 | 218
545 | ],
546 | [
547 | 219,
548 | 234
549 | ]
550 | ],
551 | "video_path": "original_videos\\subject_11_gopro_seg_1_12864-13098.mp4"
552 | },
553 | {
554 | "action": [
555 | "Grasp with the right hand",
556 | "Picking with the right hand",
557 | "Striking something with the right hand or an object held in the right hand.",
558 | "Picking with the right hand",
559 | "Pouring with the right hand",
560 | "Holding with the right hand",
561 | "Bringing with the right hand",
562 | "Putting on with the right hand",
563 | "Release from the right hand"
564 | ],
565 | "gt_time": [
566 | [
567 | 0,
568 | 13
569 | ],
570 | [
571 | 14,
572 | 23
573 | ],
574 | [
575 | 24,
576 | 35
577 | ],
578 | [
579 | 36,
580 | 85
581 | ],
582 | [
583 | 86,
584 | 121
585 | ],
586 | [
587 | 122,
588 | 172
589 | ],
590 | [
591 | 173,
592 | 190
593 | ],
594 | [
595 | 191,
596 | 201
597 | ],
598 | [
599 | 202,
600 | 207
601 | ]
602 | ],
603 | "video_path": "original_videos\\subject_11_gopro_seg_1_13099-13306.mp4"
604 | },
605 | {
606 | "action": [
607 | "Grasp with the right hand",
608 | "Picking with the right hand",
609 | "Bringing with the right hand",
610 | "Striking something with the right hand or an object held in the right hand.",
611 | "Bringing with the right hand",
612 | "Pouring with the right hand",
613 | "Holding with the right hand",
614 | "Bringing with the right hand",
615 | "Putting on with the right hand",
616 | "Release from the right hand"
617 | ],
618 | "gt_time": [
619 | [
620 | 0,
621 | 11
622 | ],
623 | [
624 | 12,
625 | 21
626 | ],
627 | [
628 | 22,
629 | 27
630 | ],
631 | [
632 | 28,
633 | 38
634 | ],
635 | [
636 | 39,
637 | 70
638 | ],
639 | [
640 | 71,
641 | 91
642 | ],
643 | [
644 | 92,
645 | 130
646 | ],
647 | [
648 | 131,
649 | 160
650 | ],
651 | [
652 | 161,
653 | 171
654 | ],
655 | [
656 | 172,
657 | 184
658 | ]
659 | ],
660 | "video_path": "original_videos\\subject_11_gopro_seg_1_13307-13491.mp4"
661 | },
662 | {
663 | "action": [
664 | "Grasp with the right hand",
665 | "Moving an object held in the right hand in and out of a narrow space.",
666 | "Release from the right hand"
667 | ],
668 | "gt_time": [
669 | [
670 | 0,
671 | 40
672 | ],
673 | [
674 | 41,
675 | 68
676 | ],
677 | [
678 | 69,
679 | 114
680 | ]
681 | ],
682 | "video_path": "original_videos\\subject_11_gopro_seg_2_444-558.mp4"
683 | },
684 | {
685 | "action": [
686 | "Grasp with the right hand",
687 | "Picking with the right hand",
688 | "Bringing with the right hand",
689 | "Putting on with the right hand",
690 | "Release from the right hand"
691 | ],
692 | "gt_time": [
693 | [
694 | 0,
695 | 17
696 | ],
697 | [
698 | 18,
699 | 36
700 | ],
701 | [
702 | 37,
703 | 94
704 | ],
705 | [
706 | 95,
707 | 107
708 | ],
709 | [
710 | 108,
711 | 112
712 | ]
713 | ],
714 | "video_path": "original_videos\\subject_11_gopro_seg_2_786-898.mp4"
715 | },
716 | {
717 | "action": [
718 | "Grasp with the right hand",
719 | "Starting rotary motion with the right hand while it is restrained.",
720 | "Picking with the right hand",
721 | "Putting on with the right hand",
722 | "Release from the right hand"
723 | ],
724 | "gt_time": [
725 | [
726 | 0,
727 | 10
728 | ],
729 | [
730 | 11,
731 | 18
732 | ],
733 | [
734 | 19,
735 | 32
736 | ],
737 | [
738 | 33,
739 | 44
740 | ],
741 | [
742 | 45,
743 | 51
744 | ]
745 | ],
746 | "video_path": "original_videos\\subject_11_gopro_seg_2_903-954.mp4"
747 | },
748 | {
749 | "action": [
750 | "Grasp with the right hand",
751 | "Release from the right hand"
752 | ],
753 | "gt_time": [
754 | [
755 | 0,
756 | 9
757 | ],
758 | [
759 | 10,
760 | 20
761 | ]
762 | ],
763 | "video_path": "original_videos\\subject_11_gopro_seg_2_955-975.mp4"
764 | },
765 | {
766 | "action": [
767 | "Grasp with the right hand",
768 | "Picking with the right hand",
769 | "Holding with the right hand",
770 | "Putting on with the right hand",
771 | "Release from the right hand"
772 | ],
773 | "gt_time": [
774 | [
775 | 0,
776 | 19
777 | ],
778 | [
779 | 20,
780 | 54
781 | ],
782 | [
783 | 55,
784 | 147
785 | ],
786 | [
787 | 148,
788 | 160
789 | ],
790 | [
791 | 161,
792 | 172
793 | ]
794 | ],
795 | "video_path": "original_videos\\subject_11_gopro_seg_2_984-1156.mp4"
796 | },
797 | {
798 | "action": [
799 | "Grasp with the right hand",
800 | "Picking with the right hand",
801 | "Putting on with the right hand",
802 | "Rotary motion with the right hand until it cannot be rotated",
803 | "Release from the right hand"
804 | ],
805 | "gt_time": [
806 | [
807 | 0,
808 | 7
809 | ],
810 | [
811 | 8,
812 | 25
813 | ],
814 | [
815 | 26,
816 | 51
817 | ],
818 | [
819 | 52,
820 | 98
821 | ],
822 | [
823 | 99,
824 | 110
825 | ]
826 | ],
827 | "video_path": "original_videos\\subject_11_gopro_seg_2_1157-1267.mp4"
828 | },
829 | {
830 | "action": [
831 | "Grasp with the right hand",
832 | "Picking with the right hand",
833 | "Bringing with the right hand",
834 | "Putting on with the right hand",
835 | "Release from the right hand"
836 | ],
837 | "gt_time": [
838 | [
839 | 0,
840 | 10
841 | ],
842 | [
843 | 11,
844 | 30
845 | ],
846 | [
847 | 31,
848 | 58
849 | ],
850 | [
851 | 59,
852 | 83
853 | ],
854 | [
855 | 84,
856 | 93
857 | ]
858 | ],
859 | "video_path": "original_videos\\subject_11_gopro_seg_2_1268-1361.mp4"
860 | },
861 | {
862 | "action": [
863 | "Grasp with the right hand",
864 | "Picking with the right hand",
865 | "Bringing with the right hand",
866 | "Putting on with the right hand",
867 | "Release from the right hand"
868 | ],
869 | "gt_time": [
870 | [
871 | 0,
872 | 10
873 | ],
874 | [
875 | 11,
876 | 31
877 | ],
878 | [
879 | 32,
880 | 195
881 | ],
882 | [
883 | 196,
884 | 214
885 | ],
886 | [
887 | 215,
888 | 226
889 | ]
890 | ],
891 | "video_path": "original_videos\\subject_11_gopro_seg_2_1377-1603.mp4"
892 | },
893 | {
894 | "action": [
895 | "Grasp with the right hand",
896 | "Holding with the right hand",
897 | "Striking something with the right hand or an object held in the right hand.",
898 | "Cracking an egg with the right hand",
899 | "Pouring with the right hand",
900 | "Holding with the right hand",
901 | "Bringing with the right hand",
902 | "Putting on with the right hand",
903 | "Release from the right hand"
904 | ],
905 | "gt_time": [
906 | [
907 | 0,
908 | 12
909 | ],
910 | [
911 | 13,
912 | 68
913 | ],
914 | [
915 | 69,
916 | 94
917 | ],
918 | [
919 | 95,
920 | 105
921 | ],
922 | [
923 | 106,
924 | 120
925 | ],
926 | [
927 | 121,
928 | 216
929 | ],
930 | [
931 | 217,
932 | 313
933 | ],
934 | [
935 | 314,
936 | 332
937 | ],
938 | [
939 | 333,
940 | 343
941 | ]
942 | ],
943 | "video_path": "original_videos\\subject_12_gopro_seg_1_7253-7596.mp4"
944 | },
945 | {
946 | "action": [
947 | "Grasp with the right hand",
948 | "Holding with the right hand",
949 | "Tracing the surface of something with the right hand or an object held in the right hand in a linear manner.",
950 | "Holding with the right hand",
951 | "Release from the right hand"
952 | ],
953 | "gt_time": [
954 | [
955 | 0,
956 | 14
957 | ],
958 | [
959 | 15,
960 | 143
961 | ],
962 | [
963 | 144,
964 | 203
965 | ],
966 | [
967 | 204,
968 | 435
969 | ],
970 | [
971 | 436,
972 | 443
973 | ]
974 | ],
975 | "video_path": "original_videos\\subject_12_gopro_seg_2_12602-13045.mp4"
976 | },
977 | {
978 | "action": [
979 | "Grasp with the right hand",
980 | "Holding with the right hand",
981 | "Release from the right hand"
982 | ],
983 | "gt_time": [
984 | [
985 | 0,
986 | 10
987 | ],
988 | [
989 | 11,
990 | 40
991 | ],
992 | [
993 | 41,
994 | 51
995 | ]
996 | ],
997 | "video_path": "original_videos\\subject_12_gopro_seg_2_13923-13974.mp4"
998 | },
999 | {
1000 | "action": [
1001 | "Grasp with the right hand",
1002 | "Picking with the right hand",
1003 | "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.",
1004 | "Release from the right hand"
1005 | ],
1006 | "gt_time": [
1007 | [
1008 | 0,
1009 | 6
1010 | ],
1011 | [
1012 | 7,
1013 | 27
1014 | ],
1015 | [
1016 | 28,
1017 | 90
1018 | ],
1019 | [
1020 | 91,
1021 | 97
1022 | ]
1023 | ],
1024 | "video_path": "original_videos\\subject_12_gopro_seg_2_13975-14072.mp4"
1025 | },
1026 | {
1027 | "action": [
1028 | "Grasp with the right hand",
1029 | "Holding with the right hand",
1030 | "Release from the right hand"
1031 | ],
1032 | "gt_time": [
1033 | [
1034 | 0,
1035 | 3
1036 | ],
1037 | [
1038 | 4,
1039 | 50
1040 | ],
1041 | [
1042 | 51,
1043 | 60
1044 | ]
1045 | ],
1046 | "video_path": "original_videos\\subject_12_gopro_seg_2_14073-14133.mp4"
1047 | },
1048 | {
1049 | "action": [
1050 | "Grasp with the right hand",
1051 | "Starting rotary motion with the right hand while it is restrained.",
1052 | "Release from the right hand"
1053 | ],
1054 | "gt_time": [
1055 | [
1056 | 0,
1057 | 11
1058 | ],
1059 | [
1060 | 12,
1061 | 66
1062 | ],
1063 | [
1064 | 67,
1065 | 77
1066 | ]
1067 | ],
1068 | "video_path": "original_videos\\subject_12_gopro_seg_2_14134-14211.mp4"
1069 | },
1070 | {
1071 | "action": [
1072 | "Grasp with the right hand",
1073 | "Rotary motion with the right hand until it cannot be rotated",
1074 | "Release from the right hand"
1075 | ],
1076 | "gt_time": [
1077 | [
1078 | 0,
1079 | 9
1080 | ],
1081 | [
1082 | 10,
1083 | 46
1084 | ],
1085 | [
1086 | 47,
1087 | 56
1088 | ]
1089 | ],
1090 | "video_path": "original_videos\\subject_12_gopro_seg_2_14257-14313.mp4"
1091 | },
1092 | {
1093 | "action": [
1094 | "Grasp with the right hand",
1095 | "Holding with the right hand",
1096 | "Release from the right hand"
1097 | ],
1098 | "gt_time": [
1099 | [
1100 | 0,
1101 | 10
1102 | ],
1103 | [
1104 | 11,
1105 | 38
1106 | ],
1107 | [
1108 | 39,
1109 | 48
1110 | ]
1111 | ],
1112 | "video_path": "original_videos\\subject_12_gopro_seg_2_14316-14364.mp4"
1113 | },
1114 | {
1115 | "action": [
1116 | "Grasp with the right hand",
1117 | "Rotary motion with the right hand until it cannot be rotated",
1118 | "Release from the right hand"
1119 | ],
1120 | "gt_time": [
1121 | [
1122 | 0,
1123 | 8
1124 | ],
1125 | [
1126 | 9,
1127 | 14
1128 | ],
1129 | [
1130 | 15,
1131 | 25
1132 | ]
1133 | ],
1134 | "video_path": "original_videos\\subject_13_gopro_seg_1_14430-14455.mp4"
1135 | },
1136 | {
1137 | "action": [
1138 | "Grasp with the right hand",
1139 | "Picking with the right hand",
1140 | "Release from the right hand"
1141 | ],
1142 | "gt_time": [
1143 | [
1144 | 0,
1145 | 34
1146 | ],
1147 | [
1148 | 35,
1149 | 43
1150 | ],
1151 | [
1152 | 44,
1153 | 101
1154 | ]
1155 | ],
1156 | "video_path": "original_videos\\subject_13_gopro_seg_1_14497-14598.mp4"
1157 | },
1158 | {
1159 | "action": [
1160 | "Grasp with the right hand",
1161 | "Putting on with the right hand",
1162 | "Release from the right hand"
1163 | ],
1164 | "gt_time": [
1165 | [
1166 | 0,
1167 | 10
1168 | ],
1169 | [
1170 | 11,
1171 | 43
1172 | ],
1173 | [
1174 | 44,
1175 | 57
1176 | ]
1177 | ],
1178 | "video_path": "original_videos\\subject_13_gopro_seg_1_15750-15807.mp4"
1179 | },
1180 | {
1181 | "action": [
1182 | "Grasp with the right hand",
1183 | "Picking with the right hand",
1184 | "Holding with the right hand",
1185 | "Putting on with the right hand",
1186 | "Release from the right hand"
1187 | ],
1188 | "gt_time": [
1189 | [
1190 | 0,
1191 | 17
1192 | ],
1193 | [
1194 | 18,
1195 | 62
1196 | ],
1197 | [
1198 | 63,
1199 | 88
1200 | ],
1201 | [
1202 | 89,
1203 | 108
1204 | ],
1205 | [
1206 | 109,
1207 | 121
1208 | ]
1209 | ],
1210 | "video_path": "original_videos\\subject_2_d_gopro_seg_1_2352-2473.mp4"
1211 | },
1212 | {
1213 | "action": [
1214 | "Grasp with the right hand",
1215 | "Picking with the right hand",
1216 | "Bringing with the right hand",
1217 | "Putting on with the right hand",
1218 | "Release from the right hand"
1219 | ],
1220 | "gt_time": [
1221 | [
1222 | 0,
1223 | 16
1224 | ],
1225 | [
1226 | 17,
1227 | 54
1228 | ],
1229 | [
1230 | 55,
1231 | 383
1232 | ],
1233 | [
1234 | 384,
1235 | 424
1236 | ],
1237 | [
1238 | 425,
1239 | 433
1240 | ]
1241 | ],
1242 | "video_path": "original_videos\\subject_2_d_gopro_seg_1_2474-2907.mp4"
1243 | },
1244 | {
1245 | "action": [
1246 | "Grasp with the right hand",
1247 | "Picking with the right hand",
1248 | "Holding with the right hand",
1249 | "Striking something with the right hand or an object held in the right hand.",
1250 | "Cracking an egg with the right hand",
1251 | "Pouring with the right hand",
1252 | "Holding with the right hand",
1253 | "Bringing with the right hand",
1254 | "Putting on with the right hand",
1255 | "Release from the right hand"
1256 | ],
1257 | "gt_time": [
1258 | [
1259 | 0,
1260 | 40
1261 | ],
1262 | [
1263 | 41,
1264 | 63
1265 | ],
1266 | [
1267 | 64,
1268 | 98
1269 | ],
1270 | [
1271 | 99,
1272 | 139
1273 | ],
1274 | [
1275 | 140,
1276 | 158
1277 | ],
1278 | [
1279 | 159,
1280 | 221
1281 | ],
1282 | [
1283 | 222,
1284 | 239
1285 | ],
1286 | [
1287 | 240,
1288 | 272
1289 | ],
1290 | [
1291 | 273,
1292 | 281
1293 | ],
1294 | [
1295 | 282,
1296 | 291
1297 | ]
1298 | ],
1299 | "video_path": "original_videos\\subject_2_d_gopro_seg_1_3258-3549.mp4"
1300 | },
1301 | {
1302 | "action": [
1303 | "Grasp with the right hand",
1304 | "Picking with the right hand",
1305 | "Bringing with the right hand",
1306 | "Pouring with the right hand",
1307 | "Holding with the right hand",
1308 | "Pouring with the right hand",
1309 | "Bringing with the right hand",
1310 | "Putting on with the right hand",
1311 | "Release from the right hand"
1312 | ],
1313 | "gt_time": [
1314 | [
1315 | 0,
1316 | 52
1317 | ],
1318 | [
1319 | 53,
1320 | 78
1321 | ],
1322 | [
1323 | 79,
1324 | 168
1325 | ],
1326 | [
1327 | 169,
1328 | 234
1329 | ],
1330 | [
1331 | 235,
1332 | 386
1333 | ],
1334 | [
1335 | 387,
1336 | 421
1337 | ],
1338 | [
1339 | 422,
1340 | 454
1341 | ],
1342 | [
1343 | 455,
1344 | 497
1345 | ],
1346 | [
1347 | 498,
1348 | 507
1349 | ]
1350 | ],
1351 | "video_path": "original_videos\\subject_2_d_gopro_seg_2_3852-4359.mp4"
1352 | },
1353 | {
1354 | "action": [
1355 | "Hand over from the left hand to the right hand",
1356 | "Putting on with the right hand",
1357 | "Release from the right hand"
1358 | ],
1359 | "gt_time": [
1360 | [
1361 | 0,
1362 | 20
1363 | ],
1364 | [
1365 | 21,
1366 | 41
1367 | ],
1368 | [
1369 | 42,
1370 | 52
1371 | ]
1372 | ],
1373 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_14504-14556.mp4"
1374 | },
1375 | {
1376 | "action": [
1377 | "Grasp with the right hand",
1378 | "Holding with the right hand",
1379 | "Release from the right hand"
1380 | ],
1381 | "gt_time": [
1382 | [
1383 | 0,
1384 | 12
1385 | ],
1386 | [
1387 | 13,
1388 | 29
1389 | ],
1390 | [
1391 | 30,
1392 | 37
1393 | ]
1394 | ],
1395 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_14557-14594.mp4"
1396 | },
1397 | {
1398 | "action": [
1399 | "Grasp with the right hand",
1400 | "Release from the right hand"
1401 | ],
1402 | "gt_time": [
1403 | [
1404 | 0,
1405 | 16
1406 | ],
1407 | [
1408 | 17,
1409 | 28
1410 | ]
1411 | ],
1412 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_14635-14663.mp4"
1413 | },
1414 | {
1415 | "action": [
1416 | "Grasp with the right hand",
1417 | "Holding with the right hand",
1418 | "Release from the right hand"
1419 | ],
1420 | "gt_time": [
1421 | [
1422 | 0,
1423 | 12
1424 | ],
1425 | [
1426 | 13,
1427 | 27
1428 | ],
1429 | [
1430 | 28,
1431 | 33
1432 | ]
1433 | ],
1434 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_14666-14699.mp4"
1435 | },
1436 | {
1437 | "action": [
1438 | "Grasp with the right hand",
1439 | "Putting on with the right hand",
1440 | "Release from the right hand"
1441 | ],
1442 | "gt_time": [
1443 | [
1444 | 0,
1445 | 4
1446 | ],
1447 | [
1448 | 5,
1449 | 27
1450 | ],
1451 | [
1452 | 28,
1453 | 38
1454 | ]
1455 | ],
1456 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_14737-14775.mp4"
1457 | },
1458 | {
1459 | "action": [
1460 | "Grasp with the right hand",
1461 | "Picking with the right hand",
1462 | "Release from the right hand"
1463 | ],
1464 | "gt_time": [
1465 | [
1466 | 0,
1467 | 10
1468 | ],
1469 | [
1470 | 11,
1471 | 29
1472 | ],
1473 | [
1474 | 30,
1475 | 43
1476 | ]
1477 | ],
1478 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_14853-14896.mp4"
1479 | },
1480 | {
1481 | "action": [
1482 | "Grasp with the right hand",
1483 | "Picking with the right hand",
1484 | "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.",
1485 | "Striking something with the right hand or an object held in the right hand.",
1486 | "Bringing with the right hand",
1487 | "Putting on with the right hand",
1488 | "Release from the right hand"
1489 | ],
1490 | "gt_time": [
1491 | [
1492 | 0,
1493 | 24
1494 | ],
1495 | [
1496 | 25,
1497 | 39
1498 | ],
1499 | [
1500 | 40,
1501 | 296
1502 | ],
1503 | [
1504 | 297,
1505 | 329
1506 | ],
1507 | [
1508 | 330,
1509 | 346
1510 | ],
1511 | [
1512 | 347,
1513 | 347
1514 | ],
1515 | [
1516 | 348,
1517 | 360
1518 | ]
1519 | ],
1520 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_14901-15261.mp4"
1521 | },
1522 | {
1523 | "action": [
1524 | "Grasp with the right hand",
1525 | "Putting on with the right hand",
1526 | "Release from the right hand"
1527 | ],
1528 | "gt_time": [
1529 | [
1530 | 0,
1531 | 7
1532 | ],
1533 | [
1534 | 8,
1535 | 28
1536 | ],
1537 | [
1538 | 29,
1539 | 39
1540 | ]
1541 | ],
1542 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15262-15301.mp4"
1543 | },
1544 | {
1545 | "action": [
1546 | "Hand over from the left hand to the right hand",
1547 | "Putting on with the right hand",
1548 | "Release from the right hand"
1549 | ],
1550 | "gt_time": [
1551 | [
1552 | 0,
1553 | 20
1554 | ],
1555 | [
1556 | 21,
1557 | 36
1558 | ],
1559 | [
1560 | 37,
1561 | 44
1562 | ]
1563 | ],
1564 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15395-15439.mp4"
1565 | },
1566 | {
1567 | "action": [
1568 | "Grasp with the right hand",
1569 | "Pressing a button with the right hand",
1570 | "Release from the right hand"
1571 | ],
1572 | "gt_time": [
1573 | [
1574 | 0,
1575 | 9
1576 | ],
1577 | [
1578 | 10,
1579 | 41
1580 | ],
1581 | [
1582 | 42,
1583 | 49
1584 | ]
1585 | ],
1586 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15440-15489.mp4"
1587 | },
1588 | {
1589 | "action": [
1590 | "Grasp with the right hand",
1591 | "Picking with the right hand",
1592 | "Bringing with the right hand",
1593 | "Release from the right hand"
1594 | ],
1595 | "gt_time": [
1596 | [
1597 | 0,
1598 | 18
1599 | ],
1600 | [
1601 | 19,
1602 | 33
1603 | ],
1604 | [
1605 | 34,
1606 | 51
1607 | ],
1608 | [
1609 | 52,
1610 | 59
1611 | ]
1612 | ],
1613 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15574-15633.mp4"
1614 | },
1615 | {
1616 | "action": [
1617 | "Grasp with the right hand",
1618 | "Starting rotary motion with the right hand while it is restrained.",
1619 | "Release from the right hand"
1620 | ],
1621 | "gt_time": [
1622 | [
1623 | 0,
1624 | 4
1625 | ],
1626 | [
1627 | 5,
1628 | 9
1629 | ],
1630 | [
1631 | 10,
1632 | 17
1633 | ]
1634 | ],
1635 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15634-15651.mp4"
1636 | },
1637 | {
1638 | "action": [
1639 | "Grasp with the right hand",
1640 | "Tracing the surface of something with the right hand or an object held in the right hand in a linear manner.",
1641 | "Release from the right hand"
1642 | ],
1643 | "gt_time": [
1644 | [
1645 | 0,
1646 | 8
1647 | ],
1648 | [
1649 | 9,
1650 | 77
1651 | ],
1652 | [
1653 | 78,
1654 | 90
1655 | ]
1656 | ],
1657 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15652-15742.mp4"
1658 | },
1659 | {
1660 | "action": [
1661 | "Grasp with the right hand",
1662 | "Picking with the right hand",
1663 | "Tracing the surface of something with the right hand or an object held in the right hand in a linear manner.",
1664 | "Release from the right hand"
1665 | ],
1666 | "gt_time": [
1667 | [
1668 | 0,
1669 | 4
1670 | ],
1671 | [
1672 | 5,
1673 | 24
1674 | ],
1675 | [
1676 | 25,
1677 | 57
1678 | ],
1679 | [
1680 | 58,
1681 | 69
1682 | ]
1683 | ],
1684 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15743-15812.mp4"
1685 | },
1686 | {
1687 | "action": [
1688 | "Grasp with the right hand",
1689 | "Tracing the surface of something with the right hand or an object held in the right hand in a linear manner.",
1690 | "Release from the right hand"
1691 | ],
1692 | "gt_time": [
1693 | [
1694 | 0,
1695 | 6
1696 | ],
1697 | [
1698 | 7,
1699 | 38
1700 | ],
1701 | [
1702 | 39,
1703 | 40
1704 | ]
1705 | ],
1706 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15813-15853.mp4"
1707 | },
1708 | {
1709 | "action": [
1710 | "Grasp with the right hand",
1711 | "Rotary motion with the right hand until it cannot be rotated",
1712 | "Release from the right hand"
1713 | ],
1714 | "gt_time": [
1715 | [
1716 | 0,
1717 | 12
1718 | ],
1719 | [
1720 | 13,
1721 | 18
1722 | ],
1723 | [
1724 | 19,
1725 | 25
1726 | ]
1727 | ],
1728 | "video_path": "original_videos\\subject_3_o_gopro_seg_1_15854-15879.mp4"
1729 | },
1730 | {
1731 | "action": [
1732 | "Grasp with the right hand",
1733 | "Starting rotary motion with the right hand while it is restrained.",
1734 | "Release from the right hand"
1735 | ],
1736 | "gt_time": [
1737 | [
1738 | 0,
1739 | 2
1740 | ],
1741 | [
1742 | 3,
1743 | 7
1744 | ],
1745 | [
1746 | 8,
1747 | 15
1748 | ]
1749 | ],
1750 | "video_path": "original_videos\\subject_3_o_gopro_seg_2_14670-14685.mp4"
1751 | },
1752 | {
1753 | "action": [
1754 | "Grasp with the right hand",
1755 | "Tracing the surface of something with the right hand or an object held in the right hand in a linear manner.",
1756 | "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.",
1757 | "Putting on with the right hand",
1758 | "Release from the right hand"
1759 | ],
1760 | "gt_time": [
1761 | [
1762 | 0,
1763 | 4
1764 | ],
1765 | [
1766 | 5,
1767 | 19
1768 | ],
1769 | [
1770 | 20,
1771 | 164
1772 | ],
1773 | [
1774 | 165,
1775 | 175
1776 | ],
1777 | [
1778 | 176,
1779 | 186
1780 | ]
1781 | ],
1782 | "video_path": "original_videos\\subject_3_o_gopro_seg_2_15622-15808.mp4"
1783 | },
1784 | {
1785 | "action": [
1786 | "Grasp with the right hand",
1787 | "Starting rotary motion with the right hand while it is restrained.",
1788 | "Release from the right hand"
1789 | ],
1790 | "gt_time": [
1791 | [
1792 | 0,
1793 | 6
1794 | ],
1795 | [
1796 | 7,
1797 | 15
1798 | ],
1799 | [
1800 | 16,
1801 | 25
1802 | ]
1803 | ],
1804 | "video_path": "original_videos\\subject_3_o_gopro_seg_2_15809-15834.mp4"
1805 | },
1806 | {
1807 | "action": [
1808 | "Grasp with the right hand",
1809 | "Rotary motion with the right hand until it cannot be rotated",
1810 | "Release from the right hand"
1811 | ],
1812 | "gt_time": [
1813 | [
1814 | 0,
1815 | 6
1816 | ],
1817 | [
1818 | 7,
1819 | 15
1820 | ],
1821 | [
1822 | 16,
1823 | 21
1824 | ]
1825 | ],
1826 | "video_path": "original_videos\\subject_3_o_gopro_seg_2_15972-15993.mp4"
1827 | },
1828 | {
1829 | "action": [
1830 | "Grasp with the right hand",
1831 | "Bringing with the right hand",
1832 | "Linear motion with the right hand until it cannot be moved",
1833 | "Release from the right hand"
1834 | ],
1835 | "gt_time": [
1836 | [
1837 | 0,
1838 | 66
1839 | ],
1840 | [
1841 | 67,
1842 | 76
1843 | ],
1844 | [
1845 | 77,
1846 | 88
1847 | ],
1848 | [
1849 | 89,
1850 | 99
1851 | ]
1852 | ],
1853 | "video_path": "original_videos\\subject_3_o_gopro_seg_2_16025-16124.mp4"
1854 | },
1855 | {
1856 | "action": [
1857 | "Grasp with the right hand",
1858 | "Rotary motion with the right hand until it cannot be rotated",
1859 | "Release from the right hand"
1860 | ],
1861 | "gt_time": [
1862 | [
1863 | 0,
1864 | 5
1865 | ],
1866 | [
1867 | 6,
1868 | 16
1869 | ],
1870 | [
1871 | 17,
1872 | 26
1873 | ]
1874 | ],
1875 | "video_path": "original_videos\\subject_4_gopro_seg_1_7675-7701.mp4"
1876 | },
1877 | {
1878 | "action": [
1879 | "Grasp with the right hand",
1880 | "Starting rotary motion with the right hand while it is restrained.",
1881 | "Release from the right hand"
1882 | ],
1883 | "gt_time": [
1884 | [
1885 | 0,
1886 | 16
1887 | ],
1888 | [
1889 | 17,
1890 | 41
1891 | ],
1892 | [
1893 | 42,
1894 | 51
1895 | ]
1896 | ],
1897 | "video_path": "original_videos\\subject_4_gopro_seg_2_16670-16721.mp4"
1898 | },
1899 | {
1900 | "action": [
1901 | "Grasp with the right hand",
1902 | "Tracing the surface of something with the right hand or an object held in the right hand in a linear manner.",
1903 | "Striking something with the right hand or an object held in the right hand.",
1904 | "Bringing with the right hand",
1905 | "Linear motion with the right hand until it cannot be moved",
1906 | "Release from the right hand"
1907 | ],
1908 | "gt_time": [
1909 | [
1910 | 0,
1911 | 10
1912 | ],
1913 | [
1914 | 11,
1915 | 152
1916 | ],
1917 | [
1918 | 153,
1919 | 189
1920 | ],
1921 | [
1922 | 190,
1923 | 211
1924 | ],
1925 | [
1926 | 212,
1927 | 234
1928 | ],
1929 | [
1930 | 235,
1931 | 247
1932 | ]
1933 | ],
1934 | "video_path": "original_videos\\subject_4_gopro_seg_2_17147-17394.mp4"
1935 | },
1936 | {
1937 | "action": [
1938 | "Grasp with the right hand",
1939 | "Picking with the right hand",
1940 | "Release from the right hand"
1941 | ],
1942 | "gt_time": [
1943 | [
1944 | 0,
1945 | 16
1946 | ],
1947 | [
1948 | 17,
1949 | 48
1950 | ],
1951 | [
1952 | 49,
1953 | 60
1954 | ]
1955 | ],
1956 | "video_path": "original_videos\\subject_4_gopro_seg_2_17398-17458.mp4"
1957 | },
1958 | {
1959 | "action": [
1960 | "Grasp with the right hand",
1961 | "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.",
1962 | "Holding with the right hand",
1963 | "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.",
1964 | "Holding with the right hand",
1965 | "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.",
1966 | "Release from the right hand"
1967 | ],
1968 | "gt_time": [
1969 | [
1970 | 0,
1971 | 11
1972 | ],
1973 | [
1974 | 12,
1975 | 62
1976 | ],
1977 | [
1978 | 63,
1979 | 107
1980 | ],
1981 | [
1982 | 108,
1983 | 165
1984 | ],
1985 | [
1986 | 166,
1987 | 205
1988 | ],
1989 | [
1990 | 206,
1991 | 216
1992 | ],
1993 | [
1994 | 217,
1995 | 224
1996 | ]
1997 | ],
1998 | "video_path": "original_videos\\subject_4_gopro_seg_2_17460-17684.mp4"
1999 | },
2000 | {
2001 | "action": [
2002 | "Grasp with the right hand",
2003 | "Rotary motion with the right hand until it cannot be rotated",
2004 | "Release from the right hand"
2005 | ],
2006 | "gt_time": [
2007 | [
2008 | 0,
2009 | 8
2010 | ],
2011 | [
2012 | 9,
2013 | 19
2014 | ],
2015 | [
2016 | 20,
2017 | 22
2018 | ]
2019 | ],
2020 | "video_path": "original_videos\\subject_4_gopro_seg_2_17722-17744.mp4"
2021 | },
2022 | {
2023 | "action": [
2024 | "Grasp with the right hand",
2025 | "Striking something with the right hand or an object held in the right hand.",
2026 | "Bringing with the right hand",
2027 | "Moving an object held in the right hand in and out of a narrow space.",
2028 | "Release from the right hand"
2029 | ],
2030 | "gt_time": [
2031 | [
2032 | 0,
2033 | 9
2034 | ],
2035 | [
2036 | 10,
2037 | 26
2038 | ],
2039 | [
2040 | 27,
2041 | 111
2042 | ],
2043 | [
2044 | 112,
2045 | 126
2046 | ],
2047 | [
2048 | 127,
2049 | 136
2050 | ]
2051 | ],
2052 | "video_path": "original_videos\\subject_4_gopro_seg_2_17745-17881.mp4"
2053 | },
2054 | {
2055 | "action": [
2056 | "Grasp with the right hand",
2057 | "Starting rotary motion with the right hand while it is restrained.",
2058 | "Release from the right hand"
2059 | ],
2060 | "gt_time": [
2061 | [
2062 | 0,
2063 | 15
2064 | ],
2065 | [
2066 | 16,
2067 | 34
2068 | ],
2069 | [
2070 | 35,
2071 | 44
2072 | ]
2073 | ],
2074 | "video_path": "original_videos\\subject_4_gopro_seg_2_17955-17999.mp4"
2075 | },
2076 | {
2077 | "action": [
2078 | "Grasp with the right hand",
2079 | "Cracking an egg with the right hand",
2080 | "Bringing with the right hand",
2081 | "Release from the right hand"
2082 | ],
2083 | "gt_time": [
2084 | [
2085 | 0,
2086 | 16
2087 | ],
2088 | [
2089 | 17,
2090 | 148
2091 | ],
2092 | [
2093 | 149,
2094 | 196
2095 | ],
2096 | [
2097 | 197,
2098 | 206
2099 | ]
2100 | ],
2101 | "video_path": "original_videos\\subject_4_gopro_seg_2_7354-7560.mp4"
2102 | },
2103 | {
2104 | "action": [
2105 | "Grasp with the right hand",
2106 | "Picking with the right hand",
2107 | "Holding with the right hand",
2108 | "Bringing with the right hand",
2109 | "Striking something with the right hand or an object held in the right hand.",
2110 | "Cracking an egg with the right hand",
2111 | "Holding with the right hand",
2112 | "Striking something with the right hand or an object held in the right hand.",
2113 | "Bringing with the right hand",
2114 | "Putting on with the right hand",
2115 | "Release from the right hand"
2116 | ],
2117 | "gt_time": [
2118 | [
2119 | 0,
2120 | 14
2121 | ],
2122 | [
2123 | 15,
2124 | 27
2125 | ],
2126 | [
2127 | 28,
2128 | 112
2129 | ],
2130 | [
2131 | 113,
2132 | 160
2133 | ],
2134 | [
2135 | 161,
2136 | 206
2137 | ],
2138 | [
2139 | 207,
2140 | 278
2141 | ],
2142 | [
2143 | 279,
2144 | 562
2145 | ],
2146 | [
2147 | 563,
2148 | 633
2149 | ],
2150 | [
2151 | 634,
2152 | 645
2153 | ],
2154 | [
2155 | 646,
2156 | 671
2157 | ],
2158 | [
2159 | 672,
2160 | 681
2161 | ]
2162 | ],
2163 | "video_path": "original_videos\\subject_4_gopro_seg_2_8051-8732.mp4"
2164 | },
2165 | {
2166 | "action": [
2167 | "Grasp with the right hand",
2168 | "Cracking an egg with the right hand",
2169 | "Bringing with the right hand",
2170 | "Release from the right hand"
2171 | ],
2172 | "gt_time": [
2173 | [
2174 | 0,
2175 | 10
2176 | ],
2177 | [
2178 | 11,
2179 | 110
2180 | ],
2181 | [
2182 | 111,
2183 | 134
2184 | ],
2185 | [
2186 | 135,
2187 | 145
2188 | ]
2189 | ],
2190 | "video_path": "original_videos\\subject_4_gopro_seg_2_8736-8881.mp4"
2191 | },
2192 | {
2193 | "action": [
2194 | "Grasp with the right hand",
2195 | "Moving an object held in the right hand in and out of a narrow space.",
2196 | "Bringing with the right hand",
2197 | "Putting on with the right hand",
2198 | "Release from the right hand"
2199 | ],
2200 | "gt_time": [
2201 | [
2202 | 0,
2203 | 109
2204 | ],
2205 | [
2206 | 110,
2207 | 130
2208 | ],
2209 | [
2210 | 131,
2211 | 184
2212 | ],
2213 | [
2214 | 185,
2215 | 198
2216 | ],
2217 | [
2218 | 199,
2219 | 209
2220 | ]
2221 | ],
2222 | "video_path": "original_videos\\subject_5_gopro_seg_1_7243-7452.mp4"
2223 | },
2224 | {
2225 | "action": [
2226 | "Grasp with the right hand",
2227 | "Picking with the right hand",
2228 | "Pouring with the right hand",
2229 | "Putting on with the right hand",
2230 | "Release from the right hand"
2231 | ],
2232 | "gt_time": [
2233 | [
2234 | 0,
2235 | 33
2236 | ],
2237 | [
2238 | 34,
2239 | 75
2240 | ],
2241 | [
2242 | 76,
2243 | 220
2244 | ],
2245 | [
2246 | 221,
2247 | 279
2248 | ],
2249 | [
2250 | 280,
2251 | 305
2252 | ]
2253 | ],
2254 | "video_path": "original_videos\\subject_5_gopro_seg_2_14524-14829.mp4"
2255 | },
2256 | {
2257 | "action": [
2258 | "Grasp with the right hand",
2259 | "Bringing with the right hand",
2260 | "Putting on with the right hand",
2261 | "Release from the right hand"
2262 | ],
2263 | "gt_time": [
2264 | [
2265 | 0,
2266 | 32
2267 | ],
2268 | [
2269 | 33,
2270 | 328
2271 | ],
2272 | [
2273 | 329,
2274 | 351
2275 | ],
2276 | [
2277 | 352,
2278 | 362
2279 | ]
2280 | ],
2281 | "video_path": "original_videos\\subject_5_gopro_seg_2_14832-15194.mp4"
2282 | },
2283 | {
2284 | "action": [
2285 | "Grasp with the right hand",
2286 | "Bringing with the right hand",
2287 | "Release from the right hand"
2288 | ],
2289 | "gt_time": [
2290 | [
2291 | 0,
2292 | 13
2293 | ],
2294 | [
2295 | 14,
2296 | 26
2297 | ],
2298 | [
2299 | 27,
2300 | 41
2301 | ]
2302 | ],
2303 | "video_path": "original_videos\\subject_5_gopro_seg_2_2333-2374.mp4"
2304 | },
2305 | {
2306 | "action": [
2307 | "Grasp with the right hand",
2308 | "Striking something with the right hand or an object held in the right hand.",
2309 | "Putting on with the right hand",
2310 | "Release from the right hand"
2311 | ],
2312 | "gt_time": [
2313 | [
2314 | 0,
2315 | 10
2316 | ],
2317 | [
2318 | 11,
2319 | 54
2320 | ],
2321 | [
2322 | 55,
2323 | 66
2324 | ],
2325 | [
2326 | 67,
2327 | 80
2328 | ]
2329 | ],
2330 | "video_path": "original_videos\\subject_5_gopro_seg_2_2634-2714.mp4"
2331 | },
2332 | {
2333 | "action": [
2334 | "Grasp with the right hand",
2335 | "Picking with the right hand",
2336 | "Putting on with the right hand",
2337 | "Release from the right hand"
2338 | ],
2339 | "gt_time": [
2340 | [
2341 | 0,
2342 | 16
2343 | ],
2344 | [
2345 | 17,
2346 | 36
2347 | ],
2348 | [
2349 | 37,
2350 | 55
2351 | ],
2352 | [
2353 | 56,
2354 | 64
2355 | ]
2356 | ],
2357 | "video_path": "original_videos\\subject_5_gopro_seg_2_2744-2808.mp4"
2358 | },
2359 | {
2360 | "action": [
2361 | "Grasp with the right hand",
2362 | "Starting rotary motion with the right hand while it is restrained.",
2363 | "Release from the right hand"
2364 | ],
2365 | "gt_time": [
2366 | [
2367 | 0,
2368 | 8
2369 | ],
2370 | [
2371 | 9,
2372 | 51
2373 | ],
2374 | [
2375 | 52,
2376 | 63
2377 | ]
2378 | ],
2379 | "video_path": "original_videos\\subject_5_gopro_seg_2_2809-2872.mp4"
2380 | },
2381 | {
2382 | "action": [
2383 | "Grasp with the right hand",
2384 | "Holding with the right hand",
2385 | "Release from the right hand"
2386 | ],
2387 | "gt_time": [
2388 | [
2389 | 0,
2390 | 8
2391 | ],
2392 | [
2393 | 9,
2394 | 61
2395 | ],
2396 | [
2397 | 62,
2398 | 72
2399 | ]
2400 | ],
2401 | "video_path": "original_videos\\subject_5_gopro_seg_2_2873-2945.mp4"
2402 | },
2403 | {
2404 | "action": [
2405 | "Grasp with the right hand",
2406 | "Striking something with the right hand or an object held in the right hand.",
2407 | "Release from the right hand"
2408 | ],
2409 | "gt_time": [
2410 | [
2411 | 0,
2412 | 10
2413 | ],
2414 | [
2415 | 11,
2416 | 29
2417 | ],
2418 | [
2419 | 30,
2420 | 42
2421 | ]
2422 | ],
2423 | "video_path": "original_videos\\subject_5_gopro_seg_2_3091-3133.mp4"
2424 | },
2425 | {
2426 | "action": [
2427 | "Grasp with the right hand",
2428 | "Picking with the right hand",
2429 | "Bringing with the right hand",
2430 | "Putting on with the right hand",
2431 | "Release from the right hand"
2432 | ],
2433 | "gt_time": [
2434 | [
2435 | 0,
2436 | 20
2437 | ],
2438 | [
2439 | 21,
2440 | 41
2441 | ],
2442 | [
2443 | 42,
2444 | 152
2445 | ],
2446 | [
2447 | 153,
2448 | 163
2449 | ],
2450 | [
2451 | 164,
2452 | 175
2453 | ]
2454 | ],
2455 | "video_path": "original_videos\\subject_5_gopro_seg_2_3400-3575.mp4"
2456 | },
2457 | {
2458 | "action": [
2459 | "Grasp with the right hand",
2460 | "Picking with the right hand",
2461 | "Bringing with the right hand",
2462 | "Putting on with the right hand",
2463 | "Release from the right hand"
2464 | ],
2465 | "gt_time": [
2466 | [
2467 | 0,
2468 | 24
2469 | ],
2470 | [
2471 | 25,
2472 | 52
2473 | ],
2474 | [
2475 | 53,
2476 | 70
2477 | ],
2478 | [
2479 | 71,
2480 | 85
2481 | ],
2482 | [
2483 | 86,
2484 | 106
2485 | ]
2486 | ],
2487 | "video_path": "original_videos\\subject_6_gopro_seg_1_108-214.mp4"
2488 | },
2489 | {
2490 | "action": [
2491 | "Grasp with the right hand",
2492 | "Picking with the right hand",
2493 | "Bringing with the right hand",
2494 | "Release from the right hand"
2495 | ],
2496 | "gt_time": [
2497 | [
2498 | 0,
2499 | 10
2500 | ],
2501 | [
2502 | 11,
2503 | 28
2504 | ],
2505 | [
2506 | 29,
2507 | 56
2508 | ],
2509 | [
2510 | 57,
2511 | 67
2512 | ]
2513 | ],
2514 | "video_path": "original_videos\\subject_6_gopro_seg_1_216-283.mp4"
2515 | },
2516 | {
2517 | "action": [
2518 | "Grasp with the right hand",
2519 | "Widening a bag with the right hand",
2520 | "Release from the right hand"
2521 | ],
2522 | "gt_time": [
2523 | [
2524 | 0,
2525 | 6
2526 | ],
2527 | [
2528 | 7,
2529 | 28
2530 | ],
2531 | [
2532 | 29,
2533 | 34
2534 | ]
2535 | ],
2536 | "video_path": "original_videos\\subject_6_gopro_seg_1_818-852.mp4"
2537 | },
2538 | {
2539 | "action": [
2540 | "Grasp with the right hand",
2541 | "Picking with the right hand",
2542 | "Bringing with the right hand",
2543 | "Putting on with the right hand",
2544 | "Release from the right hand"
2545 | ],
2546 | "gt_time": [
2547 | [
2548 | 0,
2549 | 35
2550 | ],
2551 | [
2552 | 36,
2553 | 75
2554 | ],
2555 | [
2556 | 76,
2557 | 115
2558 | ],
2559 | [
2560 | 116,
2561 | 148
2562 | ],
2563 | [
2564 | 149,
2565 | 163
2566 | ]
2567 | ],
2568 | "video_path": "original_videos\\subject_6_gopro_seg_1_1230-1393.mp4"
2569 | },
2570 | {
2571 | "action": [
2572 | "Grasp with the right hand",
2573 | "Holding with the right hand",
2574 | "Release from the right hand"
2575 | ],
2576 | "gt_time": [
2577 | [
2578 | 0,
2579 | 22
2580 | ],
2581 | [
2582 | 23,
2583 | 89
2584 | ],
2585 | [
2586 | 90,
2587 | 101
2588 | ]
2589 | ],
2590 | "video_path": "original_videos\\subject_6_gopro_seg_1_11201-11302.mp4"
2591 | },
2592 | {
2593 | "action": [
2594 | "Grasp with the right hand",
2595 | "Putting on with the right hand",
2596 | "Release from the right hand"
2597 | ],
2598 | "gt_time": [
2599 | [
2600 | 0,
2601 | 18
2602 | ],
2603 | [
2604 | 19,
2605 | 63
2606 | ],
2607 | [
2608 | 64,
2609 | 71
2610 | ]
2611 | ],
2612 | "video_path": "original_videos\\subject_6_gopro_seg_1_11303-11374.mp4"
2613 | },
2614 | {
2615 | "action": [
2616 | "Grasp with the right hand",
2617 | "Starting rotary motion with the right hand while it is restrained.",
2618 | "Release from the right hand"
2619 | ],
2620 | "gt_time": [
2621 | [
2622 | 0,
2623 | 20
2624 | ],
2625 | [
2626 | 21,
2627 | 41
2628 | ],
2629 | [
2630 | 42,
2631 | 53
2632 | ]
2633 | ],
2634 | "video_path": "original_videos\\subject_6_gopro_seg_1_11853-11906.mp4"
2635 | },
2636 | {
2637 | "action": [
2638 | "Grasp with the right hand",
2639 | "Rotary motion with the right hand until it cannot be rotated",
2640 | "Release from the right hand"
2641 | ],
2642 | "gt_time": [
2643 | [
2644 | 0,
2645 | 14
2646 | ],
2647 | [
2648 | 15,
2649 | 21
2650 | ],
2651 | [
2652 | 22,
2653 | 32
2654 | ]
2655 | ],
2656 | "video_path": "original_videos\\subject_6_gopro_seg_1_12107-12139.mp4"
2657 | },
2658 | {
2659 | "action": [
2660 | "Grasp with the right hand",
2661 | "Starting rotary motion with the right hand while it is restrained.",
2662 | "Release from the right hand"
2663 | ],
2664 | "gt_time": [
2665 | [
2666 | 0,
2667 | 8
2668 | ],
2669 | [
2670 | 9,
2671 | 46
2672 | ],
2673 | [
2674 | 47,
2675 | 56
2676 | ]
2677 | ],
2678 | "video_path": "original_videos\\subject_6_gopro_seg_1_12471-12527.mp4"
2679 | },
2680 | {
2681 | "action": [
2682 | "Grasp with the right hand",
2683 | "Widening a bag with the right hand",
2684 | "Release from the right hand"
2685 | ],
2686 | "gt_time": [
2687 | [
2688 | 0,
2689 | 10
2690 | ],
2691 | [
2692 | 11,
2693 | 29
2694 | ],
2695 | [
2696 | 30,
2697 | 40
2698 | ]
2699 | ],
2700 | "video_path": "original_videos\\subject_6_gopro_seg_2_3665-3705.mp4"
2701 | },
2702 | {
2703 | "action": [
2704 | "Grasp with the right hand",
2705 | "Starting rotary motion with the right hand while it is restrained.",
2706 | "Release from the right hand"
2707 | ],
2708 | "gt_time": [
2709 | [
2710 | 0,
2711 | 99
2712 | ],
2713 | [
2714 | 100,
2715 | 141
2716 | ],
2717 | [
2718 | 142,
2719 | 150
2720 | ]
2721 | ],
2722 | "video_path": "original_videos\\subject_6_gopro_seg_2_3793-3943.mp4"
2723 | },
2724 | {
2725 | "action": [
2726 | "Grasp with the right hand",
2727 | "Picking with the right hand",
2728 | "Putting on with the right hand",
2729 | "Release from the right hand"
2730 | ],
2731 | "gt_time": [
2732 | [
2733 | 0,
2734 | 59
2735 | ],
2736 | [
2737 | 60,
2738 | 71
2739 | ],
2740 | [
2741 | 72,
2742 | 94
2743 | ],
2744 | [
2745 | 95,
2746 | 104
2747 | ]
2748 | ],
2749 | "video_path": "original_videos\\subject_6_gopro_seg_2_4258-4362.mp4"
2750 | },
2751 | {
2752 | "action": [
2753 | "Grasp with the right hand",
2754 | "Picking with the right hand",
2755 | "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.",
2756 | "Putting on with the right hand",
2757 | "Release from the right hand"
2758 | ],
2759 | "gt_time": [
2760 | [
2761 | 0,
2762 | 10
2763 | ],
2764 | [
2765 | 11,
2766 | 33
2767 | ],
2768 | [
2769 | 34,
2770 | 414
2771 | ],
2772 | [
2773 | 415,
2774 | 451
2775 | ],
2776 | [
2777 | 452,
2778 | 462
2779 | ]
2780 | ],
2781 | "video_path": "original_videos\\subject_6_gopro_seg_2_4367-4829.mp4"
2782 | },
2783 | {
2784 | "action": [
2785 | "Grasp with the right hand",
2786 | "Picking with the right hand",
2787 | "Holding with the right hand",
2788 | "Pouring with the right hand",
2789 | "Holding with the right hand",
2790 | "Pouring with the right hand",
2791 | "Holding with the right hand",
2792 | "Putting on with the right hand",
2793 | "Release from the right hand"
2794 | ],
2795 | "gt_time": [
2796 | [
2797 | 0,
2798 | 19
2799 | ],
2800 | [
2801 | 20,
2802 | 55
2803 | ],
2804 | [
2805 | 56,
2806 | 81
2807 | ],
2808 | [
2809 | 82,
2810 | 130
2811 | ],
2812 | [
2813 | 131,
2814 | 140
2815 | ],
2816 | [
2817 | 141,
2818 | 164
2819 | ],
2820 | [
2821 | 165,
2822 | 206
2823 | ],
2824 | [
2825 | 207,
2826 | 234
2827 | ],
2828 | [
2829 | 235,
2830 | 246
2831 | ]
2832 | ],
2833 | "video_path": "original_videos\\subject_6_gopro_seg_2_4834-5080.mp4"
2834 | },
2835 | {
2836 | "action": [
2837 | "Grasp with the right hand",
2838 | "Picking with the right hand",
2839 | "Bringing with the right hand",
2840 | "Putting on with the right hand",
2841 | "Release from the right hand"
2842 | ],
2843 | "gt_time": [
2844 | [
2845 | 0,
2846 | 24
2847 | ],
2848 | [
2849 | 25,
2850 | 51
2851 | ],
2852 | [
2853 | 52,
2854 | 248
2855 | ],
2856 | [
2857 | 249,
2858 | 266
2859 | ],
2860 | [
2861 | 267,
2862 | 279
2863 | ]
2864 | ],
2865 | "video_path": "original_videos\\subject_7_gopro_seg_1_16498-16777.mp4"
2866 | },
2867 | {
2868 | "action": [
2869 | "Grasp with the right hand",
2870 | "Rotary motion with the right hand until it cannot be rotated",
2871 | "Release from the right hand"
2872 | ],
2873 | "gt_time": [
2874 | [
2875 | 0,
2876 | 62
2877 | ],
2878 | [
2879 | 63,
2880 | 284
2881 | ],
2882 | [
2883 | 285,
2884 | 287
2885 | ]
2886 | ],
2887 | "video_path": "original_videos\\subject_7_gopro_seg_1_17070-17357.mp4"
2888 | },
2889 | {
2890 | "action": [
2891 | "Grasp with the right hand",
2892 | "Starting rotary motion with the right hand while it is restrained.",
2893 | "Release from the right hand"
2894 | ],
2895 | "gt_time": [
2896 | [
2897 | 0,
2898 | 10
2899 | ],
2900 | [
2901 | 11,
2902 | 36
2903 | ],
2904 | [
2905 | 37,
2906 | 49
2907 | ]
2908 | ],
2909 | "video_path": "original_videos\\subject_7_gopro_seg_1_17625-17674.mp4"
2910 | },
2911 | {
2912 | "action": [
2913 | "Grasp with the right hand",
2914 | "Picking with the right hand",
2915 | "Release from the right hand"
2916 | ],
2917 | "gt_time": [
2918 | [
2919 | 0,
2920 | 10
2921 | ],
2922 | [
2923 | 11,
2924 | 24
2925 | ],
2926 | [
2927 | 25,
2928 | 36
2929 | ]
2930 | ],
2931 | "video_path": "original_videos\\subject_7_gopro_seg_2_12265-12301.mp4"
2932 | },
2933 | {
2934 | "action": [
2935 | "Grasp with the right hand",
2936 | "Cracking an egg with the right hand",
2937 | "Pouring with the right hand",
2938 | "Striking something with the right hand or an object held in the right hand.",
2939 | "Putting on with the right hand",
2940 | "Release from the right hand"
2941 | ],
2942 | "gt_time": [
2943 | [
2944 | 0,
2945 | 10
2946 | ],
2947 | [
2948 | 11,
2949 | 50
2950 | ],
2951 | [
2952 | 51,
2953 | 153
2954 | ],
2955 | [
2956 | 154,
2957 | 204
2958 | ],
2959 | [
2960 | 205,
2961 | 252
2962 | ],
2963 | [
2964 | 253,
2965 | 268
2966 | ]
2967 | ],
2968 | "video_path": "original_videos\\subject_8_gopro_seg_1_3685-3953.mp4"
2969 | },
2970 | {
2971 | "action": [
2972 | "Grasp with the right hand",
2973 | "Picking with the right hand",
2974 | "Putting on with the right hand",
2975 | "Release from the right hand"
2976 | ],
2977 | "gt_time": [
2978 | [
2979 | 0,
2980 | 17
2981 | ],
2982 | [
2983 | 18,
2984 | 41
2985 | ],
2986 | [
2987 | 42,
2988 | 60
2989 | ],
2990 | [
2991 | 61,
2992 | 69
2993 | ]
2994 | ],
2995 | "video_path": "original_videos\\subject_8_gopro_seg_1_5019-5088.mp4"
2996 | },
2997 | {
2998 | "action": [
2999 | "Grasp with the right hand",
3000 | "Picking with the right hand",
3001 | "Putting on with the right hand",
3002 | "Release from the right hand"
3003 | ],
3004 | "gt_time": [
3005 | [
3006 | 0,
3007 | 0
3008 | ],
3009 | [
3010 | 1,
3011 | 29
3012 | ],
3013 | [
3014 | 30,
3015 | 84
3016 | ],
3017 | [
3018 | 85,
3019 | 95
3020 | ]
3021 | ],
3022 | "video_path": "original_videos\\subject_8_gopro_seg_2_9473-9568.mp4"
3023 | },
3024 | {
3025 | "action": [
3026 | "Grasp with the right hand",
3027 | "Picking with the right hand",
3028 | "Putting on with the right hand",
3029 | "Release from the right hand"
3030 | ],
3031 | "gt_time": [
3032 | [
3033 | 0,
3034 | 22
3035 | ],
3036 | [
3037 | 23,
3038 | 45
3039 | ],
3040 | [
3041 | 46,
3042 | 149
3043 | ],
3044 | [
3045 | 150,
3046 | 159
3047 | ]
3048 | ],
3049 | "video_path": "original_videos\\subject_8_gopro_seg_2_9577-9736.mp4"
3050 | },
3051 | {
3052 | "action": [
3053 | "Grasp with the right hand",
3054 | "Picking with the right hand",
3055 | "Release from the right hand"
3056 | ],
3057 | "gt_time": [
3058 | [
3059 | 0,
3060 | 16
3061 | ],
3062 | [
3063 | 17,
3064 | 29
3065 | ],
3066 | [
3067 | 30,
3068 | 40
3069 | ]
3070 | ],
3071 | "video_path": "original_videos\\subject_8_gopro_seg_2_10056-10096.mp4"
3072 | },
3073 | {
3074 | "action": [
3075 | "Grasp with the right hand",
3076 | "Picking with the right hand",
3077 | "Release from the right hand"
3078 | ],
3079 | "gt_time": [
3080 | [
3081 | 0,
3082 | 29
3083 | ],
3084 | [
3085 | 30,
3086 | 43
3087 | ],
3088 | [
3089 | 44,
3090 | 54
3091 | ]
3092 | ],
3093 | "video_path": "original_videos\\subject_8_gopro_seg_2_10097-10151.mp4"
3094 | },
3095 | {
3096 | "action": [
3097 | "Grasp with the right hand",
3098 | "Tracing the surface of something with the right hand or an object held in the right hand in a planar manner.",
3099 | "Picking with the right hand",
3100 | "Release from the right hand"
3101 | ],
3102 | "gt_time": [
3103 | [
3104 | 0,
3105 | 10
3106 | ],
3107 | [
3108 | 11,
3109 | 397
3110 | ],
3111 | [
3112 | 398,
3113 | 431
3114 | ],
3115 | [
3116 | 432,
3117 | 458
3118 | ]
3119 | ],
3120 | "video_path": "original_videos\\subject_8_gopro_seg_2_10248-10706.mp4"
3121 | },
3122 | {
3123 | "action": [
3124 | "Grasp with the right hand",
3125 | "Release from the right hand"
3126 | ],
3127 | "gt_time": [
3128 | [
3129 | 0,
3130 | 26
3131 | ],
3132 | [
3133 | 27,
3134 | 36
3135 | ]
3136 | ],
3137 | "video_path": "original_videos\\subject_9_gopro_seg_1_1884-1920.mp4"
3138 | },
3139 | {
3140 | "action": [
3141 | "Grasp with the right hand",
3142 | "Picking with the right hand",
3143 | "Bringing with the right hand",
3144 | "Pouring with the right hand",
3145 | "Putting on with the right hand",
3146 | "Rotary motion with the right hand until it cannot be rotated",
3147 | "Release from the right hand"
3148 | ],
3149 | "gt_time": [
3150 | [
3151 | 0,
3152 | 15
3153 | ],
3154 | [
3155 | 16,
3156 | 28
3157 | ],
3158 | [
3159 | 29,
3160 | 50
3161 | ],
3162 | [
3163 | 51,
3164 | 185
3165 | ],
3166 | [
3167 | 186,
3168 | 209
3169 | ],
3170 | [
3171 | 210,
3172 | 233
3173 | ],
3174 | [
3175 | 234,
3176 | 251
3177 | ]
3178 | ],
3179 | "video_path": "original_videos\\subject_9_gopro_seg_1_2324-2575.mp4"
3180 | },
3181 | {
3182 | "action": [
3183 | "Grasp with the right hand",
3184 | "Release from the right hand"
3185 | ],
3186 | "gt_time": [
3187 | [
3188 | 0,
3189 | 12
3190 | ],
3191 | [
3192 | 13,
3193 | 28
3194 | ]
3195 | ],
3196 | "video_path": "original_videos\\subject_9_gopro_seg_1_3469-3497.mp4"
3197 | }
3198 | ]
--------------------------------------------------------------------------------
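For reference, each record in the file above pairs a list of segment-level action labels (`action`) with one `[start, end]` range per action (`gt_time`) and a Windows-style relative `video_path`. Below is a minimal sketch of how such a file might be loaded and iterated; the filename `label_data_gt_right.json` and the inclusive frame-index reading of `gt_time` are assumptions inferred from the entries themselves (e.g. `..._15574-15633.mp4` spans 59 frames and its final segment ends at 59), not something the repository documents here.

```python
import json
from pathlib import Path

# Sketch: load the right-hand ground-truth annotations and print
# per-segment (action, start, end) triples for each clip.
# Assumption: gt_time values are inclusive frame ranges within each
# clip, matching the frame spans encoded in the video filenames.
with open("label_data_gt_right.json", "r") as f:
    annotations = json.load(f)

for entry in annotations:
    # Normalise the Windows-style path stored in the JSON.
    video = Path(entry["video_path"].replace("\\", "/"))
    # Each action label lines up positionally with one [start, end] pair.
    for action, (start, end) in zip(entry["action"], entry["gt_time"]):
        print(f"{video.name}: frames {start}-{end}: {action}")
```

Segments produced this way can then be compared against predicted segments, for instance with the MoF/IoU/F1 metrics computed by `compute_mof_iou_f1.py` in the same directory.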