├── .DS_Store
├── images
│   ├── .DS_Store
│   ├── statistics.png
│   ├── dataset_multimodal.jpg
│   └── dataset_temporal.jpg
├── split_clips_to_frames.py
├── extend_driving_decision_annotation.py
├── README.md
└── extend_intent_annotation.py

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PSI-Intention2022/PSI-Dataset/HEAD/.DS_Store
--------------------------------------------------------------------------------

/images/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PSI-Intention2022/PSI-Dataset/HEAD/images/.DS_Store
--------------------------------------------------------------------------------

/images/statistics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PSI-Intention2022/PSI-Dataset/HEAD/images/statistics.png
--------------------------------------------------------------------------------

/images/dataset_multimodal.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PSI-Intention2022/PSI-Dataset/HEAD/images/dataset_multimodal.jpg
--------------------------------------------------------------------------------

/images/dataset_temporal.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PSI-Intention2022/PSI-Dataset/HEAD/images/dataset_temporal.jpg
--------------------------------------------------------------------------------

/split_clips_to_frames.py:
--------------------------------------------------------------------------------
1 | '''Given video path, extract frames for all videos. Check if frames exist first.'''
2 | 
3 | import os
4 | import argparse
5 | from pathlib import Path
6 | import cv2
7 | from tqdm import tqdm
8 | import sys
9 | 
10 | if __name__ == '__main__':
11 |     root_path = sys.argv[1]
12 |     video_path = os.path.join(root_path, 'PSI_Videos/videos')
13 |     frames_path = os.path.join(root_path, 'frames')
14 | 
15 |     #create 'data/frames' folder
16 |     if not os.path.exists(frames_path):
17 |         os.makedirs(frames_path)
18 |         print("Created 'frames' folder.")
19 | 
20 |     for video in tqdm(sorted(os.listdir(video_path))):
21 |         name = video.split('.mp4')[0]
22 |         video_target = os.path.join(video_path, video)
23 |         frames_target = os.path.join(frames_path, name)
24 | 
25 |         if not os.path.exists(frames_target):
26 |             os.makedirs(frames_target)
27 |             print(f'Created frames folder for video {name}')
28 | 
29 |         try:
30 |             vidcap = cv2.VideoCapture(video_target)
31 |             if not vidcap.isOpened():
32 |                 raise Exception(f'Cannot open file {video}')
33 |         except Exception as e:
34 |             raise e
35 | 
36 |         success, frame = vidcap.read()
37 |         cur_frame = 0
38 |         while(success):
39 |             frame_num = str(cur_frame).zfill(3)
40 |             cv2.imwrite(os.path.join(frames_target, f'{frame_num}.jpg'), frame)
41 |             success, frame = vidcap.read()
42 |             cur_frame += 1
43 |         vidcap.release()
44 |         # break
45 | 
46 | 
--------------------------------------------------------------------------------

/extend_driving_decision_annotation.py:
--------------------------------------------------------------------------------
1 | import os 2 | import json 3 | import sys 4 | import copy 5 | 6 | 7 | if __name__ == '__main__': 8 | print("Extend Driving Decision Annotations of PSI 2.0 Dataset.") 9 | 10 | root_path = sys.argv[1] 11 | 12 | key_frame_anotation_path = os.path.join(root_path, 
'PSI2.0_TrainVal/annotations/cognitive_annotation_key_frame') 13 | extended_annotation_path = os.path.join(root_path, 'PSI2.0_TrainVal/annotations/cognitive_annotation_extended') 14 | cv_annotation_path = os.path.join(root_path, 'PSI2.0_TrainVal/annotations/cv_annotation') 15 | 16 | if not os.path.exists(extended_annotation_path): 17 | os.makedirs(extended_annotation_path) 18 | 19 | video_list = sorted(os.listdir(key_frame_anotation_path)) 20 | 21 | for vname in video_list: 22 | # 1. load key-frame annotations 23 | key_dd_ann_file = os.path.join(key_frame_anotation_path, vname, 'driving_decision.json') 24 | with open(key_dd_ann_file, 'r') as f: 25 | key_dd_ann = json.load(f) 26 | 27 | # 2. extend annotations (driving decision) - decision to the future frames, description to the prior frames 28 | extended_dd_ann = copy.deepcopy(key_dd_ann) 29 | 30 | annotator_list = key_dd_ann['frames'][list(key_dd_ann['frames'].keys())[0]]['cognitive_annotation'].keys() 31 | frame_list = sorted(key_dd_ann['frames'].keys()) 32 | for annotator_k in annotator_list: 33 | pivot_speed = "maintainSpeed" # at the beginning no labels, use "goStraight" == "maintainSpeed" == 0 34 | pivot_direction = "goStraight" 35 | for i in range(len(frame_list)): 36 | frame_id = frame_list[i] 37 | if extended_dd_ann['frames'][frame_id]['cognitive_annotation'][annotator_k]['driving_decision_speed'] == "": 38 | extended_dd_ann['frames'][frame_id]['cognitive_annotation'][annotator_k]['driving_decision_speed'] = pivot_speed 39 | else: 40 | pivot_speed = extended_dd_ann['frames'][frame_id]['cognitive_annotation'][annotator_k]['driving_decision_speed'] 41 | if extended_dd_ann['frames'][frame_id]['cognitive_annotation'][annotator_k]['driving_decision_direction'] == "": 42 | extended_dd_ann['frames'][frame_id]['cognitive_annotation'][annotator_k]['driving_decision_direction'] = pivot_direction 43 | else: 44 | pivot_direction = extended_dd_ann['frames'][frame_id]['cognitive_annotation'][annotator_k]['driving_decision_direction'] 45 | 46 | last_frame_id = frame_list[-1] 47 | pivot_des = extended_dd_ann['frames'][last_frame_id]['cognitive_annotation'][annotator_k]['explanation'] 48 | for i in range(len(frame_list)-1, -1, -1): 49 | frame_id = frame_list[i] 50 | if extended_dd_ann['frames'][frame_id]['cognitive_annotation'][annotator_k]['explanation'] == '': 51 | extended_dd_ann['frames'][frame_id]['cognitive_annotation'][annotator_k]['explanation'] = pivot_des 52 | else: 53 | pivot_des = extended_dd_ann['frames'][frame_id]['cognitive_annotation'][annotator_k]['explanation'] 54 | # Note: after this operation, some frames at the end of the observed frame list do not have descriptions, ['description']== "" 55 | 56 | # 3. 
output extended annotations 57 | output_dir = os.path.join(extended_annotation_path, vname) 58 | if not os.path.exists(output_dir): 59 | os.makedirs(output_dir) 60 | 61 | # write json to file 62 | extended_dd_ann_file = os.path.join(extended_annotation_path, vname, 'driving_decision.json') 63 | with open(extended_dd_ann_file, 'w') as file: 64 | json_string = json.dumps(extended_dd_ann, default=lambda o: o.__dict__, sort_keys=False, indent=4) 65 | file.write(json_string) 66 | --------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
1 | # Pedestrian Situated Intent (PSI) Benchmark
2 | 
3 | This repository contains the scripts and instructions for preparing the **Pedestrian Situated Intent (PSI) 1.0 & 2.0** datasets.
4 | 
5 | ![image](./images/dataset_multimodal.jpg)
6 | ![image](./images/dataset_temporal.jpg)
7 | 
8 | - **PSI 1.0**: video_0001 ~ video_0110
9 | - **PSI 2.0**: video_0001 ~ video_0204
10 | 
11 | **NOTE: You only need the PSI 2.0 dataset for the [[IEEE ITSS PSI Competition](https://psi-intention2022.github.io)]**. The PSI 1.0 dataset is also provided, and you are welcome to use it if you would like to explore knowledge beyond PSI 2.0.
12 | 
13 | # 1. PSI 2.0 Dataset
14 | 
15 | ## Part 1. Prepare the dataset
16 | ***Step 1.*** Download the PSI 2.0 dataset videos from ~~[[Google Drive]()]~~ [[PSI Homepage](http://pedestriandataset.situated-intent.net)]. Move the *\*.zip* files to the dataset *ROOT_PATH* and unzip them by
17 | 
18 | ```shell
19 | cd ROOT_PATH # e.g., root/Dataset
20 | unzip '*.zip' -d .
21 | rm *.zip
22 | ```
23 | The extracted folder contains all videos (Train/Val):
24 | - *ROOT_PATH/PSI_Videos/videos*.
25 | 
26 | ***Step 2.*** Download the PSI 2.0 data annotations from ~~[[Google Drive]()]~~ [[PSI Homepage](http://pedestriandataset.situated-intent.net)]. Move the downloaded *\*.zip* files to the dataset *ROOT_PATH* and unzip them by
27 | 
28 | ```shell
29 | unzip '*.zip' -d .
30 | rm *.zip
31 | ```
32 | 
33 | The extracted folder contains all annotations of the PSI 2.0 dataset (Train/Val):
34 | - *ROOT_PATH/PSI2.0_TrainVal/annotations/cognitive_annotation_key_frame*
35 | - *ROOT_PATH/PSI2.0_TrainVal/annotations/cv_annotation*
36 | 
37 | and the train/val/test splits:
38 | - *ROOT_PATH/PSI2.0_TrainVal/splits/PSI2_split.json*.
39 | 
40 | 
41 | ***Step 3.*** Split the videos into frames by
42 | 
43 | ```shell
44 | python split_clips_to_frames.py *ROOT_PATH*
45 | ```
46 | and the output frames are saved to:
47 | - *ROOT_PATH/frames*.
48 | 
49 | 
50 | ## Part 2. Extend Key-frame Cognitive Annotations
51 | 
52 | **TASK 1 - Pedestrian Intent**: The frame at which an annotator explicitly makes a crossing-intent annotation is treated as a "key-frame". Each annotator gives one "intent" annotation estimating the crossing intent of the target pedestrian, together with one "reasoning/explanation" of that estimation. For these two annotations (a short sketch at the end of this task illustrates the rule):
53 | 
54 | - Crossing Intent: We extend the crossing intent annotation to the frames following the current key-frame, until the next frame at which one of the annotators makes another estimation (regardless of whether the two annotations agree).
55 | - Reasoning/Explanation: We extend the reasoning/description of the intent estimation to the frames prior to the current key-frame, back to the last key-frame at which one of the annotators made another estimation, assuming the description refers to the scenes observed by the annotators in support of the intent estimation.
56 | 
57 | *Already-crossed*: When a pedestrian has already crossed in front of the moving vehicle, we categorize the status after the target pedestrian crosses the middle line of the ego-view as "*Already-crossed*." In this scenario, there is no need to predict any further crossing intent, as the target pedestrian has already safely crossed the road.
58 | 
59 | 
60 | ```shell
61 | python extend_intent_annotation.py *ROOT_PATH*
62 | ```
63 | and the extended annotations are saved to:
64 | - *ROOT_PATH*/PSI2.0_TrainVal/annotations/cognitive_annotation_extended.
65 | 
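To make the extension rule concrete, here is a minimal sketch on toy per-frame lists (the labels are made up; the fill logic mirrors `extend_intent_annotation.py`, and the same idea applies to the driving-decision annotations):

```python
# Toy illustration of the key-frame extension rule (hypothetical labels).
# Empty strings mark frames without an explicit key-frame annotation.
intent = ["", "", "not_sure", "", "", "cross", "", ""]
explanation = ["", "", "standing at the curb", "", "", "looking at traffic", "", ""]

# Intent labels are carried forward to later frames, until the next key-frame.
pivot_int = "not_sure"  # default before the first key-frame
filled_intent = []
for label in intent:
    if label != "":
        pivot_int = label
    filled_intent.append(pivot_int)

# Explanations are carried backward to earlier frames, back to the previous key-frame.
pivot_des = explanation[-1]
filled_explanation = []
for text in reversed(explanation):
    if text != "":
        pivot_des = text
    filled_explanation.insert(0, pivot_des)

print(filled_intent)       # ['not_sure', 'not_sure', 'not_sure', 'not_sure', 'not_sure', 'cross', 'cross', 'cross']
print(filled_explanation)  # frames after the last key-frame keep an empty description, as noted in the scripts
```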
66 | **TASK 2 - Pedestrian Trajectory**: The pedestrian trajectory prediction task uses only the visual annotations (bounding boxes) of the target pedestrian, so there is no need to extend the cognitive annotations to all frames.
67 | 
68 | **TASK 3 - Driving Decision**: The frame at which one of the annotators explicitly makes a driving decision is treated as the "key-frame". Each annotator gives one "decision" at the key-frame and provides one "reasoning/description" of the decision made. For these two cognitive annotations:
69 | 
70 | - Driving Decision: We extend the driving decision annotation to the frames following the current key-frame, until the next frame at which one of the annotators makes another driving decision (e.g., turn or go straight).
71 | - Reasoning/Description: We extend the reasoning/description to the frames prior to the current key-frame, back to the last key-frame at which one of the annotators made another decision, assuming the description refers to the scenes observed by the annotators in support of the driving decision.
72 | 
73 | ```shell
74 | python extend_driving_decision_annotation.py *ROOT_PATH*
75 | ```
76 | and the extended annotations are saved to:
77 | - *ROOT_PATH*/PSI2.0_TrainVal/annotations/cognitive_annotation_extended.
78 | 
79 | ![image](./images/statistics.png)
80 | 
81 | ## Part 3. Baselines for Different Tasks Using the PSI Dataset
82 | 
83 | We provide baselines for all challenge tracks as a quick-start reference for using the PSI 2.0 dataset.
84 | 
85 | ***Track 1 ([Pedestrian Intent Prediction (PIP)](https://github.com/PSI-Intention2022/PSI-Intent-Prediction.git))***
86 | 
87 | ***Track 2 ([Pedestrian Trajectory Prediction (PTP)](https://github.com/PSI-Intention2022/PSI-Trajectory-Prediction.git))***
88 | 
89 | ***Track 3 ([Driver Decision Prediction (DDP)](https://github.com/PSI-Intention2022/PSI-DriverDecision-Prediction.git))***
90 | 
91 | 
92 | # 2. PSI 1.0 Dataset
93 | 
94 | If you would like to use the PSI 1.0 annotations, download them from ~~[[Google Drive]()]~~ [[PSI Homepage](http://pedestriandataset.situated-intent.net)]. Move the downloaded *\*.zip* files to the dataset *ROOT_PATH* and unzip them by
95 | 
96 | ```shell
97 | unzip '*.zip' -d .
98 | rm *.zip
99 | ```
100 | 
101 | The extracted PSI 1.0 dataset follows the same format as PSI 2.0, so feel free to use the *Cognitive Annotation Extension* scripts and *Baselines* prepared for PSI 2.0 to explore PSI 1.0.
102 | 
103 | (*Note:* PSI 2.0 and PSI 1.0 share the first 110 videos but have different annotations. Please check the [[paper](https://arxiv.org/pdf/2112.02604v2.pdf)] and our future updates for more information.)
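As a quick sanity check, the extended annotations can be loaded with the standard `json` module. This is only a minimal sketch (not part of the released scripts): the field names follow the structure written by `extend_intent_annotation.py`, and `ROOT_PATH`/`video_0001` are placeholders.

```python
import json
import os

root_path = "ROOT_PATH"  # placeholder: replace with your dataset root
ann_file = os.path.join(
    root_path,
    "PSI2.0_TrainVal/annotations/cognitive_annotation_extended",
    "video_0001",              # placeholder video folder
    "pedestrian_intent.json",
)

with open(ann_file, "r") as f:
    ann = json.load(f)

for ped_id, ped in ann["pedestrians"].items():
    frames = ped["observed_frames"]
    boxes = ped["cv_annotations"]["bboxes"]
    for annotator_id, cog in ped["cognitive_annotations"].items():
        intents = cog["intent"]  # one extended intent label per observed frame
        # the per-frame lists are expected to stay aligned after the extension step
        assert len(intents) == len(frames) == len(boxes)
        print(ped_id, annotator_id, len(frames), intents[0], intents[-1])
```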
-------------------------------------------------------------------------------- /extend_intent_annotation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import sys 4 | import copy 5 | 6 | def most_frequent(List): 7 | # return the most frequent intent estimation made by all annotators 8 | counter = 0 9 | num = List[0] 10 | 11 | for i in List: 12 | curr_frequency = List.count(i) 13 | if(curr_frequency> counter): 14 | counter = curr_frequency 15 | num = i 16 | 17 | return num 18 | 19 | 20 | if __name__ == '__main__': 21 | print("Extend Intent Annotations of PSI 2.0 Dataset.") 22 | 23 | root_path = sys.argv[1] 24 | 25 | key_frame_anotation_path = os.path.join(root_path, 'PSI2.0_TrainVal/annotations/cognitive_annotation_key_frame') 26 | extended_annotation_path = os.path.join(root_path, 'PSI2.0_TrainVal/annotations/cognitive_annotation_extended') 27 | cv_annotation_path = os.path.join(root_path, 'PSI2.0_TrainVal/annotations/cv_annotation') 28 | 29 | if not os.path.exists(extended_annotation_path): 30 | os.makedirs(extended_annotation_path) 31 | 32 | video_list = sorted(os.listdir(key_frame_anotation_path)) 33 | 34 | for vname in video_list: 35 | # 1. load key-frame annotations 36 | key_intent_ann_file = os.path.join(key_frame_anotation_path, vname, 'pedestrian_intent.json') 37 | with open(key_intent_ann_file, 'r') as f: 38 | key_intent_ann = json.load(f) 39 | 40 | # 2. extend annotations (intent & description) - intent to the future frames, description to the prior frames 41 | extended_intent_ann = copy.deepcopy(key_intent_ann) 42 | 43 | for ped_k in key_intent_ann['pedestrians'].keys(): 44 | observed_frames = key_intent_ann['pedestrians'][ped_k]['observed_frames'] 45 | for ann_k in key_intent_ann['pedestrians'][ped_k]['cognitive_annotations'].keys(): 46 | intent_list = key_intent_ann['pedestrians'][ped_k]['cognitive_annotations'][ann_k]['intent'] 47 | key_frame_list = key_intent_ann['pedestrians'][ped_k]['cognitive_annotations'][ann_k]['key_frame'] 48 | description_list = key_intent_ann['pedestrians'][ped_k]['cognitive_annotations'][ann_k]['description'] 49 | assert len(intent_list) == len(key_frame_list) == len(description_list) 50 | 51 | pivot_int = 'not_sure' # 0.5 # at the beginning if no labels, use "not_sure" 52 | for frame_k in range(len(observed_frames)): 53 | if intent_list[frame_k] == "": 54 | extended_intent_ann['pedestrians'][ped_k]['cognitive_annotations'][ann_k]['intent'][frame_k] = pivot_int 55 | else: 56 | pivot_int = intent_list[frame_k] 57 | 58 | pivot_des = description_list[-1] 59 | for frame_k in range(len(observed_frames)-1, -1, -1): 60 | if description_list[frame_k] == "": 61 | extended_intent_ann['pedestrians'][ped_k]['cognitive_annotations'][ann_k]['description'][frame_k] = pivot_des 62 | else: 63 | pivot_des = description_list[frame_k] 64 | # Note: after this operation, some frames at the end of the observed frame list do not have descriptions, ['description']== "" 65 | 66 | # 3. 
Ignore 'Already-crossed' frames 67 | for ped_k in extended_intent_ann['pedestrians'].keys(): 68 | observed_frames = extended_intent_ann['pedestrians'][ped_k]['observed_frames'] 69 | last_intents = [] 70 | last_key_frames = [] 71 | for ann_k in extended_intent_ann['pedestrians'][ped_k]['cognitive_annotations'].keys(): 72 | intent_list = extended_intent_ann['pedestrians'][ped_k]['cognitive_annotations'][ann_k]['intent'] 73 | key_frame_list = extended_intent_ann['pedestrians'][ped_k]['cognitive_annotations'][ann_k]['key_frame'] 74 | last_intents.append(intent_list[-1]) 75 | for j in range(len(observed_frames)-1, -1, -1): 76 | if key_frame_list[j] != 0: 77 | last_key_frames.append(j) # Note! Here 'j' is not the frame number, it's the idx/position of the frame in the 'observed_frame' list 78 | break 79 | else: 80 | continue 81 | 82 | if most_frequent(last_intents) == 'cross': # only apply to the 'cross' cases 83 | start_box = extended_intent_ann['pedestrians'][ped_k]['cv_annotations']['bboxes'][0] 84 | last_key_box = extended_intent_ann['pedestrians'][ped_k]['cv_annotations']['bboxes'][max(last_key_frames)] 85 | end_box = extended_intent_ann['pedestrians'][ped_k]['cv_annotations']['bboxes'][-1] 86 | if ((last_key_box[0]+last_key_box[2])/2 - 640) * ((end_box[0]+end_box[2])/2 - 640) >= 0: 87 | # Situation 1: By the last key-frame annotation, the target pedestrian already crossed the middle line of the ego-view, and is on the same side as the final position. 88 | # In such case, we use the last annotated key-frame as the end of "intent estimation" task 89 | last_intent_estimate_idx = max(last_key_frames) 90 | else: # < 0 91 | # Situation 2: By the last key-frame annotation, the target pedestrian is at a different position compared to the final observed position. 92 | # In such case, we use the moment when the target pedestrian crossed the middle line of the ego-view as the last frame of "intent estimation" task 93 | for cur_frame_k in range(max(last_key_frames), len(observed_frames)): # pedestrian could change positions several times, e.g., vehicle is turning. Thus starts from the last key-frame 94 | current_box = extended_intent_ann['pedestrians'][ped_k]['cv_annotations']['bboxes'][cur_frame_k] 95 | if ((current_box[0]+current_box[2])/2 - 640) * ((end_box[0]+end_box[2])/2 - 640) >= 0: 96 | # once the pedestrian crossed the middle line of ego-view, to the same side as the last frame, use this moment as the last intent estimation task frame 97 | last_intent_estimate_idx = cur_frame_k 98 | break 99 | else: 100 | continue 101 | # Cut redundant intent extended annotations that not usable for "intent estimation" task 102 | del extended_intent_ann['pedestrians'][ped_k]['observed_frames'][last_intent_estimate_idx+1:] 103 | del extended_intent_ann['pedestrians'][ped_k]['cv_annotations']['bboxes'][last_intent_estimate_idx+1:] 104 | for ann_k in extended_intent_ann['pedestrians'][ped_k]['cognitive_annotations'].keys(): 105 | del extended_intent_ann['pedestrians'][ped_k]['cognitive_annotations'][ann_k]['intent'][last_intent_estimate_idx+1:] 106 | del extended_intent_ann['pedestrians'][ped_k]['cognitive_annotations'][ann_k]['key_frame'][last_intent_estimate_idx+1:] 107 | del extended_intent_ann['pedestrians'][ped_k]['cognitive_annotations'][ann_k]['description'][last_intent_estimate_idx+1:] 108 | 109 | 110 | # 4. 
output extended annotations 111 | output_dir = os.path.join(extended_annotation_path, vname) 112 | if not os.path.exists(output_dir): 113 | os.makedirs(output_dir) 114 | 115 | # write json to file 116 | extended_intent_ann_file = os.path.join(extended_annotation_path, vname, 'pedestrian_intent.json') 117 | with open(extended_intent_ann_file, 'w') as file: 118 | json_string = json.dumps(extended_intent_ann, default=lambda o: o.__dict__, sort_keys=False, indent=4) 119 | file.write(json_string) 120 | 121 | print(vname, ": Original observed frames: {} --> valid intent estimation frames: {}".format( 122 | len(key_intent_ann['pedestrians'][ped_k]['observed_frames']), 123 | len(extended_intent_ann['pedestrians'][ped_k]['observed_frames']) 124 | )) 125 | --------------------------------------------------------------------------------
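For reference, the "Already-crossed" trimming in `extend_intent_annotation.py` hinges on a simple midline test: the sign of (bounding-box center x − 640) tells which side of the ego-view the pedestrian is on. Below is a small illustration with made-up boxes; it assumes [x1, y1, x2, y2] boxes and a 1280-pixel-wide frame, which is what the hard-coded 640 suggests.

```python
# Toy check of the ego-view midline test used above (made-up boxes).
# Boxes are assumed to be [x1, y1, x2, y2]; 640 is half of the presumed 1280-px frame width.
def side_of_midline(box, midline=640.0):
    center_x = (box[0] + box[2]) / 2
    return center_x - midline

last_key_box = [600, 400, 700, 650]   # hypothetical box at the last annotated key-frame
end_box = [900, 380, 1000, 640]       # hypothetical box at the last observed frame

# A non-negative product means both centers lie on the same side of the midline,
# so the last key-frame already marks the end of the intent-estimation segment.
same_side = side_of_midline(last_key_box) * side_of_midline(end_box) >= 0
print(same_side)  # True (centers at x = 650 and x = 950 are both right of 640)
```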