├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── download.sh ├── download_model.sh ├── model ├── __init__.py ├── attentions.py ├── context_encoder.py ├── cuda.py ├── follower_coattend.py ├── follower_coground.py └── speaker_lstm.py ├── requirements.txt ├── simulator ├── connectivity │ ├── 17DRP5sb8fy_connectivity.json │ ├── 1LXtFkjw3qL_connectivity.json │ ├── 1pXnuDYAj8r_connectivity.json │ ├── 29hnd4uzFmX_connectivity.json │ ├── 2azQ1b91cZZ_connectivity.json │ ├── 2n8kARJN3HM_connectivity.json │ ├── 2t7WUuJeko7_connectivity.json │ ├── 5LpN3gDmAk7_connectivity.json │ ├── 5ZKStnWn8Zo_connectivity.json │ ├── 5q7pvUzZiYa_connectivity.json │ ├── 759xd9YjKW5_connectivity.json │ ├── 7y3sRwLe3Va_connectivity.json │ ├── 8194nk5LbLH_connectivity.json │ ├── 82sE5b5pLXE_connectivity.json │ ├── 8WUmhLawc2A_connectivity.json │ ├── ARNzJeq3xxb_connectivity.json │ ├── B6ByNegPMKs_connectivity.json │ ├── D7G3Y4RVNrH_connectivity.json │ ├── D7N2EKCX4Sj_connectivity.json │ ├── E9uDoFAP3SH_connectivity.json │ ├── EDJbREhghzL_connectivity.json │ ├── EU6Fwq7SyZv_connectivity.json │ ├── GdvgFV5R1Z5_connectivity.json │ ├── HxpKQynjfin_connectivity.json │ ├── JF19kD82Mey_connectivity.json │ ├── JeFG25nYj2p_connectivity.json │ ├── JmbYfDe2QKZ_connectivity.json │ ├── PX4nDJXEHrG_connectivity.json │ ├── Pm6F8kyY3z2_connectivity.json │ ├── PuKPg4mmafe_connectivity.json │ ├── QUCTc6BB5sX_connectivity.json │ ├── README.md │ ├── RPmz2sHmrrY_connectivity.json │ ├── S9hNv5qa7GM_connectivity.json │ ├── SN83YJsR3w2_connectivity.json │ ├── TbHJrupSAjP_connectivity.json │ ├── ULsKaCPVFJR_connectivity.json │ ├── UwV83HsGsw3_connectivity.json │ ├── Uxmj2M2itWa_connectivity.json │ ├── V2XKFyX4ASd_connectivity.json │ ├── VFuaQ6m2Qom_connectivity.json │ ├── VLzqgDo317F_connectivity.json │ ├── VVfe2KiqLaN_connectivity.json │ ├── Vt2qJdWjCF2_connectivity.json │ ├── Vvot9Ly1tCj_connectivity.json │ ├── VzqfbhrpDEA_connectivity.json │ ├── WYY7iVyf5p8_connectivity.json │ ├── X7HyMhZNoso_connectivity.json │ ├── XcA2TqTSSAj_connectivity.json │ ├── YFuZgdQ5vWj_connectivity.json │ ├── YVUC4YcDtcY_connectivity.json │ ├── YmJkqBEsHnH_connectivity.json │ ├── Z6MFQCViBuw_connectivity.json │ ├── ZMojNkEp431_connectivity.json │ ├── aayBHfsNo7d_connectivity.json │ ├── ac26ZMwG7aT_connectivity.json │ ├── b8cTxDM8gDG_connectivity.json │ ├── cV4RVeZvu5T_connectivity.json │ ├── dhjEzFoUFzH_connectivity.json │ ├── e9zR4mvMWw7_connectivity.json │ ├── fzynW3qQPVF_connectivity.json │ ├── gTV8FGcVJC9_connectivity.json │ ├── gYvKGZ5eRqb_connectivity.json │ ├── gZ6f7yhEvPG_connectivity.json │ ├── gxdoqLR6rwA_connectivity.json │ ├── i5noydFURQK_connectivity.json │ ├── jh4fc5c5qoQ_connectivity.json │ ├── jtcxE69GiFV_connectivity.json │ ├── kEZ7cmS4wCh_connectivity.json │ ├── mJXqzFtmKg4_connectivity.json │ ├── oLBMNvg9in8_connectivity.json │ ├── p5wJjkQkbXX_connectivity.json │ ├── pLe4wQe7qrG_connectivity.json │ ├── pRbA3pwrgk9_connectivity.json │ ├── pa4otMbVnkk_connectivity.json │ ├── q9vSo1VnCiC_connectivity.json │ ├── qoiz87JEwZ2_connectivity.json │ ├── r1Q1Z4BcV1o_connectivity.json │ ├── r47D5H71a5s_connectivity.json │ ├── rPc6DW4iMge_connectivity.json │ ├── rqfALeAoiTq_connectivity.json │ ├── s8pcmisQ38h_connectivity.json │ ├── sKLMLpTHeUy_connectivity.json │ ├── sT4fr6TAbpF_connectivity.json │ ├── scans.txt │ ├── uNb9QFRL6hY_connectivity.json │ ├── ur6pFq6Qu1A_connectivity.json │ ├── vyrNrziPKCB_connectivity.json │ ├── wc2JMjhGNzB_connectivity.json │ ├── x8F5xyUWy9e_connectivity.json │ ├── yqstnuAEVhm_connectivity.json │ └── 
zsNo4HB9uLZ_connectivity.json ├── envs │ ├── __init__.py │ ├── env.py │ ├── envs_utils.py │ ├── image_feature.py │ └── paths.py └── resnet_feature │ └── ResNet-152-deploy.prototxt ├── src ├── __init__.py ├── eval_follower.py ├── follower.py ├── params.py ├── process_data.py ├── speaker.py ├── train_follower.py ├── utils.py ├── val_follower.py └── vocab │ ├── __init__.py │ ├── tokenizer.py │ └── vocab_path.py ├── tasks ├── R2R │ ├── README.md │ ├── __init__.py │ └── requirements.txt ├── R4R │ ├── README.md │ ├── __init__.py │ ├── cls.py │ ├── dtw.py │ ├── graph_utils.py │ └── r4r_generate_data.py └── __init__.py └── teaser ├── babywalk_curriculum.jpg └── pytorch-logo-dark.png /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | /data 3 | /build 4 | /doxygen 5 | *.tsv 6 | /sim_imgs 7 | *.so 8 | *kdev4* 9 | *.caffemodel 10 | *.caffemodel.h5 11 | *.pyc 12 | *.out 13 | *.zip 14 | plot* 15 | 16 | simulator/*~ 17 | simulator/data 18 | simulator/build* 19 | simulator/doxygen 20 | simulator/*.tsv 21 | simulator/sim_imgs 22 | simulator/*.so 23 | simulator/*kdev4* 24 | simulator/*.caffemodel 25 | simulator/*.caffemodel.h5 26 | simulator/*.pyc 27 | simulator/*.out 28 | simulator/*.zip 29 | simulator/envs/*.swp 30 | 31 | /tasks/R2R/data/*.json 32 | /tasks/R2R/plot* 33 | /tasks/R2R/results/ 34 | /tasks/R2R/snapshots/ 35 | /tasks/R2R/data/v1 36 | /tasks/R2R/*.swp 37 | /tasks/R2R/follower/ 38 | /tasks/R2R/speaker/ 39 | /tasks/R2R/follower_traj/ 40 | 41 | /tasks/R4R/data/*.json 42 | /tasks/R4R/plot* 43 | /tasks/R4R/results/ 44 | /tasks/R4R/snapshots/ 45 | /tasks/R4R/data/v1 46 | /tasks/R4R/*.swp 47 | /tasks/R4R/follower/ 48 | /tasks/R4R/speaker/ 49 | /tasks/R4R/follower_traj/ 50 | 51 | /model/*.swp 52 | # intellij 53 | .idea/ 54 | *.iml 55 | 56 | img_features 57 | /run_script.sh 58 | /val_all.bash 59 | /val_all.sh 60 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "simulator/pybind11"] 2 | path = simulator/pybind11 3 | url = https://github.com/pybind/pybind11.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Sha-Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BabyWalk: Going Farther in Vision-and-Language Navigation by Taking Baby Steps 2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 3 | 4 | This is the PyTorch implementation of our paper: 5 | 6 | **BabyWalk: Going Farther in Vision-and-Language Navigation by Taking Baby Steps**
7 | Wang Zhu*, Hexiang Hu*, Jiacheng Chen, Zhiwei Deng, Vihan Jain, Eugene Ie, Fei Sha
8 | 2020 Annual Conference of the Association for Computational Linguistics (ACL 2020) 9 | 10 | [[arXiv](http://arxiv.org/abs/2005.04625)] [[GitHub](https://github.com/Sha-Lab/babywalk)] 11 | 12 | ## Abstract 13 | Learning to follow instructions is of fundamental importance to autonomous agents for vision-and-language navigation (VLN). In this paper, we study how an agent can navigate long paths when learning from a corpus that consists of shorter ones. We show that existing state-of-the-art agents do not generalize well. To this end, we propose BabyWalk, a new VLN agent that learns to navigate by decomposing long instructions into shorter ones (BabySteps) and completing them sequentially. A specially designed memory buffer is used by the agent to turn its past experiences into contexts for future steps. The learning process is composed of two phases. In the first phase, the agent uses imitation learning from demonstration to accomplish BabySteps. In the second phase, the agent uses curriculum-based reinforcement learning to maximize rewards on navigation tasks with increasingly longer instructions. We create two new benchmark datasets (of long navigation tasks) and use them in conjunction with existing ones to examine BabyWalk's generalization ability. Empirical results show that BabyWalk achieves state-of-the-art results on several metrics; in particular, it is able to follow long instructions better. 14 |

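To make the decompose-then-execute idea above concrete, here is a minimal illustrative sketch of a BabyWalk-style rollout (the helper names `split_into_babysteps`, `follow_babystep`, and `MemoryBuffer` below are hypothetical placeholders, not the APIs of this repository):

```
# Minimal illustrative sketch of a BabyWalk-style rollout.
# The names below are hypothetical placeholders, not this repository's API.
from typing import List, Tuple


def split_into_babysteps(instruction: str) -> List[str]:
  # Stand-in for the landmark-based segmentation of a long instruction
  # into shorter BabySteps; here we naively split on sentence boundaries.
  return [s.strip() for s in instruction.split('.') if s.strip()]


class MemoryBuffer:
  '''Stores summaries of completed BabySteps as context for later ones.'''

  def __init__(self) -> None:
    self.history: List[Tuple[str, List[str]]] = []

  def add(self, babystep: str, trajectory: List[str]) -> None:
    self.history.append((babystep, trajectory))

  def context(self) -> str:
    return ' '.join(step for step, _ in self.history)


def follow_babystep(babystep: str, context: str) -> List[str]:
  # Stand-in for a rollout of the learned follower policy conditioned on
  # the current BabyStep and the memory context; returns visited viewpoints.
  return ['viewpoint_after[%s]' % babystep]


def babywalk_rollout(long_instruction: str) -> List[str]:
  memory = MemoryBuffer()
  trajectory: List[str] = []
  for babystep in split_into_babysteps(long_instruction):
    sub_trajectory = follow_babystep(babystep, memory.context())
    memory.add(babystep, sub_trajectory)
    trajectory.extend(sub_trajectory)
  return trajectory


print(babywalk_rollout('Walk past the sofa. Turn left at the kitchen. Stop near the stairs.'))
```

The actual training and evaluation entry points of this repository are `src/train_follower.py` and `src/val_follower.py`, described in the sections below.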
15 | 16 | 17 | ## Installation 18 | 19 | 1. Install Python 3.7 (Anaconda recommended: https://www.anaconda.com/distribution/). 20 | 2. Install PyTorch following the instructions on https://pytorch.org/ (we used PyTorch 1.1.0 in our experiments). 21 | 3. Download this repository or clone with Git, and then enter the root directory of the repository: 22 | ``` 23 | git clone https://github.com/Sha-Lab/babywalk 24 | cd babywalk 25 | ``` 26 | 4. Check the installation of the required packages listed in requirements.txt. 27 | 5. Download and preprocess the data: 28 | ``` 29 | chmod +x download.sh 30 | ./download.sh 31 | ``` 32 | After this step, check: 33 | + `simulator/resnet_feature/` should contain `ResNet-152-imagenet.tsv`. 34 | + `simulator` should contain `total_adj_list.json`, which replaces the Matterport3D simulator. 35 | + `src/vocab/vocab_data` should contain the vocabulary and its GloVe embedding files `train_vocab.txt` and `train_glove.npy`. 36 | + `tasks/` should contain `R2R`, `R4R`, `R6R`, `R8R`, `R2T8`, each with a data folder containing training/evaluation data. 37 | 38 | **Updates**: The old link for the ResNet feature has expired. Please see [here](https://drive.google.com/file/d/1HjEH3EQt-aHSjolg0VnX_YF1UEHiXLfT/view?usp=sharing) for the new link and the additional [**landmark alignment code**](https://drive.google.com/file/d/1soXYE-IMveMpvjAi6lNSpJURE3B4IYk3/view?usp=sharing). 39 | 40 | ## Training and evaluation 41 | Here we take training BABYWALK on R2R as an example. 42 | 43 | ### Warmup with IL 44 | ``` 45 | CUDA_VISIBLE_DEVICES=0 python src/train_follower.py \ 46 | --split_postfix "_landmark" \ 47 | --task_name R2R \ 48 | --n_iters 50000 \ 49 | --model_name "follower_bbw" \ 50 | --il_mode "landmark_split" \ 51 | --one_by_one \ 52 | --one_by_one_mode "landmark" \ 53 | --history \ 54 | --log_every 100 55 | ``` 56 | 57 | ### Training with CRL 58 | ``` 59 | CUDA_VISIBLE_DEVICES=0 python src/train_follower.py \ 60 | --split_postfix "_landmark" \ 61 | --task_name R2R \ 62 | --n_iters 30000 \ 63 | --curriculum_iters 5000 \ 64 | --model_name "follower_bbw_crl" \ 65 | --one_by_one \ 66 | --one_by_one_mode "landmark" \ 67 | --history \ 68 | --log_every 100 \ 69 | --reward \ 70 | --reward_type "cls" \ 71 | --batch_size 64 \ 72 | --curriculum_rl \ 73 | --max_curriculum 4 \ 74 | --no_speaker \ 75 | --follower_prefix "tasks/R2R/follower/snapshots/follower_bbw_sample_train_iter_30000" 76 | ``` 77 | 78 | ### Other baselines 79 | Here we take training Speaker-Follower and Reinforced Cross-modal Matching on R2R as examples. 80 | + Speaker-Follower 81 | ``` 82 | CUDA_VISIBLE_DEVICES=0 python src/train_follower.py \ 83 | --task_name R2R \ 84 | --n_iters 50000 \ 85 | --model_name "follower_sf_aug" \ 86 | --add_augment 87 | CUDA_VISIBLE_DEVICES=0 python src/train_follower.py \ 88 | --task_name R2R \ 89 | --n_iters 20000 \ 90 | --model_name "follower_sf" \ 91 | --follower_prefix "tasks/R2R/follower/snapshots/best_model" 92 | ``` 93 | + Reinforced Cross-modal Matching 94 | ``` 95 | CUDA_VISIBLE_DEVICES=0 python src/train_follower.py \ 96 | --task_name R2R \ 97 | --n_iters 20000 \ 98 | --model_name "follower_rcm_cls" \ 99 | --reward \ 100 | --reward_type "cls" \ 101 | --batch_size 64 \ 102 | --no_speaker \ 103 | --follower_prefix "tasks/R2R/follower/snapshots/follower_sf_aug_sample_train-literal_speaker_data_augmentation_iter_50000" 104 | ``` 105 | 106 | ### Evaluation 107 | Here we take a model trained with BABYWALK on R2R as an example.
108 | + Evaluate on the validation unseen data of Room 2-to-8. 109 | ``` 110 | CUDA_VISIBLE_DEVICES=0 python src/val_follower.py \ 111 | --task_name R2T8 \ 112 | --split_postfix "_landmark" \ 113 | --one_by_one \ 114 | --one_by_one_mode "landmark" \ 115 | --model_name "follower_bbw" \ 116 | --history \ 117 | --follower_prefix "tasks/R2R/follower/snapshots/best_model" 118 | ``` 119 | 120 | + Evaluate on the validation seen / unseen data of R**x**R (**x**=2,4,6,8). 121 | + change ``` --task_name R2T8 ``` to ``` --task_name RxR ``` 122 | + Evaluate on the test data of R2R. 123 | + set ``` --task_name R2R ``` 124 | + add ``` --use test ``` 125 | + For SF/RCM models, evaluate on R**x**R (**x**=2,4,6,8). 126 | + set ``` --task_name RxR ``` 127 | + set ``` --max_steps 5*x ``` and ``` --max_ins_len 50*x ``` 128 | ## Download the models reported in our paper 129 | ``` 130 | chmod +x download_model.sh 131 | ./download_model.sh 132 | ``` 133 | ### Performance comparison on SDTW 134 | **Models trained on R4R** 135 | | Model | Eval R2R | Eval R4R | Eval R6R | Eval R8R | 136 | |:-------------------:|:--------:|:--------:|:--------:|:--------:| 137 | | SF | 14.8 | 9.2 | 5.2 | 5.0 | 138 | | RCM(FIDELITY) | 18.3 | 13.7 | 7.9 | 6.1 | 139 | | REGRETFUL | 13.4 | 13.5 | 7.5 | 5.6 | 140 | | FAST | 14.2 | 15.5 | 7.7 | 6.3 | 141 | | BABYWALK | 27.8 | 17.3 | 13.1 | 11.5 | 142 | | BABYWALK(COGROUND) | ***31.6*** | ***20.0*** | ***15.9*** | ***13.9*** | 143 | 144 | 145 | **Models trained on R2R** 146 | | Model | Eval R2R | Eval R4R | Eval R6R | Eval R8R | 147 | |:-------------------:|:--------:|:--------:|:--------:|:--------:| 148 | | SF | 27.2 | 6.7 | 7.2 | 3.8 | 149 | | RCM(FIDELITY) | 34.4 | 7.2 | 8.4 | 4.3 | 150 | | REGRETFUL | 40.6 | 9.8 | 6.8 | 2.4 | 151 | | FAST | ***45.4*** | 7.2 | 8.5 | 2.4 | 152 | | BABYWALK | 36.9 | ***13.8*** | ***11.2*** | ***9.8*** | 153 | 154 | 155 | # Citation 156 | 157 | Please cite the following BibTeX entry if you use any content from this repository: 158 | ``` 159 | @inproceedings{zhu2020babywalk, 160 | title = "{B}aby{W}alk: Going Farther in Vision-and-Language Navigation by Taking Baby Steps", 161 | author = "Zhu, Wang and Hu, Hexiang and Chen, Jiacheng and Deng, Zhiwei and Jain, Vihan and Ie, Eugene and Sha, Fei", 162 | booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", 163 | year = "2020", 164 | publisher = "Association for Computational Linguistics", 165 | pages = "2539--2556", 166 | } 167 | ``` 168 | 169 | -------------------------------------------------------------------------------- /download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # vocab data 4 | mkdir -p src/vocab/vocab_data 5 | wget https://www.dropbox.com/s/r71i31xpm1zy3oy/sub_train_vocab.txt?dl=0 -O src/vocab/vocab_data/sub_train_vocab.txt 6 | wget https://www.dropbox.com/s/xqt6et0i1g41t88/train_glove.npy?dl=0 -O src/vocab/vocab_data/train_glove.npy 7 | wget https://www.dropbox.com/s/l7dee5fls07t9q0/train_vocab.txt?dl=0 -O src/vocab/vocab_data/train_vocab.txt 8 | wget https://www.dropbox.com/s/cjapgv3rpxrq1ie/trainval_glove.npy?dl=0 -O src/vocab/vocab_data/trainval_glove.npy 9 | wget https://www.dropbox.com/s/3s2plada1vttxuv/trainval_vocab.txt?dl=0 -O src/vocab/vocab_data/trainval_vocab.txt 10 | 11 | # resnet feature 12 | mkdir -p simulator/resnet_feature/ 13 | curl -c /tmp/cookies "https://drive.google.com/uc?export=download&id=1HjEH3EQt-aHSjolg0VnX_YF1UEHiXLfT" > /tmp/intermezzo.html
14 | curl -L -b /tmp/cookies "https://drive.google.com$(cat /tmp/intermezzo.html | grep -Po 'uc-download-link" [^>]* href="\K[^"]*' | sed 's/\&/\&/g')" > simulator/resnet_feature/ResNet-152-imagenet.zip 15 | unzip simulator/resnet_feature/ResNet-152-imagenet.zip -d simulator/resnet_feature 16 | 17 | # adjacency dict 18 | wget https://www.dropbox.com/s/6a076293c3o77gi/total_adj_list.json?dl=0 -O simulator/total_adj_list.json 19 | 20 | # training/eval data 21 | mkdir -p tasks/R2R/data 22 | mkdir -p tasks/R4R/data 23 | mkdir -p tasks/R6R/data 24 | mkdir -p tasks/R8R/data 25 | mkdir -p tasks/R2T8/data 26 | wget https://www.dropbox.com/s/2v3f72vpoj53r6d/R2R_data.zip?dl=0 -O tasks/R2R/data/R2R_data.zip 27 | wget https://www.dropbox.com/s/7n7ptzkjr601dq9/R4R_data.zip?dl=0 -O tasks/R4R/data/R4R_data.zip 28 | wget https://www.dropbox.com/s/bjqwu9tn0t6f50r/R6R_data.zip?dl=0 -O tasks/R6R/data/R6R_data.zip 29 | wget https://www.dropbox.com/s/kdid25goi88sgxo/R8R_data.zip?dl=0 -O tasks/R8R/data/R8R_data.zip 30 | wget https://www.dropbox.com/s/aswlh36v68x3al0/R2T8_data.zip?dl=0 -O tasks/R2T8/data/R2T8_data.zip 31 | unzip tasks/R2R/data/R2R_data.zip -d tasks/R2R/data 32 | unzip tasks/R4R/data/R4R_data.zip -d tasks/R4R/data 33 | unzip tasks/R6R/data/R6R_data.zip -d tasks/R6R/data 34 | unzip tasks/R8R/data/R8R_data.zip -d tasks/R8R/data 35 | unzip tasks/R2T8/data/R2T8_data.zip -d tasks/R2T8/data 36 | 37 | # download speaker model 38 | mkdir -p tasks/R2R/speaker/snapshots 39 | mkdir -p tasks/R4R/speaker/snapshots 40 | mkdir -p tasks/R6R/speaker/snapshots 41 | mkdir -p tasks/R8R/speaker/snapshots 42 | wget https://www.dropbox.com/s/65z90zktd7w6dtz/speaker.zip?dl=0 -O tasks/R2R/speaker/snapshots/speaker.zip 43 | wget https://www.dropbox.com/s/q223j0vn1ofd89z/speaker.zip?dl=0 -O tasks/R4R/speaker/snapshots/speaker.zip 44 | unzip tasks/R2R/speaker/snapshots/speaker.zip -d tasks/R2R/speaker/snapshots 45 | unzip tasks/R4R/speaker/snapshots/speaker.zip -d tasks/R4R/speaker/snapshots 46 | cp tasks/R4R/speaker/snapshots/* tasks/R6R/speaker/snapshots 47 | cp tasks/R6R/speaker/snapshots/* tasks/R8R/speaker/snapshots -------------------------------------------------------------------------------- /download_model.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # babywalk models 4 | wget https://www.dropbox.com/s/buxgob1xusp4401/follower_r2r_bbw.zip?dl=0 -O tasks/R2R/follower/follower_r2r_bbw.zip 5 | wget https://www.dropbox.com/s/okpjhgmdvzrkwiq/follower_r4r_bbw.zip?dl=0 -O tasks/R4R/follower/follower_r4r_bbw.zip 6 | wget https://www.dropbox.com/s/mc8iuav1g5buqfp/follower_r6r_bbw.zip?dl=0 -O tasks/R6R/follower/follower_r6r_bbw.zip 7 | wget https://www.dropbox.com/s/ttwff1nv5sthd9t/follower_r8r_bbw.zip?dl=0 -O tasks/R8R/follower/follower_r8r_bbw.zip 8 | 9 | # babywalk(coground) models 10 | wget https://www.dropbox.com/s/l2tnmb0ej6y5l0y/follower_coground_r4r_bbw.zip?dl=0 -O tasks/R4R/follower/follower_coground_r4r_bbw.zip 11 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("model") 4 | -------------------------------------------------------------------------------- /model/attentions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class SoftDotAttention(nn.Module): 6 | '''Soft Dot Attention. 
7 | 8 | Ref: http://www.aclweb.org/anthology/D15-1166 9 | Adapted from PyTorch OPEN NMT. 10 | ''' 11 | 12 | def __init__(self, dim, ctx_dim=None): 13 | '''Initialize layer.''' 14 | super(SoftDotAttention, self).__init__() 15 | if ctx_dim is None: 16 | ctx_dim = dim 17 | self.linear_in = nn.Linear(dim, ctx_dim, bias=False) 18 | self.sm = nn.Softmax(dim=1) 19 | self.linear_out = nn.Linear(dim + ctx_dim, dim, bias=False) 20 | self.tanh = nn.Tanh() 21 | 22 | def forward(self, h, context, mask=None): 23 | '''Propagate h through the network. 24 | 25 | h: batch x dim 26 | context: batch x seq_len x dim 27 | mask: batch x seq_len indices to be masked 28 | ''' 29 | target = self.linear_in(h).unsqueeze(2) # batch x dim x 1 30 | 31 | # Get attention 32 | attn = torch.bmm(context, target).squeeze(2) # batch x seq_len 33 | if mask is not None: 34 | # -Inf masking prior to the softmax 35 | attn.data.masked_fill_(mask, -float('inf')) 36 | attn = self.sm(attn) 37 | attn3 = attn.view(attn.size(0), 1, attn.size(1)) # batch x 1 x seq_len 38 | 39 | weighted_context = torch.bmm(attn3, context).squeeze(1) # batch x dim 40 | h_tilde = torch.cat((weighted_context, h), 1) 41 | 42 | h_tilde = self.tanh(self.linear_out(h_tilde)) 43 | return h_tilde, attn 44 | 45 | 46 | class WhSoftDotAttention(nn.Module): 47 | ''' Visual Dot Attention Layer. ''' 48 | 49 | def __init__(self, h_dim, v_dim=None): 50 | '''Initialize layer.''' 51 | super(WhSoftDotAttention, self).__init__() 52 | if v_dim is None: 53 | v_dim = h_dim 54 | self.h_dim = h_dim 55 | self.v_dim = v_dim 56 | self.linear_in_h = nn.Linear(h_dim, v_dim, bias=True) 57 | self.sm = nn.Softmax(dim=1) 58 | 59 | def forward(self, h, k, mask=None, v=None): 60 | '''Propagate h through the network. 61 | h: batch x h_dim 62 | k: batch x v_num x v_dim 63 | ''' 64 | target = self.linear_in_h(h).unsqueeze(2) # batch x dot_dim x 1 65 | attn = torch.bmm(k, target).squeeze(2) # batch x v_num 66 | if mask is not None: 67 | attn.data.masked_fill_(mask, -float('inf')) 68 | attn_sm = self.sm(attn) 69 | attn3 = attn_sm.view(attn.size(0), 1, attn.size(1)) # batch x 1 x v_num 70 | ctx = v if v is not None else k 71 | weighted_context = torch.bmm(attn3, ctx).squeeze(1) # batch x v_dim 72 | return weighted_context, attn 73 | 74 | 75 | class TextDotAttention(nn.Module): 76 | '''Soft Dot Attention. 77 | 78 | Ref: http://www.aclweb.org/anthology/D15-1166 79 | Adapted from PyTorch OPEN NMT. 80 | ''' 81 | 82 | def __init__(self, dim): 83 | '''Initialize layer.''' 84 | super(TextDotAttention, self).__init__() 85 | self.linear_in = nn.Linear(dim * 2, dim, bias=False) 86 | self.sm = nn.Softmax(dim=1) 87 | self.linear_out = nn.Linear(dim * 2, dim, bias=False) 88 | self.tanh = nn.Tanh() 89 | 90 | def forward(self, h, c, context, mask=None): 91 | '''Propagate h through the network. 
92 | 93 | h: batch x dim 94 | context: batch x seq_len x dim 95 | mask: batch x seq_len indices to be masked 96 | ''' 97 | target = self.linear_in(torch.cat((h, c), -1)).unsqueeze( 98 | 2) # batch x dim x 1 99 | 100 | # Get attention 101 | attn = torch.bmm(context, target).squeeze(2) # batch x seq_len 102 | if mask is not None: 103 | # -Inf masking prior to the softmax 104 | attn.data.masked_fill_(mask, -float('inf')) 105 | attn = self.sm(attn) 106 | attn3 = attn.view(attn.size(0), 1, attn.size(1)) # batch x 1 x seq_len 107 | 108 | weighted_context = torch.bmm(attn3, context).squeeze(1) # batch x dim 109 | h_tilde = torch.cat((weighted_context, h), 1) 110 | 111 | h_tilde = self.tanh(self.linear_out(h_tilde)) 112 | return h_tilde, attn 113 | 114 | 115 | class VisualSoftDotAttention(nn.Module): 116 | ''' Visual Dot Attention Layer. ''' 117 | 118 | def __init__(self, h_dim, v_dim, dot_dim=256): 119 | '''Initialize layer.''' 120 | super(VisualSoftDotAttention, self).__init__() 121 | self.linear_in_h = nn.Linear(h_dim, dot_dim, bias=True) 122 | self.linear_in_v = nn.Linear(v_dim, dot_dim, bias=True) 123 | self.sm = nn.Softmax(dim=1) 124 | 125 | def forward(self, h, visual_context, mask=None): 126 | '''Propagate h through the network. 127 | 128 | h: batch x h_dim 129 | visual_context: batch x v_num x v_dim 130 | ''' 131 | target = self.linear_in_h(h).unsqueeze(2) # batch x dot_dim x 1 132 | context = self.linear_in_v(visual_context) # batch x v_num x dot_dim 133 | 134 | # Get attention 135 | attn = torch.bmm(context, target).squeeze(2) # batch x v_num 136 | attn = self.sm(attn) 137 | attn3 = attn.view(attn.size(0), 1, attn.size(1)) # batch x 1 x v_num 138 | 139 | weighted_context = torch.bmm( 140 | attn3, visual_context).squeeze(1) # batch x v_dim 141 | return weighted_context, attn 142 | -------------------------------------------------------------------------------- /model/context_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 4 | from attentions import SoftDotAttention 5 | from cuda import try_cuda 6 | 7 | 8 | class ContextEncoder(nn.Module): 9 | def __init__(self, feature_size, hidden_size, dropout_ratio): 10 | ''' Bidirectional LSTM ContextEncoder. ''' 11 | super().__init__() 12 | self.hidden_size = hidden_size 13 | self.feature_size = feature_size 14 | self.lstm = nn.LSTM(feature_size, self.hidden_size // 2, 1, 15 | batch_first=True, bidirectional=True) 16 | self.drop = nn.Dropout(p=dropout_ratio) 17 | self.attention_layer = SoftDotAttention(self.hidden_size, 18 | ctx_dim=feature_size) 19 | self.post_lstm = nn.LSTM(self.hidden_size, self.hidden_size // 2, 1, 20 | batch_first=True, bidirectional=True) 21 | 22 | def forward(self, feature, action_embeds, lengths): 23 | ''' 24 | :param action_embeds: (batch_size, length, 2048). The feature of the view 25 | :param feature: (batch_size, length, 36, 2048). 
The action taken (with the image feature) 26 | :param lengths: 27 | :return: context with shape (batch_size, length, hidden_size) -> (batch_size, hidden_size) 28 | ''' 29 | 30 | # LSTM on the action embed 31 | new_lengths = [1 if l == 0 else l for l in lengths] 32 | packed_embeds = pack_padded_sequence(action_embeds, new_lengths, 33 | enforce_sorted=False, 34 | batch_first=True) 35 | 36 | enc_h, _ = self.lstm(packed_embeds) 37 | ctx, _ = pad_packed_sequence(enc_h, batch_first=True) 38 | ctx = self.drop(ctx) 39 | 40 | # Att and Handle with the shape 41 | batch_size, max_length, _ = ctx.size() 42 | x, _ = self.attention_layer( # Attend to the feature map 43 | ctx.contiguous().view(-1, self.hidden_size), 44 | # (batch, length, hidden) --> (batch x length, hidden) 45 | feature.view(batch_size * max_length, -1, self.feature_size), 46 | # (batch, length, # of images, feature_size) --> (batch x length, # of images, feature_size) 47 | ) 48 | x = x.view(batch_size, max_length, -1) 49 | x = self.drop(x) 50 | 51 | # Post LSTM layer 52 | packed_x = pack_padded_sequence(x, new_lengths, 53 | enforce_sorted=False, batch_first=True) 54 | # self.post_lstm.flatten_parameters() 55 | enc_x, _ = self.post_lstm(packed_x) 56 | x, _ = pad_packed_sequence(enc_x, batch_first=True) 57 | 58 | out = torch.stack([x[i, l - 1, :] if l > 0 59 | else try_cuda(torch.zeros(self.hidden_size)) 60 | for i, l in enumerate(lengths)], dim=0) 61 | return out 62 | 63 | 64 | class LSTMMemory(nn.Module): 65 | def __init__(self, hidden_size): 66 | '''Initialize layer.''' 67 | super().__init__() 68 | self.hidden_size = hidden_size 69 | self.lstm = nn.LSTM(self.hidden_size, self.hidden_size, 70 | batch_first=True, bidirectional=False) 71 | 72 | def forward(self, hidden_variables, lengths): 73 | if len(hidden_variables) != 0: 74 | x = hidden_variables # [batch * length * hidden] 75 | ctx, _ = self.lstm(x) 76 | out = torch.stack([ctx[i, l - 1, :] if l > 0 77 | else try_cuda(torch.zeros(self.hidden_size)) 78 | for i, l in enumerate(lengths)], dim=0) 79 | return out 80 | else: 81 | return try_cuda(torch.zeros(len(lengths), self.hidden_size)) 82 | -------------------------------------------------------------------------------- /model/cuda.py: -------------------------------------------------------------------------------- 1 | def try_cuda(pytorch_obj): 2 | import torch.cuda 3 | try: 4 | disabled = torch.cuda.disabled 5 | except: 6 | disabled = False 7 | if torch.cuda.is_available() and not disabled: 8 | return pytorch_obj.cuda() 9 | else: 10 | return pytorch_obj 11 | -------------------------------------------------------------------------------- /model/follower_coattend.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 6 | from cuda import try_cuda 7 | from attentions import SoftDotAttention, VisualSoftDotAttention 8 | from context_encoder import ContextEncoder, LSTMMemory 9 | 10 | 11 | class EltwiseProdScoring(nn.Module): 12 | ''' 13 | Linearly mapping h and v to the same dimension, and do a elementwise 14 | multiplication and a linear scoring 15 | ''' 16 | 17 | def __init__(self, h_dim, a_dim, dot_dim=256): 18 | '''Initialize layer.''' 19 | super(EltwiseProdScoring, self).__init__() 20 | self.linear_in_h = nn.Linear(h_dim, dot_dim, bias=True) 21 | self.linear_in_a = nn.Linear(a_dim, dot_dim, bias=True) 22 | self.linear_out = nn.Linear(dot_dim, 1, 
bias=True) 23 | 24 | def forward(self, h, all_u_t, mask=None): 25 | '''Propagate h through the network. 26 | 27 | h: batch x h_dim 28 | all_u_t: batch x a_num x a_dim 29 | ''' 30 | target = self.linear_in_h(h).unsqueeze(1) # batch x 1 x dot_dim 31 | context = self.linear_in_a(all_u_t) # batch x a_num x dot_dim 32 | eltprod = torch.mul(target, context) # batch x a_num x dot_dim 33 | logits = self.linear_out(eltprod).squeeze(2) # batch x a_num 34 | return logits 35 | 36 | 37 | class EltwiseProdScoringWithContext(nn.Module): 38 | ''' 39 | Linearly mapping h and v to the same dimension, and do a elementwise 40 | multiplication and a linear scoring 41 | ''' 42 | 43 | def __init__(self, h_dim, a_dim, dot_dim=512, dropout=0.5): 44 | '''Initialize layer.''' 45 | super(EltwiseProdScoringWithContext, self).__init__() 46 | self.linear_combine = nn.Sequential( 47 | nn.Linear(h_dim * 3, dot_dim, bias=True), 48 | nn.ReLU(), 49 | nn.Linear(dot_dim, dot_dim, bias=True) 50 | ) 51 | self.linear_in_a = nn.Linear(a_dim, dot_dim, bias=True) 52 | self.linear_out = nn.Linear(dot_dim, 1, bias=True) 53 | 54 | def forward(self, h, context, text_context, all_u_t, mask=None): 55 | '''Propagate h through the network. 56 | 57 | h: batch x h_dim 58 | all_u_t: batch x a_num x a_dim 59 | ''' 60 | combine = torch.cat([F.normalize(h), 61 | F.normalize(context), 62 | F.normalize(text_context)], dim=1) 63 | target = self.linear_combine(combine).unsqueeze(1) # batch x 1 x dot_dim 64 | actions = self.linear_in_a(all_u_t) # batch x a_num x dot_dim 65 | eltprod = torch.mul(target, actions) # batch x a_num x dot_dim 66 | logits = self.linear_out(eltprod).squeeze(2) # batch x a_num 67 | return logits 68 | 69 | 70 | class EncoderLSTM(nn.Module): 71 | ''' Encodes navigation instructions, returning hidden state context (for 72 | attention methods) and a decoder initial state. 73 | ''' 74 | 75 | def __init__(self, vocab_size, embedding_size, hidden_size, padding_idx, 76 | dropout_ratio, glove=None): 77 | """ Simple LSTM encoder """ 78 | super(EncoderLSTM, self).__init__() 79 | self.embedding_size = embedding_size 80 | self.hidden_size = hidden_size 81 | self.drop = nn.Dropout(p=dropout_ratio) 82 | self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx) 83 | self.use_glove = glove is not None 84 | if self.use_glove: 85 | print('Using GloVe embedding') 86 | self.embedding.weight.data[...] 
= torch.from_numpy(glove) 87 | self.embedding.weight.requires_grad = False 88 | self.lstm = nn.LSTM(embedding_size, hidden_size, 1, 89 | batch_first=True, 90 | bidirectional=False) 91 | self.encoder2decoder = nn.Linear(hidden_size, hidden_size) 92 | 93 | def load_my_state_dict(self, state_dict): 94 | own_state = self.state_dict() 95 | for name, param in state_dict.items(): 96 | if name not in own_state: 97 | continue 98 | if isinstance(param, nn.Parameter): 99 | param = param.data 100 | own_state[name].copy_(param) 101 | 102 | def init_state(self, batch_size): 103 | ''' Initialize to zero cell states and hidden states.''' 104 | h0 = try_cuda(torch.zeros(1, batch_size, self.hidden_size)) 105 | c0 = try_cuda(torch.zeros(1, batch_size, self.hidden_size)) 106 | return h0, c0 107 | 108 | def forward(self, *args, **kwargs): 109 | '''Encode history instructions (text context) or encode current instructions.''' 110 | if 'context' in kwargs and kwargs['context'] == True: 111 | return self.forward_context(*args) 112 | else: 113 | return self.forward_current(*args) 114 | 115 | def forward_current(self, inputs, lengths): 116 | ''' Expects input vocab indices as (batch, seq_len). Also requires a 117 | list of lengths for dynamic batching. 118 | ''' 119 | batch_size = inputs.size(0) 120 | embeds = self.embedding(inputs) # (batch, seq_len, embedding_size) 121 | if not self.use_glove: 122 | embeds = self.drop(embeds) 123 | h0, c0 = self.init_state(batch_size) 124 | packed_embeds = pack_padded_sequence(embeds, lengths, 125 | enforce_sorted=False, 126 | batch_first=True) 127 | enc_h, (enc_h_t, enc_c_t) = self.lstm(packed_embeds, (h0, c0)) 128 | h_t = enc_h_t[-1] 129 | c_t = enc_c_t[-1] # (batch, hidden_size) 130 | 131 | ctx, lengths = pad_packed_sequence(enc_h, batch_first=True) 132 | decoder_init = nn.Tanh()(self.encoder2decoder(h_t)) 133 | ctx = self.drop(ctx) 134 | return ctx, decoder_init, c_t 135 | 136 | def forward_context(self, inputs, lengths): 137 | ''' Expects input vocab indices as (batch, seq_len). Also requires a 138 | list of lengths for dynamic batching. 139 | ''' 140 | batch_size = inputs.size(0) 141 | embeds = self.embedding(inputs) # (batch, seq_len, embedding_size) 142 | if not self.use_glove: 143 | embeds = self.drop(embeds) 144 | h0, c0 = self.init_state(batch_size) 145 | packed_embeds = pack_padded_sequence(embeds, lengths, 146 | enforce_sorted=False, 147 | batch_first=True) 148 | enc_h, (enc_h_t, enc_c_t) = self.lstm(packed_embeds, (h0, c0)) 149 | h_t = enc_h_t[-1] 150 | return h_t 151 | 152 | 153 | class AttnDecoderLSTM(nn.Module): 154 | ''' 155 | An unrolled LSTM with attention over instructions for decoding navigation 156 | actions. 
157 | ''' 158 | 159 | def __init__(self, embedding_size, hidden_size, dropout_ratio, feature_size, 160 | history=False, lstm_mem=False): 161 | super(AttnDecoderLSTM, self).__init__() 162 | self.embedding_size = embedding_size 163 | self.feature_size = feature_size 164 | self.hidden_size = hidden_size 165 | self.drop = nn.Dropout(p=dropout_ratio) 166 | self.lstm = nn.LSTMCell(embedding_size + feature_size, hidden_size) 167 | self.visual_attention_layer = \ 168 | VisualSoftDotAttention(hidden_size, feature_size) 169 | self.text_attention_layer = SoftDotAttention(hidden_size) 170 | if history: 171 | self.linear_context_out = nn.Linear(hidden_size, hidden_size, bias=True) 172 | self.linear_text_out = nn.Linear(hidden_size, hidden_size, bias=True) 173 | self.context_encoder = \ 174 | ContextEncoder(feature_size, hidden_size, dropout_ratio) 175 | self.decoder2action_text_context = \ 176 | EltwiseProdScoringWithContext(hidden_size, embedding_size) 177 | if lstm_mem: 178 | self.context_lstm = LSTMMemory(hidden_size) 179 | self.text_context_lstm = LSTMMemory(hidden_size) 180 | else: 181 | self.decoder2action = EltwiseProdScoring(hidden_size, embedding_size) 182 | 183 | def load_my_state_dict(self, state_dict): 184 | own_state = self.state_dict() 185 | for name, param in state_dict.items(): 186 | if name not in own_state: 187 | continue 188 | if isinstance(param, nn.Parameter): 189 | param = param.data 190 | own_state[name].copy_(param) 191 | 192 | def forward(self, *args, **kwargs): 193 | '''Encode history trajectories (visual context) or decode current trajectories.''' 194 | if 'context' in kwargs and kwargs['context'] == True: 195 | return self.forward_context(*args) 196 | else: 197 | return self.forward_current(*args, 198 | ctx_mask=kwargs['ctx_mask'], 199 | history_context=kwargs['history_context']) 200 | 201 | def forward_current(self, u_t_prev, all_u_t, visual_context, h_0, c_0, ctx, 202 | ctx_mask=None, history_context=None): 203 | ''' Takes a single step in the decoder LSTM (allowing sampling). 
204 | 205 | u_t_prev: batch x embedding_size 206 | all_u_t: batch x a_num x embedding_size 207 | visual_context: batch x v_num x feature_size 208 | h_0: batch x hidden_size 209 | c_0: batch x hidden_size 210 | ctx: batch x seq_len x dim 211 | ctx_mask: batch x seq_len - indices to be masked 212 | history_context: None or [batch x hidden_size, batch x hidden_size] 213 | ''' 214 | feature, alpha_v = self.visual_attention_layer(h_0, visual_context) 215 | concat_input = torch.cat((u_t_prev, feature), 1) 216 | concat_drop = self.drop(concat_input) 217 | 218 | h_1, c_1 = self.lstm(concat_drop, (h_0, c_0)) 219 | h_1_drop = self.drop(h_1) 220 | h_tilde, alpha = self.text_attention_layer(h_1_drop, ctx, ctx_mask) 221 | 222 | if history_context is not None: 223 | context = self.linear_context_out(history_context[0]) 224 | text = self.linear_text_out(history_context[1]) 225 | logit = self.decoder2action_text_context(h_tilde, context, text, all_u_t) 226 | else: 227 | logit = self.decoder2action(h_tilde, all_u_t) 228 | return h_1, c_1, alpha, logit, alpha_v 229 | 230 | def forward_context(self, *args): 231 | return self.context_encoder(*args) 232 | -------------------------------------------------------------------------------- /model/follower_coground.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | 5 | from torch.autograd import Variable 6 | from cuda import try_cuda 7 | from attentions import WhSoftDotAttention 8 | from context_encoder import ContextEncoder 9 | 10 | 11 | class PositionalEncoding(nn.Module): 12 | def __init__(self, d_model, dropout, max_len): 13 | super(PositionalEncoding, self).__init__() 14 | self.dropout = nn.Dropout(p=dropout) 15 | 16 | # Compute the PE once 17 | pe = torch.zeros(max_len, d_model) 18 | position = torch.arange(0, max_len).unsqueeze(1).float() 19 | div_term = torch.exp( 20 | torch.arange(0, d_model, 2).float() / d_model * (-math.log(10000.0)) 21 | ) 22 | pe[:, 0::2] = torch.sin(position * div_term) 23 | pe[:, 1::2] = torch.cos(position * div_term) 24 | pe = pe.unsqueeze(0) 25 | self.register_buffer('pe', pe) 26 | 27 | def forward(self, x): 28 | x = x + Variable(self.pe[:, :x.size(1)], requires_grad=False) 29 | return self.dropout(x) 30 | 31 | 32 | class CogroundDecoderLSTM(nn.Module): 33 | def __init__(self, embedding_size, hidden_size, dropout_ratio, feature_size, 34 | max_len, history=False, visual_hidden_size=1024): 35 | super(CogroundDecoderLSTM, self).__init__() 36 | self.embedding_size = embedding_size 37 | self.feature_size = feature_size 38 | self.hidden_size = hidden_size 39 | self.u_begin = try_cuda(Variable(torch.zeros(embedding_size), 40 | requires_grad=False)) 41 | self.drop = nn.Dropout(p=dropout_ratio) 42 | self.lstm = nn.LSTMCell(2 * embedding_size + hidden_size, hidden_size) 43 | self.text_attention_layer = WhSoftDotAttention(hidden_size, hidden_size) 44 | self.positional_encoding = PositionalEncoding(hidden_size, 45 | dropout=0, max_len=max_len) 46 | self.visual_attention_layer = WhSoftDotAttention(hidden_size, 47 | visual_hidden_size) 48 | self.visual_mlp = nn.Sequential( 49 | nn.BatchNorm1d(feature_size), 50 | nn.Linear(feature_size, visual_hidden_size), 51 | nn.BatchNorm1d(visual_hidden_size), 52 | nn.Dropout(dropout_ratio), 53 | nn.ReLU() 54 | ) 55 | self.action_attention_layer = WhSoftDotAttention(hidden_size * 2, 56 | visual_hidden_size) 57 | self.sm = nn.Softmax(dim=1) 58 | if history: 59 | self.linear_context_out = nn.Linear(hidden_size, 
hidden_size, bias=True) 60 | self.linear_text_out = nn.Linear(hidden_size, hidden_size, bias=True) 61 | self.context_encoder = ContextEncoder(feature_size, hidden_size, 62 | dropout_ratio) 63 | self.linear_combine = nn.Sequential( 64 | nn.Linear(hidden_size * 4, hidden_size * 2, bias=True), 65 | nn.ReLU(), 66 | nn.Linear(hidden_size * 2, hidden_size * 2, bias=True) 67 | ) 68 | 69 | def load_my_state_dict(self, state_dict): 70 | own_state = self.state_dict() 71 | for name, param in state_dict.items(): 72 | if name not in own_state: 73 | continue 74 | if isinstance(param, nn.Parameter): 75 | param = param.data 76 | own_state[name].copy_(param) 77 | 78 | def forward(self, *args, **kwargs): 79 | if 'context' in kwargs and kwargs['context'] == True: 80 | return self.forward_context(*args) 81 | else: 82 | return self.forward_current(*args, ctx_mask=kwargs['ctx_mask'], 83 | history_context=kwargs['history_context']) 84 | 85 | def forward_current(self, u_t_prev, all_u_t, visual_context, h_0, c_0, ctx, 86 | ctx_mask=None, history_context=None): 87 | ''' 88 | u_t_prev: batch x embedding_size 89 | all_u_t: batch x a_num x embedding_size 90 | visual_context: batch x v_num x feature_size => panoramic view, DEP 91 | h_0: batch x hidden_size 92 | c_0: batch x hidden_size 93 | ctx: batch x seq_len x dim 94 | ctx_mask: batch x seq_len - indices to be masked 95 | ''' 96 | ctx_pos = self.positional_encoding(ctx) 97 | attn_text, _alpha_text = \ 98 | self.text_attention_layer(h_0, ctx_pos, v=ctx, mask=ctx_mask) 99 | alpha_text = self.sm(_alpha_text) 100 | 101 | batch_size, a_size, _ = all_u_t.size() 102 | g_v = all_u_t.view(-1, self.feature_size) 103 | g_v = self.visual_mlp(g_v).view(batch_size, a_size, -1) 104 | attn_vision, _alpha_vision = \ 105 | self.visual_attention_layer(h_0, g_v, v=all_u_t) 106 | alpha_vision = self.sm(_alpha_vision) 107 | 108 | concat_input = torch.cat((attn_text, attn_vision, u_t_prev), 1) 109 | drop = concat_input 110 | h_1, c_1 = self.lstm(drop, (h_0, c_0)) 111 | 112 | if history_context is not None: 113 | context = self.linear_context_out(history_context[0]) 114 | text = self.linear_text_out(history_context[1]) 115 | action_selector = self.linear_combine( 116 | torch.cat((attn_text, h_1, context, text), 1)) 117 | else: 118 | action_selector = torch.cat((attn_text, h_1), 1) 119 | _, alpha_action = self.action_attention_layer(action_selector, g_v) 120 | return h_1, c_1, alpha_text, alpha_action, alpha_vision 121 | 122 | def forward_context(self, *args): 123 | return self.context_encoder(*args) 124 | -------------------------------------------------------------------------------- /model/speaker_lstm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from attentions import SoftDotAttention 4 | 5 | 6 | class SpeakerEncoderLSTM(nn.Module): 7 | def __init__(self, feature_size, hidden_size, dropout_ratio): 8 | ''' Bidirectional LSTM Speaker''' 9 | super().__init__() 10 | self.hidden_size = hidden_size 11 | self.feature_size = feature_size 12 | self.lstm = nn.LSTM(feature_size, self.hidden_size // 2, 1, 13 | batch_first=True, 14 | bidirectional=True) 15 | self.drop = nn.Dropout(p=dropout_ratio) 16 | self.drop3 = nn.Dropout(p=0.3) 17 | self.attention_layer = SoftDotAttention(self.hidden_size, feature_size) 18 | self.post_lstm = nn.LSTM(self.hidden_size, self.hidden_size // 2, 1, 19 | batch_first=True, 20 | bidirectional=True) 21 | 22 | def load_my_state_dict(self, state_dict): 23 | own_state = self.state_dict() 
24 | for name, param in state_dict.items(): 25 | if name not in own_state: 26 | continue 27 | if isinstance(param, nn.Parameter): 28 | param = param.data 29 | own_state[name].copy_(param) 30 | 31 | def forward(self, feature, action_embeds, lengths): 32 | """ 33 | :param action_embeds: (batch_size, length, 2052). The feature of the view 34 | :param feature: (batch_size, length, 36, 2052). The action taken (with the image feature) 35 | :param lengths: Not used in it 36 | :return: context with shape (batch_size, length, hidden_size) 37 | """ 38 | x = action_embeds 39 | x[..., :-128] = self.drop3( 40 | x[..., :-128]) # Do not dropout the spatial features 41 | 42 | # LSTM on the action embed 43 | ctx, _ = self.lstm(x) 44 | ctx = self.drop(ctx) 45 | 46 | # Att and Handle with the shape 47 | batch_size, max_length, _ = ctx.size() 48 | feature[..., :-128] = self.drop3( 49 | feature[..., :-128]) # Dropout the image feature 50 | x, _ = self.attention_layer( # Attend to the feature map 51 | ctx.contiguous().view(-1, self.hidden_size), 52 | # (batch, length, hidden) --> (batch x length, hidden) 53 | feature.view(batch_size * max_length, -1, self.feature_size), 54 | # (batch, length, # of images, feature_size) --> (batch x length, # of images, feature_size) 55 | ) 56 | x = x.view(batch_size, max_length, -1) 57 | x = self.drop(x) 58 | 59 | # Post LSTM layer 60 | x, _ = self.post_lstm(x) 61 | x = self.drop(x) 62 | 63 | return x 64 | 65 | 66 | class SpeakerDecoderLSTM(nn.Module): 67 | def __init__(self, vocab_size, embedding_size, padding_idx, hidden_size, 68 | dropout_ratio, glove=None): 69 | super().__init__() 70 | self.hidden_size = hidden_size 71 | self.embedding = torch.nn.Embedding(vocab_size, embedding_size, 72 | padding_idx) 73 | self.use_glove = glove is not None 74 | if self.use_glove: 75 | print('Using GloVe embedding') 76 | self.embedding.weight.data[...] 
= torch.from_numpy(glove) 77 | self.embedding.weight.requires_grad = False 78 | self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True) 79 | self.drop = nn.Dropout(dropout_ratio) 80 | self.attention_layer = SoftDotAttention(hidden_size, hidden_size) 81 | self.projection = nn.Linear(hidden_size, vocab_size) 82 | self.baseline_projection = nn.Sequential( 83 | nn.Linear(hidden_size, 128), 84 | nn.ReLU(), 85 | nn.Dropout(dropout_ratio), 86 | nn.Linear(128, 1) 87 | ) 88 | 89 | def load_my_state_dict(self, state_dict): 90 | own_state = self.state_dict() 91 | for name, param in state_dict.items(): 92 | if name not in own_state: 93 | continue 94 | if isinstance(param, nn.Parameter): 95 | param = param.data 96 | own_state[name].copy_(param) 97 | 98 | def forward(self, words, ctx, ctx_mask, h0, c0): 99 | h0, c0 = h0.unsqueeze(0), c0.unsqueeze(0) 100 | embeds = self.embedding(words) 101 | embeds = self.drop(embeds) 102 | x, (h1, c1) = self.lstm(embeds, (h0, c0)) 103 | 104 | x = self.drop(x) 105 | 106 | # Get the size 107 | batchXlength = words.size(0) * words.size(1) 108 | multiplier = batchXlength // ctx.size(0) 109 | 110 | # Att and Handle with the shape 111 | # Reshaping x --> (b(word)*l(word), r) 112 | # Expand the ctx from (b, a, r) --> (b(word)*l(word), a, r) 113 | # Expand the ctx_mask (b, a) --> (b(word)*l(word), a) 114 | x, _ = self.attention_layer( 115 | x.contiguous().view(batchXlength, self.hidden_size), 116 | ctx.unsqueeze(1).expand(-1, multiplier, -1, -1).contiguous().view( 117 | batchXlength, -1, self.hidden_size), 118 | mask=ctx_mask.unsqueeze(1).expand(-1, multiplier, -1).contiguous().view( 119 | batchXlength, -1) 120 | ) 121 | x = x.view(words.size(0), words.size(1), self.hidden_size) 122 | 123 | # Output the prediction logit 124 | x = self.drop(x) 125 | logit = self.projection(x) 126 | 127 | return logit, h1.squeeze(0), c1.squeeze(0) 128 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: linux-64 4 | python=3.7.3 5 | pytorch=1.1.0 6 | torchvision=0.3.0 7 | numpy=1.16.2 8 | networkx=2.2 -------------------------------------------------------------------------------- /simulator/connectivity/8194nk5LbLH_connectivity.json: -------------------------------------------------------------------------------- 1 | 
[{"image_id":"c9e8dc09263e4d0da77d16de0ecddd39","pose":[-0.611043,-0.00396746,-0.791588,-0.213904,0.791585,-0.00882497,-0.610996,2.305,-0.00456166,-0.999953,0.00853306,1.56916,0,0,0,1],"included":true,"visible":[false,false,false,false,true,true,false,true,true,true,false,false,false,false,false,false,false,false,false,false],"unobstructed":[false,false,false,false,true,false,false,false,true,true,false,false,false,false,false,false,false,false,false,false],"height":1.5826326295962942},{"image_id":"286b0c2d9a46408ba80b6ccebb21e582","pose":[0.951596,0.00201098,0.307346,6.58012,-0.307351,0.00915895,0.951552,-2.96479,-0.000901435,-0.999956,0.00933374,4.36353,0,0,0,1],"included":true,"visible":[false,false,true,true,false,false,false,false,false,false,false,false,false,true,false,true,false,true,false,true],"unobstructed":[false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,true,true,false],"height":1.5712253956498747},{"image_id":"6776097c17ed4b93aee61704eb32f06c","pose":[-0.711582,-0.00419131,-0.702591,-1.68941,0.702575,0.00464776,-0.711594,-5.37908,0.00624796,-0.99998,-0.000362505,1.58622,0,0,0,1],"included":true,"visible":[false,false,false,false,false,true,true,true,false,true,false,true,false,false,true,false,true,false,false,false],"unobstructed":[false,false,false,false,false,true,true,false,false,false,false,true,false,false,false,false,false,false,false,true],"height":1.5804941871490743},{"image_id":"8c7e8da7d4a44ab695e6b3195eac0cf1","pose":[0.709879,0.011247,0.704234,8.62929,-0.70424,-0.00407304,0.70995,-1.77115,0.0108531,-0.999928,0.00502926,4.38556,0,0,0,1],"included":true,"visible":[false,true,false,false,false,false,false,false,false,false,true,false,true,true,false,false,false,true,true,false],"unobstructed":[false,true,false,false,false,false,false,false,false,false,true,false,false,true,false,false,false,true,true,false],"height":1.585645804390483},{"image_id":"f33c718aaf2c41469389a87944442c62","pose":[0.619478,0.0166688,0.784837,-3.88437,-0.784902,-0.00375152,0.619609,-0.528748,0.0132725,-0.999854,0.0107595,1.58368,0,0,0,1],"included":true,"visible":[true,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true],"unobstructed":[true,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true],"height":1.5829827809014503},{"image_id":"fcd90a404061413385286bef9662630e","pose":[-0.111393,0.00837906,0.993741,2.80245,-0.993773,-0.00348217,-0.111367,-3.78204,0.0025272,-0.999959,0.00871482,1.58057,0,0,0,1],"included":true,"visible":[true,false,true,false,false,false,false,true,false,false,false,false,false,false,false,true,false,false,false,false],"unobstructed":[false,false,true,false,false,false,true,true,true,false,false,false,false,false,false,false,false,false,false,false],"height":1.5763528408163245},{"image_id":"c07d4ae8330542a09cf8f8dddb9728ce","pose":[-0.985207,-0.0101267,0.171069,0.656519,-0.171094,0.00168538,-0.985253,-5.08928,0.00968898,-0.999947,-0.00339301,1.57611,0,0,0,1],"included":true,"visible":[true,false,true,false,false,true,false,true,false,false,false,false,false,false,true,false,true,false,false,true],"unobstructed":[false,false,true,false,false,true,false,true,false,false,false,true,false,false,false,false,false,false,false,false],"height":1.575276915205382},{"image_id":"2393bffb53fe4205bcc67796c6fb76e3","pose":[-0.241654,0.00228344,-0.97036,3.33582,0.970294,0.0124463,-0.241608,-5.90025,0.0115256,-0.99992,
-0.00522325,1.57791,0,0,0,1],"included":true,"visible":[false,false,true,false,false,true,true,false,false,false,false,false,false,false,true,false,true,false,false,false],"unobstructed":[false,false,false,false,false,true,true,false,false,false,false,false,false,false,true,false,false,false,false,false],"height":1.5730354249357412},{"image_id":"71bf74df73cd4e24a191ef4f2338ca22","pose":[0.906931,-0.00688335,-0.421222,0.122562,0.421182,-0.00662188,0.906952,-0.00319673,-0.00903217,-0.999954,-0.00310641,1.57207,0,0,0,1],"included":true,"visible":[true,false,false,false,true,true,true,true,false,true,false,false,false,false,false,false,false,false,false,false],"unobstructed":[true,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false],"height":1.570272020216938},{"image_id":"be8a2edacab34ec8887ba6a7b1e4945f","pose":[0.791463,0.0101015,0.611133,-3.50132,-0.611154,-0.00121731,0.791511,1.58103,0.00873934,-0.999948,0.00521015,1.56992,0,0,0,1],"included":true,"visible":[true,false,true,false,true,false,false,true,false,false,false,true,false,false,false,false,false,false,false,true],"unobstructed":[true,false,false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false],"height":1.577126892771864},{"image_id":"9bdde31adaa1443bb206b09bfa3c474c","pose":[0.799844,0.0047414,0.60019,8.67581,-0.600208,0.0075118,0.799809,-4.8108,-0.000716311,-0.99996,0.00885413,2.82261,0,0,0,1],"included":true,"visible":[false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,true,true,false,false],"unobstructed":[false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false],"height":1.58264400638767},{"image_id":"66d4adb61b57494aa2c1ad141a0fad9b","pose":[-0.34536,-0.0108675,-0.938407,-2.27885,0.938436,0.00459882,-0.345423,-3.2282,0.00806945,-0.99993,0.00861029,1.58739,0,0,0,1],"included":true,"visible":[false,false,true,false,false,true,true,true,false,true,false,false,false,false,true,true,false,false,false,true],"unobstructed":[false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true],"height":1.5705441219971223},{"image_id":"83ff709c0e3e46079836153ea5c7feac","pose":[0.68423,0.0137303,0.729137,3.42529,-0.729235,0.00364543,0.684254,1.65175,0.00673696,-0.999899,0.012507,4.37069,0,0,0,1],"included":true,"visible":[false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false],"height":1.578378655072358},{"image_id":"d9e325df2f3948679c78b93d8025e2da","pose":[0.826698,0.0192407,0.562317,8.49764,-0.562455,0.00220125,0.826825,-0.816805,0.0146709,-0.999812,0.0126418,4.38875,0,0,0,1],"included":true,"visible":[false,true,false,true,false,false,false,false,false,false,true,false,true,false,false,false,false,true,true,false],"unobstructed":[false,true,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true,false],"height":1.5865892751674604},{"image_id":"423efb97f77f4e7995f19c66fe82afbc","pose":[0.958879,0.00141119,0.283813,5.51819,-0.283808,0.0124035,0.958801,-5.67527,-0.00216725,-0.999922,0.012294,1.58856,0,0,0,1],"included":true,"visible":[false,false,true,false,false,false,true,true,false,false,false,false,false,false,false,false,true,false,fa
lse,false],"unobstructed":[false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,true,false,false,false],"height":1.5784339701720043},{"image_id":"6c49579a5cd34df8acb7f790b74e9eae","pose":[-0.95716,-0.00676032,-0.289482,-6.48379,0.289538,-0.00977451,-0.957117,-2.57899,0.00364085,-0.999929,0.0113132,1.59886,0,0,0,1],"included":true,"visible":[false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,true],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true],"height":1.5798282335589897},{"image_id":"aeed67040d744240b188f66f17d87d43","pose":[0.132175,0.0257204,0.990893,7.67989,-0.991226,0.00381825,0.132121,-5.81072,-0.000385302,-0.999662,0.0259995,2.29866,0,0,0,1],"included":true,"visible":[false,false,true,false,false,false,true,true,false,false,true,false,false,false,true,false,false,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false],"height":1.6026680667792301},{"image_id":"aae01016bb354f78bd6db86e9d71af2b","pose":[0.0788252,0.00384462,0.996881,6.79041,-0.996887,0.00184069,0.0788186,-0.995862,-0.00153193,-0.999991,0.0039778,4.37219,0,0,0,1],"included":true,"visible":[false,true,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true,false],"unobstructed":[false,true,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true,false],"height":1.5770919536040346},{"image_id":"346b680ac5904359a1859c929ad312b6","pose":[-0.589008,0.00463239,0.808114,5.58585,-0.808123,0.00000695791,-0.589015,0.644327,-0.00273419,-0.999989,0.00373948,4.38174,0,0,0,1],"included":true,"visible":[false,true,false,true,false,false,false,false,false,false,false,false,true,true,false,false,false,true,false,false],"unobstructed":[false,true,false,true,false,false,false,false,false,false,false,false,true,true,false,false,false,true,false,false],"height":1.5707587596461066},{"image_id":"ae91518ed77047b3bdeeca864cd04029","pose":[0.310985,0.0070688,0.950389,-4.60607,-0.950392,-0.00460962,0.31102,-2.5949,0.00657945,-0.999964,0.00528466,1.58581,0,0,0,1],"included":true,"visible":[false,false,true,false,true,true,false,true,false,true,false,true,false,false,false,true,false,false,false,false],"unobstructed":[false,false,true,false,true,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false],"height":1.5747548700639524}] -------------------------------------------------------------------------------- /simulator/connectivity/GdvgFV5R1Z5_connectivity.json: -------------------------------------------------------------------------------- 1 | 
[{"image_id":"0b02e18654324edd8d74c078b66bfb20","pose":[-0.057695,-0.000357129,0.998334,-2.46692,-0.998304,-0.00769199,-0.0576965,-3.15814,0.00770012,-0.99997,0.0000884733,1.5171,0,0,0,1],"included":true,"visible":[false,true,false,false,false,true,true,true,true,false,true,false],"unobstructed":[false,false,false,false,false,true,false,true,true,false,true,false],"height":1.51470410293751},{"image_id":"1db1c0a09ecf40d188197efc05ced3bb","pose":[-0.442443,0.0138817,0.896688,-4.03893,-0.89679,-0.0101225,-0.442338,-3.05434,0.00293664,-0.999852,0.0169288,0.974424,0,0,0,1],"included":true,"visible":[true,false,false,false,true,true,false,false,true,false,false,true],"unobstructed":[false,false,false,false,false,true,false,false,true,false,false,true],"height":0.9701803380402906},{"image_id":"6178647ca8d14dc09370f6c1b7ed2fd6","pose":[-0.870025,0.0056275,0.492973,-3.69279,-0.493005,-0.0105975,-0.869962,1.95433,0.000328893,-0.999927,0.0119957,1.51516,0,0,0,1],"included":true,"visible":[false,false,false,true,false,false,true,true,false,false,true,false],"unobstructed":[false,false,false,true,false,false,true,true,false,true,true,false],"height":1.517582101716661},{"image_id":"565cc21cd28b4ee6bb5ba83c5270c032","pose":[0.0242634,0.000986587,-0.999704,-3.91782,0.999699,0.00333371,0.024267,0.178675,0.00335701,-0.999993,-0.0009042,1.50868,0,0,0,1],"included":true,"visible":[false,false,true,false,false,false,true,false,false,true,true,false],"unobstructed":[false,false,true,false,false,false,false,false,false,true,true,false],"height":1.5114421933143356},{"image_id":"ef638e508e054c4aabd49b38d1b88fc7","pose":[0.0820523,0.0151057,0.996513,-4.61631,-0.995947,-0.0356725,0.0825462,-2.18899,0.0367954,-0.999249,0.0121187,1.52757,0,0,0,1],"included":true,"visible":[false,true,false,false,false,true,false,false,true,false,false,true],"unobstructed":[false,false,false,false,false,true,false,false,true,false,false,true],"height":1.5162868543024455},{"image_id":"97ed68de989e44fdaf2d9b949898fab6","pose":[0.0900997,0.0149714,0.99582,-3.64126,-0.995713,-0.0195971,0.0903844,-3.16818,0.0208687,-0.999695,0.0131427,1.52081,0,0,0,1],"included":true,"visible":[true,true,false,false,true,false,false,false,true,false,false,true],"unobstructed":[true,true,false,false,true,false,false,false,true,false,false,true],"height":1.5211418713547455},{"image_id":"5fd70cff4992429a99a84fd3c117ccb5","pose":[-0.0539877,-0.000800861,-0.998541,0.0108044,0.998337,0.0201438,-0.0539926,0.00604319,0.020158,-0.999796,-0.000286778,1.51223,0,0,0,1],"included":true,"visible":[true,false,true,true,false,false,false,true,false,true,true,false],"unobstructed":[false,false,true,false,false,false,false,true,false,false,true,false],"height":1.5113248528175798},{"image_id":"86d342c576ff46a9828d2ba377cc8cd5","pose":[0.998173,0.0151118,-0.0584746,-1.78347,0.0584707,0.000718574,0.998288,-1.89835,0.0151283,-0.999885,-0.000165129,1.52238,0,0,0,1],"included":true,"visible":[true,false,true,false,false,false,true,false,false,false,true,false],"unobstructed":[true,false,true,false,false,false,true,false,false,false,true,false],"height":1.5103397372923053},{"image_id":"8dba9ff900b14f9b84ead660f5f7f701","pose":[-0.999855,-0.0144511,0.00887107,-4.11579,-0.00895392,0.00564829,-0.999943,-2.90606,0.0144005,-0.999879,-0.00577567,1.51617,0,0,0,1],"included":true,"visible":[true,true,false,false,true,true,false,false,false,false,false,true],"unobstructed":[true,true,false,false,true,true,false,false,false,false,false,true],"height":1.5112098807574073},{"image_id":"0d8c5fbfd
73f44e28d6da370520611e4","pose":[0.0769887,0.00664334,0.997009,-6.15424,-0.997016,-0.00490415,0.0770216,-0.0398163,0.00540151,-0.999965,0.00624716,1.50965,0,0,0,1],"included":true,"visible":[false,false,true,true,false,false,true,false,false,false,true,false],"unobstructed":[false,false,true,true,false,false,false,false,false,false,false,false],"height":1.5058928427471967},{"image_id":"aebb1de49d21485e8bef7633dfb58761","pose":[-0.0229751,-0.0058052,-0.999718,-1.94579,0.999719,0.00553997,-0.0230069,-0.026534,0.00567231,-0.999967,0.0056775,1.50582,0,0,0,1],"included":true,"visible":[true,false,true,true,false,false,true,true,false,true,false,false],"unobstructed":[true,false,true,true,false,false,true,true,false,false,false,false],"height":1.5101720791580233},{"image_id":"e34e51f3d6584ad09c510de5db84752f","pose":[-0.0418368,-0.0124855,0.999046,-3.99281,-0.993607,-0.104406,-0.0429142,-2.13265,0.104842,-0.994456,-0.00803644,0.980264,0,0,0,1],"included":true,"visible":[false,true,false,false,true,true,false,false,true,false,false,false],"unobstructed":[false,true,false,false,true,true,false,false,true,false,false,false],"height":0.969584316081611}] -------------------------------------------------------------------------------- /simulator/connectivity/Pm6F8kyY3z2_connectivity.json: -------------------------------------------------------------------------------- 1 | [{"image_id":"dfed00b301f246989ff408657e39e88b","pose":[0.894952,-0.0207046,0.445684,-2.46151,-0.445515,0.0124171,0.895189,7.62724,-0.0240687,-0.999709,0.00188811,1.25952,0,0,0,1],"included":true,"visible":[false,false,false,false,false,false,false,true,false,false,true,true,true,false,false,true,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false],"unobstructed":[false,false,false,false,false,false,false,true,false,false,true,true,true,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.2574198157492824},{"image_id":"e37d923ec8284523b5c9ebdb35800a3a","pose":[-0.589713,-0.0119696,0.807525,-1.97599,-0.80754,0.0222594,-0.589393,-2.37926,-0.0109204,-0.999681,-0.022793,0.690385,0,0,0,1],"included":true,"visible":[false,false,true,true,true,true,true,false,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,true,false,true,true,false,false,false,true,true,false],"unobstructed":[false,false,false,true,false,true,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,true,false,false,false,false,true,false,false],"height":1.240973054769517},{"image_id":"560d4bedd6714e57b4acfe27bbc4a01b","pose":[-0.977243,-0.00165475,-0.212119,-0.0711643,0.212078,-0.028916,-0.976825,2.94531,-0.00451739,-0.999581,0.0286085,1.26221,0,0,0,1],"included":true,"visible":[false,true,false,true,true,true,false,false,true,true,false,false,true,true,false,false,false,false,true,false,false,false,false,false,true,true,true,true,false,false,true,true,false,false],"unobstructed":[false,false,false,false,false,true,false,false,true,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,true,false,false,false,false,false,false],"height":1.2613789210030284},{"image_id":"312bca1ac80742faa1617cf158beaeb1","pose":[0.312174,-0.0208526,0.949797,-2.52197,-0.949897,0.00960923,0.312418,-4.66412,-0.0156417,-0.999737,-0.0168085,0.685205,0,0,0,1],"included":true,"visible":[false,true,true,false,fa
lse,true,true,false,false,true,false,false,false,false,true,false,false,false,false,true,false,false,false,true,true,true,false,true,false,true,false,true,true,false],"unobstructed":[false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,true,true,false,false],"height":1.2426174829057284},{"image_id":"c8e422cf697348cabeac7d9bc4a84062","pose":[0.332666,-0.0162534,-0.942905,1.23423,0.942934,-0.0096225,0.332842,-1.94066,-0.0144831,-0.999822,0.0121245,0.695783,0,0,0,1],"included":true,"visible":[false,true,true,false,false,true,false,false,true,true,false,false,true,true,true,true,false,false,true,false,true,false,true,true,true,true,true,true,false,false,true,true,true,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true,false,true,false,false,true,false,false,true,false,false,true,false,false,true,false],"height":1.2459243009091168},{"image_id":"d81df9c0b8fb4b3e8a2cd12c2007461e","pose":[-0.943048,-0.011951,-0.332443,-0.105636,0.332628,-0.0203502,-0.942839,2.11049,0.00450241,-0.999722,0.0231661,1.25942,0,0,0,1],"included":true,"visible":[false,true,true,false,true,false,true,false,true,true,false,false,true,true,false,true,false,false,true,false,false,false,true,true,true,true,true,true,false,false,true,true,false,false],"unobstructed":[false,true,true,false,false,false,true,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,false,false,true,false,false,false,false,false,false],"height":1.259680186683917},{"image_id":"b961fab0e5cb48979dba7f115725baba","pose":[-0.72184,-0.0146334,-0.691906,1.93977,0.692056,-0.0116192,-0.721751,4.69574,0.00252213,-0.999826,0.018514,1.26792,0,0,0,1],"included":true,"visible":[false,true,false,false,false,true,false,false,false,false,false,false,false,true,false,false,false,false,true,false,false,false,false,false,false,true,true,true,false,false,true,true,false,false],"unobstructed":[false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false],"height":1.2502711440667638},{"image_id":"fdf0fc24f7c549ccbb78e972c8024fa9","pose":[0.909305,0.00741201,-0.416064,0.00890844,0.416112,-0.00686709,0.909288,7.75754,0.00388236,-0.999949,-0.00932887,1.25305,0,0,0,1],"included":true,"visible":[true,false,false,false,false,false,false,false,false,false,true,true,true,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,true,false,false,false,false,true],"unobstructed":[true,false,false,false,false,false,false,false,false,false,true,true,true,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.2487049450992638},{"image_id":"b8f49ffb1486488bbb72693578c17865","pose":[-0.998964,-0.00962844,0.0444879,-1.8508,-0.0443137,-0.0175915,-0.998863,3.61648,0.0104,-0.999799,0.0171463,1.26079,0,0,0,1],"included":true,"visible":[false,false,true,false,true,true,false,false,false,true,false,true,true,true,false,true,false,false,true,false,false,false,true,false,true,false,true,true,false,false,false,false,false,false],"unobstructed":[false,false,true,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,false,fals
e,false,false,false,false,false,false,false],"height":1.2633617680188245},{"image_id":"cb070d66db084a79b553310df69ed31d","pose":[-0.977864,0.00832091,0.20908,-2.27665,-0.209234,-0.0284428,-0.977452,4.68783,-0.00218661,-0.999561,0.0295538,1.25989,0,0,0,1],"included":true,"visible":[false,false,true,false,false,true,false,false,true,false,false,true,true,true,true,true,false,false,true,false,false,false,true,false,true,false,true,false,false,false,false,false,false,false],"unobstructed":[false,false,true,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false],"height":1.2580748740290566},{"image_id":"f6872e6001054c67a12371a542c7defe","pose":[0.0536408,-0.0235465,0.998283,-2.43097,-0.99847,-0.0147796,0.0533023,8.50381,0.013499,-0.999614,-0.0243036,1.25061,0,0,0,1],"included":true,"visible":[true,false,false,false,false,false,false,true,false,false,false,true,true,false,false,true,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false],"unobstructed":[true,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.2465964780421934},{"image_id":"0c5658ec8f51460fbf29d6aedcfb4bca","pose":[0.963476,-0.0168826,0.267262,-2.96338,-0.266981,0.0172023,0.963548,7.0529,-0.0208649,-0.99971,0.0120663,1.26308,0,0,0,1],"included":true,"visible":[true,false,false,false,false,false,false,true,true,true,true,false,true,false,false,true,true,false,false,false,false,true,true,false,false,false,false,false,false,false,false,false,false,false],"unobstructed":[true,false,false,false,false,false,false,true,false,false,true,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.2617934647021496},{"image_id":"138252ad9fab4ae1a86997fc363e6ac7","pose":[0.996651,-0.0156732,-0.0802583,-3.11236,0.0804086,0.00921488,0.99672,6.45377,-0.0148824,-0.999835,0.0104439,1.26,0,0,0,1],"included":true,"visible":[true,false,false,false,false,true,false,true,true,true,true,true,false,true,false,true,true,false,true,false,false,true,true,false,true,false,false,false,false,false,false,false,false,false],"unobstructed":[true,false,false,false,false,false,false,true,false,false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.2614477951937262},{"image_id":"094836afff5e4fbfbb6659a96ec665b8","pose":[0.964654,-0.00795989,-0.2634,-0.000922046,0.263512,0.0216581,0.964413,0.00337562,-0.00197203,-0.999734,0.0229897,1.24819,0,0,0,1],"included":true,"visible":[false,true,true,false,true,true,true,false,true,true,false,false,true,false,false,true,false,false,true,false,false,false,true,true,true,true,true,true,false,false,true,true,false,false],"unobstructed":[false,true,false,false,true,true,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true,false,false,true,false,false,false,false,false,false],"height":1.2526172310761738},{"image_id":"4739d48c61a04deab15db1eb2c906a96","pose":[0.193991,-0.0171755,-0.980853,-0.816552,0.980783,0.0246068,0.193546,-4.9622,0.0208112,-0.99955,0.0216185,0.679588,0,0,0,1],"included":true,"visible":[false,false,true,true,true,true,true,false,true,tru
e,false,true,true,false,false,true,false,false,true,true,false,false,true,false,true,true,false,true,false,true,true,true,false,false],"unobstructed":[false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,true,false,false,false,true,true,true,false,false],"height":1.2596737635557855},{"image_id":"87e7b6f2006541a9abe57fba18294a0c","pose":[-0.783324,-0.016461,-0.621396,-3.0819,0.621615,-0.0208826,-0.783045,6.16326,-0.0000868454,-0.999647,0.0265899,1.25902,0,0,0,1],"included":true,"visible":[true,false,false,false,false,true,false,false,true,true,true,true,true,true,false,false,true,false,true,false,false,false,true,false,true,false,false,false,false,false,false,false,false,false],"unobstructed":[true,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false],"height":1.2599343835554653},{"image_id":"c32282e053ad450e9187d9d95361b124","pose":[0.0594054,-0.0138426,0.998138,-2.53141,-0.998234,-0.00190817,0.0593847,10.2372,0.00108244,-0.999903,-0.0139319,1.24275,0,0,0,1],"included":true,"visible":[true,false,false,false,false,false,false,false,false,false,true,true,true,false,false,true,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.2459471379951932},{"image_id":"a8cb8984630b456a96923ef0d7c3aeb3","pose":[0.351449,0.0243773,0.93589,-0.0153321,-0.935855,-0.0182983,0.351913,8.89998,0.0257037,-0.999536,0.0163823,1.27666,0,0,0,1],"included":true,"visible":[false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,true,false,false,false,false,true],"unobstructed":[false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,true],"height":1.2550452676594444},{"image_id":"ff2dd63ad83245c3b4831b4f8a6911ac","pose":[0.998846,-0.00853272,0.047285,0.322808,-0.047168,0.01344,0.998797,-3.39025,-0.0091581,-0.999874,0.0130217,0.694594,0,0,0,1],"included":true,"visible":[false,false,true,true,true,true,true,false,true,true,false,true,true,true,true,true,false,false,false,false,true,false,true,true,true,true,true,true,false,false,true,false,false,false],"unobstructed":[false,false,false,true,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,true,false],"height":1.2549827051622604},{"image_id":"e775ea81e83c45719de5a1577f8f7e39","pose":[0.967623,-0.00934794,-0.252228,2.82818,0.252377,0.0221475,0.967376,-5.00859,-0.00345689,-0.999711,0.0237893,0.677925,0,0,0,1],"included":true,"visible":[false,false,true,true,true,true,false,false,true,true,false,false,true,true,true,true,false,false,false,false,true,false,true,false,true,true,false,false,false,true,false,false,true,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,f
alse,false,false,false,true,false],"height":1.2544772562107187},{"image_id":"50e617dd6c4c4def9ea571ab3578f308","pose":[-0.300165,0.0205111,-0.953667,3.22939,0.953853,-0.00206313,-0.300268,-0.541289,-0.0081265,-0.999788,-0.0189456,0.672177,0,0,0,1],"included":true,"visible":[false,false,false,false,true,false,false,false,false,false,false,false,false,true,true,false,false,false,true,true,false,false,false,false,false,false,true,false,false,false,false,false,true,false],"unobstructed":[false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,true,false],"height":1.2328507397565847},{"image_id":"63eb69df65c641e19af921ff458b6046","pose":[0.991257,-0.000218759,-0.131947,0.493009,0.131935,-0.0120189,0.991186,7.68006,-0.00180282,-0.999928,-0.0118853,1.24896,0,0,0,1],"included":true,"visible":[true,false,false,false,false,false,false,true,false,false,true,true,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true],"unobstructed":[false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true],"height":1.2475721046793986},{"image_id":"8078fc4f547a4a9cb1a6bb036cc18dc9","pose":[-0.114576,0.00615355,-0.993396,-2.77681,0.993265,-0.0166685,-0.114664,5.56396,-0.0172641,-0.999842,-0.00420267,1.24986,0,0,0,1],"included":true,"visible":[false,false,false,false,false,true,false,false,true,true,false,true,true,true,false,true,false,false,true,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.2512865181863293},{"image_id":"981311e64aec4d3f8e403f349363e065","pose":[0.847247,-0.0183189,-0.530884,0.0723733,0.53092,-0.0032238,0.847416,-1.30075,-0.0172354,-0.999827,0.00699417,0.704176,0,0,0,1],"included":true,"visible":[false,true,true,false,true,true,true,false,true,true,false,false,true,true,false,true,false,false,true,false,false,false,true,false,true,true,true,true,false,false,true,true,true,false],"unobstructed":[false,true,false,false,true,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,true,false,false,false,false,true,false],"height":1.2568391809600372},{"image_id":"4ac8e1065b0b46db9e459d79a1078e04","pose":[-0.640839,0.0218408,0.767365,-0.962216,-0.767675,-0.020208,-0.640522,2.8754,0.00151725,-0.999558,0.0297163,1.26183,0,0,0,1],"included":true,"visible":[false,true,true,true,true,true,false,false,true,true,false,false,true,true,false,true,false,false,true,false,false,false,true,false,false,false,true,true,false,false,true,false,false,false],"unobstructed":[false,false,true,false,false,true,false,false,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.2639418549975774},{"image_id":"9b2b12257e9a4c5d9f99f141769e9301","pose":[0.98729,-0.0123897,0.158448,0.0965094,-0.158303,0.0119552,0.987319,-4.80181,-0.014127,-0.999852,0.00984156,0.677793,0,0,0,1],"included":true,"visible":[false,false,true,true,true,true,true,fa
lse,true,true,false,true,true,true,true,true,false,false,true,true,false,false,true,true,true,false,true,true,false,true,true,false,true,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,false,false,false],"height":1.251745413841814},{"image_id":"2596868500734b1496d479e6936cceb3","pose":[0.888937,-0.0176477,-0.45769,0.291121,0.457814,0.00353992,0.889041,-2.06553,-0.0140695,-0.999838,0.0112258,0.706623,0,0,0,1],"included":true,"visible":[false,true,true,false,true,true,true,false,true,true,false,false,true,true,false,true,false,false,true,false,true,false,true,true,true,true,false,true,false,false,true,true,true,false],"unobstructed":[false,true,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,true,false,false,true,false,false,false,false,false,false,false,true,false,false],"height":1.2553193865748646},{"image_id":"b0a4ed482e7b4f8eb499f3a999f65933","pose":[-0.848526,-0.0232996,-0.528642,1.46159,0.52901,-0.0139455,-0.848501,3.05605,0.0123974,-0.999632,0.0241584,1.26843,0,0,0,1],"included":true,"visible":[false,true,true,false,false,true,true,false,true,false,false,false,false,true,false,false,false,false,true,false,true,false,false,false,true,true,true,false,false,false,true,true,false,false],"unobstructed":[false,false,true,false,false,true,true,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false],"height":1.2609941282189379},{"image_id":"97db9c9bd9824a3ca623db620331ea66","pose":[-0.929713,0.018777,0.367808,0.0255591,-0.367902,-0.00175584,-0.929863,10.8131,-0.0168144,-0.999822,0.00854017,1.2626,0,0,0,1],"included":true,"visible":[false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,true],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true],"height":1.2659524368560175},{"image_id":"f1f56adaf5034a25929562d6c3213a8c","pose":[0.825713,-0.0302824,-0.563279,-1.5378,0.563854,0.0153007,0.825733,-5.11416,-0.0163868,-0.999425,0.0297084,0.682356,0,0,0,1],"included":true,"visible":[false,false,true,true,true,true,true,false,true,true,false,true,false,false,true,false,false,false,false,true,true,false,true,false,true,true,false,true,false,false,true,true,false,false],"unobstructed":[false,false,false,true,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false],"height":1.260110021251859},{"image_id":"7e094a308dec4987a5185a0426f4ec3c","pose":[0.993231,-0.0084319,0.115849,0.146963,-0.115685,0.0178686,0.993125,-4.26187,-0.0104441,-0.999805,0.0167718,0.681901,0,0,0,1],"included":true,"visible":[false,true,true,true,true,true,true,false,true,true,false,true,true,true,true,true,false,false,true,false,false,false,true,true,true,true,true,true,false,true,false,false,false,false],"unobstructed":[false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,true,false,false,false,false,false,f
alse,false,false],"height":1.2503792812204662},{"image_id":"c460d0f6ea8a4458a576efb07d043b39","pose":[-0.0100665,-0.0185998,0.999777,-2.6559,-0.999919,0.0080293,-0.00991839,-2.91143,-0.00784316,-0.999795,-0.0186794,0.694102,0,0,0,1],"included":true,"visible":[false,true,true,true,true,true,true,false,true,true,false,false,false,true,true,false,false,false,false,false,false,false,false,true,false,false,true,true,false,true,false,false,true,false],"unobstructed":[false,true,false,true,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,true,false,false,true,false,false,false,false],"height":1.2467171179086285},{"image_id":"be733bc20f4c4f74992d21e2741f3a17","pose":[0.996567,-0.00580693,-0.0825867,3.06603,0.0825174,-0.0112836,0.996526,-1.87107,-0.00671876,-0.99992,-0.010766,0.687863,0,0,0,1],"included":true,"visible":[false,true,true,false,true,false,false,false,false,true,false,false,false,false,false,false,false,false,true,true,true,false,false,true,false,false,true,false,false,false,false,true,false,false],"unobstructed":[false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true,true,false,false,true,false,false,false,false,false,false,false,false,false,false],"height":1.2517487620409038},{"image_id":"3fd2f1d849de4e9f8c07863c845db6b5","pose":[-0.876161,0.0123836,0.481861,0.00165134,-0.482015,-0.018302,-0.875972,9.94824,-0.00202879,-0.999756,0.0220042,1.26676,0,0,0,1],"included":true,"visible":[false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,true,false,false,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,true,false,false,false,false,false],"height":1.258770950284268}] -------------------------------------------------------------------------------- /simulator/connectivity/README.md: -------------------------------------------------------------------------------- 1 | ## connectivity 2 | Connectivity graphs indicating the navigable paths between viewpoints in each scan. 3 | 4 | Each json file contains an array of annotations, one for each viewpoint in the scan. All annotations share the same basic structure as follows: 5 | 6 | ``` 7 | { 8 | "image_id": str, 9 | "pose": [float x 16], 10 | "included": boolean, 11 | "visible": [boolean x num_viewpoints], 12 | "unobstructed": [boolean x num_viewpoints], 13 | "height": float 14 | } 15 | ``` 16 | - `image_id`: matterport skybox prefix 17 | - `pose`: 4x4 matrix in row major order that transforms matterport skyboxes to global coordinates (z-up). Pose matrices are based on the assumption that the camera is facing skybox image 3. 18 | - `included`: whether viewpoint is included in the simulator. Some overlapping viewpoints are excluded. 19 | - `visible`: indicates other viewpoints that can be seen from this viewpoint. 20 | - `unobstructed`: indicates transitions to other viewpoints that are considered navigable for an agent. 21 | - `height`: estimated height of the viewpoint above the floor. Not required for the simulator. 22 | 23 | Units are in metres. 24 | 25 | `scans.txt` contains a list of all the scan ids in the dataset. 
26 | -------------------------------------------------------------------------------- /simulator/connectivity/YmJkqBEsHnH_connectivity.json: -------------------------------------------------------------------------------- 1 | [{"image_id":"006933a75f764c5485cf284bea0ded0b","pose":[0.210914,-0.00824746,-0.977469,-7.64722,0.977278,0.0232484,0.210677,-2.15553,0.0209873,-0.999695,0.0129646,1.56695,0,0,0,1],"included":true,"visible":[false,false,true,false,true,true,false,true,true,true,false],"unobstructed":[false,false,false,false,false,false,false,true,true,false,false],"height":1.524793092035509},{"image_id":"e4ede0695e4e4a77aae8537abb9f11d3","pose":[-0.0422212,-0.0176246,-0.998952,-0.133122,0.998904,0.0194092,-0.0425613,-0.0184591,0.0201393,-0.999656,0.016787,1.48352,0,0,0,1],"included":true,"visible":[false,false,true,true,false,false,false,false,false,false,false],"unobstructed":[false,false,true,false,false,true,false,false,false,false,false],"height":1.5227398475592409},{"image_id":"d471e89e00be49f49a7ecace814d60bf","pose":[0.426939,-0.00370058,-0.904272,-0.421886,0.904055,0.0239963,0.426739,-2.12366,0.0201203,-0.999705,0.0135916,1.49477,0,0,0,1],"included":true,"visible":[true,true,false,true,true,true,false,true,true,true,false],"unobstructed":[false,true,false,true,false,true,false,false,false,false,false],"height":1.5263900136377955},{"image_id":"b34af02ce9b642ebbd0c7e9e0ba3b553","pose":[0.960272,0.00870611,-0.278924,-0.0905727,0.278755,0.0168277,0.960214,-3.55265,0.0130537,-0.99982,0.0137334,1.49061,0,0,0,1],"included":true,"visible":[true,true,true,false,false,false,false,false,false,false,false],"unobstructed":[false,false,true,false,false,true,false,false,false,false,false],"height":1.5323637229797105},{"image_id":"01c80b5f8fbd4c969ee0bc03f1ec7a6c","pose":[0.359562,-0.0105291,-0.933061,-3.77309,0.932771,0.0313799,0.359097,-2.1838,0.0254987,-0.999452,0.0211054,1.53932,0,0,0,1],"included":true,"visible":[true,false,true,false,false,true,false,true,true,true,false],"unobstructed":[false,false,false,false,false,true,false,true,false,false,false],"height":1.5286629461398107},{"image_id":"82ea5baa30f945fe98f6cad3064af847","pose":[0.0376233,-0.0115611,-0.999224,-2.01669,0.998821,0.0310955,0.0372487,-2.16965,0.030641,-0.999449,0.0127185,1.50807,0,0,0,1],"included":true,"visible":[true,true,true,true,true,false,false,true,true,true,false],"unobstructed":[false,true,true,true,true,false,false,false,false,false,false],"height":1.5253207999550662},{"image_id":"aecbb791f30b452a9236c5a8c7030663","pose":[0.296076,-0.0242641,-0.954855,-13.5955,0.955111,0.0179483,0.2957,-2.22547,0.00996343,-0.999544,0.0284901,1.59272,0,0,0,1],"included":true,"visible":[true,false,true,false,true,true,false,true,true,true,true],"unobstructed":[false,false,false,false,false,false,false,false,false,true,true],"height":1.7557263982456066},{"image_id":"d841f7b710f9470796d55561f8f524db","pose":[0.270437,0.002913,-0.962732,-5.77716,0.962325,0.0284129,0.27041,-2.21321,0.028142,-0.999591,0.00488176,1.55947,0,0,0,1],"included":true,"visible":[true,false,true,false,true,true,false,false,true,true,false],"unobstructed":[true,false,false,false,true,false,false,false,false,false,false],"height":1.5357935019251416},{"image_id":"8e38fdd81c7949db9646968bafbbdcfc","pose":[-0.00277118,-0.0169575,-0.999852,-9.93905,0.999791,0.020127,-0.00311204,-2.17463,0.0201771,-0.999653,0.0168993,1.60592,0,0,0,1],"included":true,"visible":[true,false,true,false,true,true,false,true,false,true,true],"unobstructed":[true,false,false,false,false,fa
lse,false,false,false,true,false],"height":1.5208970888736792},{"image_id":"20fd759be0b64fc9aa96d290f0a704ec","pose":[0.227815,0.0117555,-0.973633,-12.1161,0.973367,0.0235263,0.228037,-2.15724,0.025587,-0.999654,-0.00608172,1.59969,0,0,0,1],"included":true,"visible":[true,false,true,false,true,true,true,true,true,false,true],"unobstructed":[false,false,false,false,false,false,true,false,true,false,false],"height":1.5261379179165138},{"image_id":"d838acff82244c2da0cf2651e54966cb","pose":[0.310234,-0.0632421,-0.948553,-15.2317,0.950604,0.0313736,0.308813,-2.28133,0.0102298,-0.997504,0.0698525,0.902626,0,0,0,1],"included":true,"visible":[true,false,true,false,true,true,true,true,true,true,false],"unobstructed":[false,false,false,false,false,false,true,false,false,false,false],"height":1.558854711359605}] -------------------------------------------------------------------------------- /simulator/connectivity/gZ6f7yhEvPG_connectivity.json: -------------------------------------------------------------------------------- 1 | [{"image_id":"80929af5cf234ae38ac3a2a4e60e4342","pose":[0.983395,0.00450812,-0.181418,-2.79247,0.181442,-0.00570068,0.983385,-1.38801,0.00339928,-0.999973,-0.00642298,1.42676,0,0,0,1],"included":true,"visible":[false,true,true,false,false,true,false,false],"unobstructed":[false,true,false,true,false,true,false,false],"height":1.4191402375960298},{"image_id":"ba27da20782d4e1a825f0a133ad84da9","pose":[-0.7605,-0.0115739,-0.649234,-2.38988,0.648885,0.0237502,-0.760515,-0.0538717,0.0242219,-0.999651,-0.0105509,1.4341,0,0,0,1],"included":true,"visible":[true,false,true,true,false,true,false,true],"unobstructed":[true,false,false,false,false,true,false,true],"height":1.424939020658826},{"image_id":"46cecea0b30e4786b673f5e951bf82d4","pose":[0.593129,0.0137361,-0.80499,0.99933,0.804932,0.010707,0.59327,1.17558,0.0167685,-0.999848,-0.00470498,1.41684,0,0,0,1],"included":true,"visible":[false,false,false,true,true,false,true,true],"unobstructed":[false,false,false,true,true,false,true,true],"height":1.4252108727703763},{"image_id":"bda7a9e6d1d94b3aa8ff491beb158f3a","pose":[-0.378592,-0.0208239,0.925329,-0.182918,-0.925433,-0.00820128,-0.37882,-1.72967,0.0154776,-0.999749,-0.0161651,1.42205,0,0,0,1],"included":true,"visible":[true,false,true,false,true,false,true,true],"unobstructed":[true,false,true,false,true,false,false,true],"height":1.42983949725488},{"image_id":"dbb2f8000bc04b3ebcd0a55112786149","pose":[-0.595363,0.00457706,-0.803444,1.10196,0.803383,0.0168543,-0.595222,-1.10724,0.0108174,-0.999847,-0.0137106,1.41536,0,0,0,1],"included":true,"visible":[false,false,true,true,false,false,true,true],"unobstructed":[false,false,true,true,false,false,true,true],"height":1.4186255623107038},{"image_id":"29b20fa80dcd4771974303c1ccd8953f","pose":[0.292738,0.0164579,-0.956051,-2.77306,0.956096,0.0090939,0.292909,1.55377,0.0135152,-0.999823,-0.0130722,1.43367,0,0,0,1],"included":true,"visible":[true,true,true,false,true,false,false,false],"unobstructed":[true,true,false,false,false,false,false,false],"height":1.4237594118402337},{"image_id":"0ee20663dfa34b438d48750ddcd7366c","pose":[-0.75968,-0.0019971,-0.650293,-0.111567,0.650131,0.0201598,-0.759554,1.31337,0.014627,-0.999794,-0.0140156,1.42291,0,0,0,1],"included":true,"visible":[false,false,true,true,true,false,false,true],"unobstructed":[false,false,true,false,true,false,false,true],"height":1.4276556862049736},{"image_id":"47d8a8282c1c4a7fb3eeeacc45e9d959","pose":[-0.0254788,0.00643152,-0.999654,-0.0034508,0.999603,0.0120797,-0.0253995,0
.0112371,0.0119124,-0.999906,-0.00673574,1.42388,0,0,0,1],"included":true,"visible":[true,true,true,true,true,false,true,false],"unobstructed":[false,true,true,true,true,false,true,false],"height":1.4268855357216241}] -------------------------------------------------------------------------------- /simulator/connectivity/pLe4wQe7qrG_connectivity.json: -------------------------------------------------------------------------------- 1 | [{"image_id":"e4c0a4ec08104bf5ada134b123fa53e7","pose":[-0.133089,0.0111501,-0.991041,1.16811,0.991028,0.0137789,-0.132932,-2.20571,0.0121736,-0.999843,-0.0128829,1.54855,0,0,0,1],"included":true,"visible":[false,false,true,false,true,false,true,true,false,true,false,false,true,false,true,false,false,false,false,false,false,true,false,true,true,true,true,true,false,false,true],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,true,false,false,true,false,false,false,false,false,false],"height":1.5280399019555968},{"image_id":"959ea6af304a4339bbc5d97f044d11c3","pose":[0.312992,0.0130519,-0.949666,2.47951,0.948724,0.0422726,0.313263,-2.23387,0.0442338,-0.999021,0.000849325,1.58243,0,0,0,1],"included":true,"visible":[false,false,true,true,false,false,false,false,false,true,false,true,true,false,true,true,false,false,false,false,false,false,true,true,true,true,true,false,true,false,true],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,true,false,false],"height":1.5361363756730164},{"image_id":"ffe0e6835287419c9cfe343e9d613d87","pose":[-0.802259,-0.00971694,-0.596896,5.96539,0.59688,0.00470064,-0.802316,-2.03323,0.0106021,-0.999941,0.00202973,1.57957,0,0,0,1],"included":true,"visible":[false,true,false,false,false,false,true,false,false,false,false,false,true,true,false,true,false,false,false,false,false,true,true,false,false,false,true,false,false,true,true],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,true,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true],"height":1.518586128876891},{"image_id":"47a69295198f4265958b9b1d497c328d","pose":[-0.90497,-0.00981301,-0.42536,2.46799,0.425363,0.00186582,-0.90502,2.04203,0.00967489,-0.99995,0.0024866,1.55214,0,0,0,1],"included":true,"visible":[false,true,false,false,false,true,true,false,true,false,false,true,false,false,false,true,false,false,true,true,true,false,false,false,true,false,false,true,true,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false],"height":1.5121750884423606},{"image_id":"3dfe07714b2f49d88bd4c8749e8bb0b7","pose":[-0.979561,-0.00709476,0.201019,-1.64821,-0.200975,-0.00640329,-0.979575,0.566531,0.0082373,-0.999954,0.00484756,1.56065,0,0,0,1],"included":true,"visible":[true,false,false,false,false,false,true,true,false,false,false,true,false,true,false,true,true,false,false,false,false,true,true,true,true,true,false,true,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,true,false,false,false,false,true,false,false,true,true,false,true,false,false,false],"height":1.5157095354765127},{"image_id":"87407bb6ed614926b91fc3e27eab766e","pose":[0.22909,0.03
01697,-0.972937,4.56488,0.973286,0.00848048,0.229435,2.04904,0.0151732,-0.999508,-0.02742,1.5442,0,0,0,1],"included":true,"visible":[false,false,false,true,false,false,true,false,false,true,false,false,false,false,true,true,false,false,true,true,true,false,true,false,false,false,false,false,true,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true,false,false,false,false,false,false,false,false,false,false],"height":1.5111934219678684},{"image_id":"530f8e4126b14082a5c4ff6c3f6ae7cd","pose":[-0.172634,-0.00379856,-0.984978,8.51758,0.984978,0.00322887,-0.172647,0.14365,0.00383645,-0.999987,0.0031851,1.4578,0,0,0,1],"included":true,"visible":[false,false,true,false,true,false,false,false,false,false,true,false,false,true,false,false,false,false,false,false,false,false,true,true,false,false,false,false,false,true,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,true,false],"height":1.5362285111230571},{"image_id":"96782d3925ec4088ab224cdc92a4fd6a","pose":[-0.216113,-0.00838211,-0.976332,1.24213,0.976316,0.00844697,-0.216182,2.38931,0.0100594,-0.999929,0.00635911,1.53856,0,0,0,1],"included":true,"visible":[true,false,false,false,true,true,true,false,true,false,false,false,false,true,false,false,true,false,true,true,false,true,true,true,false,false,false,true,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true,false,false,false,false,false,true,false,false,false],"height":1.5135335729735602},{"image_id":"2dcc9c6ca2d44d5080a0a7e7b7fb9c4d","pose":[-0.951188,-0.00996649,-0.308449,-1.21085,0.308409,0.00538007,-0.951238,2.40322,0.0111403,-0.999936,-0.00204269,1.55952,0,0,0,1],"included":true,"visible":[false,false,false,false,false,false,false,true,false,true,false,false,false,false,true,false,true,true,true,false,false,true,false,false,false,true,false,true,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,true,false,true,false,false,false],"height":1.5317176811699114},{"image_id":"0d704acada9041c48621c5d01d775da0","pose":[0.884279,0.0143861,0.466735,-1.34535,-0.466608,-0.0113974,0.88439,-2.3821,0.0180428,-0.999831,-0.00336482,1.52522,0,0,0,1],"included":true,"visible":[true,false,false,false,false,false,false,false,true,false,false,false,false,false,true,false,true,true,false,false,false,true,false,false,true,true,false,false,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,true,false,false,false,false,false],"height":1.5405532836763522},{"image_id":"2cbd295d838b4c51b5590dcf2a37fba0","pose":[0.246342,0.0412581,-0.968304,4.76599,0.96868,0.0216735,0.247362,0.169153,0.0311925,-0.998913,-0.0346258,1.42661,0,0,0,1],"included":true,"visible":[false,false,false,false,false,false,true,false,false,false,false,false,false,true,false,false,false,false,false,false,false,true,true,true,false,false,false,false,true,true,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,true,true,false,false,false,false,false,false,false,true,false,true,false,false,false,f
alse,true,true,false],"height":1.5180090338091925},{"image_id":"6fbd170d8df746b0b10e3801e2dad706","pose":[-0.872353,-0.0000202749,0.488874,3.49156,-0.488854,-0.00892582,-0.872319,0.121306,0.00438157,-0.99996,0.00777758,1.41535,0,0,0,1],"included":true,"visible":[false,true,false,false,true,false,false,false,false,false,false,false,false,false,false,true,false,true,false,false,false,true,false,false,false,true,false,false,true,false,false],"unobstructed":[false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false],"height":1.5371204380160495},{"image_id":"31d308fee8284a168c28e238cf814363","pose":[0.998122,0.0164352,-0.0590029,6.9369,0.0592246,-0.0133283,0.998155,-2.13031,0.0156188,-0.999776,-0.0142757,1.58199,0,0,0,1],"included":true,"visible":[false,true,true,false,false,false,false,false,false,false,true,false,false,true,false,true,false,false,true,false,false,true,true,true,false,false,true,false,false,true,true],"unobstructed":[false,false,true,false,false,false,false,false,false,false,true,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false],"height":1.5115252320863801},{"image_id":"789faffd87b949fd9ed7e6df4fadc2f1","pose":[0.998352,0.0156401,-0.0551931,6.89589,0.0551612,0.00248225,0.998474,-1.07864,0.0157535,-0.999874,0.00161644,1.58253,0,0,0,1],"included":true,"visible":[false,false,true,false,true,false,true,false,false,false,true,false,true,false,false,true,false,false,true,false,false,true,true,true,false,false,false,false,false,true,false],"unobstructed":[false,false,true,false,false,false,true,false,false,false,true,false,true,false,false,false,false,false,false,false,false,false,true,true,false,false,false,false,false,true,false],"height":1.5156362905724483},{"image_id":"a26b0e83785f45d484e5f9b83fdb4df3","pose":[0.784717,-0.00024993,0.619854,-0.356288,-0.619842,-0.00640294,0.7847,-1.3696,0.00377304,-0.999979,-0.0051784,1.5663,0,0,0,1],"included":true,"visible":[true,false,true,false,true,false,false,false,false,true,false,false,true,false,false,true,true,true,false,false,false,true,false,false,true,true,false,true,false,false,false],"unobstructed":[true,false,false,false,true,false,false,false,false,true,false,false,false,false,false,true,true,true,false,false,false,true,false,false,true,true,false,false,false,false,false],"height":1.5217725369665362},{"image_id":"df0b69b34d04453691b72a6c16923756","pose":[0.00951654,-0.00498874,-0.999942,2.41189,0.999919,0.00833186,0.00947506,0.0914117,0.00828438,-0.999952,0.00506864,1.42153,0,0,0,1],"included":true,"visible":[false,true,false,true,true,false,false,false,false,false,true,true,false,true,true,false,true,true,false,false,false,true,false,false,false,true,false,false,true,false,false],"unobstructed":[false,false,false,true,false,false,false,false,false,false,false,true,false,false,true,false,true,false,false,false,false,true,false,false,false,false,false,false,true,false,false],"height":1.5270023190896223},{"image_id":"d7d0e431bbfa40429a561060150f24cb","pose":[0.999351,0.0057182,0.0355512,-0.337565,-0.0355828,0.00559738,0.999351,1.14528,0.00551577,-0.999968,0.00579823,1.55634,0,0,0,1],"included":true,"visible":[false,false,false,false,true,false,true,true,true,false,false,false,false,true,true,true,false,true,false,false,false,true,true,false,true,false,false,true,false,false,false],"unobstructed":[false,false,false,false,true,false,false,true,t
rue,false,false,false,false,false,true,true,false,true,false,false,false,true,false,false,false,false,false,true,false,false,false],"height":1.5126864275679581},{"image_id":"8f17854feb134826ae42e16b303e7445","pose":[-0.04737,0.0249555,-0.998565,-0.00382618,0.998875,0.00294013,-0.0473109,-0.017549,0.00175551,-0.999684,-0.0250657,1.55087,0,0,0,1],"included":true,"visible":[false,false,false,false,false,false,false,false,true,true,false,true,false,true,true,true,true,false,false,false,false,true,false,false,true,false,false,true,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,true,false,false,false,false,true,false,false,false,false,false,true,false,false,false],"height":1.5136058544662168},{"image_id":"d0584db5d0ba41ee955f6c91195afcb3","pose":[-0.0387735,-0.000627238,0.999248,6.85886,-0.999187,-0.0109357,-0.0387783,2.09848,0.0109521,-0.99994,-0.000201698,1.56982,0,0,0,1],"included":true,"visible":[false,false,false,true,false,true,false,false,false,false,false,false,true,true,true,true,false,false,false,true,true,true,true,true,false,false,false,false,false,true,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false,true,false],"height":1.5123722877852799},{"image_id":"87491cd48b094270a2a1aa682b8a770c","pose":[0.995378,0.0106665,0.0954335,5.60063,-0.0953334,-0.00948957,0.9954,2.17887,0.0115233,-0.999898,-0.00842783,1.55259,0,0,0,1],"included":true,"visible":[false,false,false,true,false,true,true,false,false,false,true,true,true,true,false,true,false,false,true,false,true,true,true,true,false,false,false,false,true,false,false],"unobstructed":[false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,false],"height":1.5096271733017124},{"image_id":"8a65d3586fed4c5f9e0f28fc184b3ff2","pose":[0.999328,0.0243579,-0.0273564,3.25097,0.0277536,-0.016113,0.999485,2.12641,0.0239048,-0.999573,-0.0167772,1.55627,0,0,0,1],"included":true,"visible":[false,false,false,true,false,true,true,false,true,false,true,true,true,true,false,false,false,false,true,true,false,false,false,true,false,false,false,false,false,false,false],"unobstructed":[false,false,false,true,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.5216447032258948},{"image_id":"eb464984cc4847d2a61eab27e3e31e51","pose":[0.317487,0.0187868,-0.948076,1.37215,0.94826,-0.0045702,0.317459,0.120026,0.0016314,-0.999813,-0.0192648,1.55431,0,0,0,1],"included":true,"visible":[true,false,false,false,true,false,false,true,true,true,false,true,false,true,true,true,true,true,false,false,false,false,false,false,true,true,false,false,false,false,false],"unobstructed":[true,false,false,false,true,false,false,true,false,false,true,false,false,false,true,true,true,true,false,false,false,false,false,false,true,true,false,false,false,false,false],"height":1.5187432392237161},{"image_id":"ce103547e620457f935a63050cea57b3","pose":[-0.926095,-0.0151941,-0.376983,7.37065,0.376978,0.00327303,-0.926216,0.160002,0.0153072,-0.999879,0.00269771,1.43016,0,0,0,1],"included":true,"visible":[false,false,true,false,true,false,true,false,false,false,true,false,true,true,false,false,false,false,true,false,false,false,false,tr
ue,false,true,false,false,false,true,false],"unobstructed":[false,false,false,false,false,false,true,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,true,false],"height":1.5228214121764414},{"image_id":"fa48c6f958304aa8a8f765a72fe7e8d5","pose":[-0.994837,-0.00721806,0.101218,6.07693,-0.101252,0.00455002,-0.99485,0.0491342,0.00672061,-0.999963,-0.00525636,1.42403,0,0,0,1],"included":true,"visible":[false,false,false,false,true,false,true,false,false,false,true,false,true,true,false,false,false,false,false,false,false,false,true,false,false,true,false,false,false,true,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,true,false,true,true,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,true,false],"height":1.520425902170783},{"image_id":"50be95bc6efb466c90867d52cf32ba3f","pose":[0.803639,0.00102907,-0.595115,-0.280264,0.595001,0.0182495,0.803517,-2.40583,0.0116877,-0.999833,0.0140547,1.54308,0,0,0,1],"included":true,"visible":[true,false,true,false,true,false,false,false,false,true,false,false,true,false,true,false,true,true,false,false,false,true,true,false,false,true,false,true,false,false,false],"unobstructed":[true,false,false,false,true,false,false,false,false,true,false,false,false,false,true,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false],"height":1.5259856691595353},{"image_id":"91d1554c155e4185a8c69636d47fd58d","pose":[0.7634,0.00593063,0.645898,-1.49105,-0.645812,-0.0117048,0.763406,-0.563949,0.0120878,-0.999914,-0.00510434,1.56479,0,0,0,1],"included":true,"visible":[true,false,false,false,true,false,true,true,true,true,false,true,false,false,true,true,false,false,false,false,false,true,true,true,true,false,false,true,false,false,false],"unobstructed":[false,false,false,false,true,false,false,false,true,true,false,false,false,false,true,false,false,false,false,false,false,true,false,false,true,false,false,true,false,false,false],"height":1.5123581928141085},{"image_id":"5d4349e09ada47b0aa8b20a0d22c54ca","pose":[0.0797542,0.0285043,-0.996407,3.62156,0.996744,0.00951931,0.080054,-2.10242,0.0117672,-0.999548,-0.0276513,1.56537,0,0,0,1],"included":true,"visible":[false,true,true,false,false,false,false,false,false,true,true,true,true,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true,false,true],"unobstructed":[false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,true],"height":1.5223704869964667},{"image_id":"298e09e5e1144e7b9762747370ca68a5","pose":[0.31306,-0.00832259,-0.949696,0.0361493,0.949732,0.00181293,0.313056,2.42577,-0.000883427,-0.999963,0.0084728,1.55565,0,0,0,1],"included":true,"visible":[true,false,false,false,true,false,true,true,true,false,false,false,false,true,true,false,true,true,true,true,false,false,false,true,true,true,false,false,false,false,false],"unobstructed":[false,false,false,false,true,false,false,true,true,false,false,false,false,false,false,false,true,true,false,false,false,false,false,false,false,true,false,false,false,false,false],"height":1.5224640014863746},{"image_id":"f8e13e216dd6477ea05e694e2f1478d9","pose":[0.998766,0.0109404,-0.0484187,2.48582,0.0482994,0.0109393,0.998773,-1.19789,0.0114569,-0.99988,0.0103984,1.57265,0,0,0,1],"included":true,"visible":[false,true,false,true,true,fals
e,true,false,true,true,true,true,false,false,false,true,true,true,false,false,false,false,true,true,true,false,true,false,false,false,true],"unobstructed":[false,true,false,false,false,false,false,false,false,false,true,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false],"height":1.5206684141424807},{"image_id":"e5f7cab8517b47399eda8866f0e30ab3","pose":[-0.660778,-0.00608519,-0.750556,7.08848,0.750578,-0.00299603,-0.660773,1.44662,0.00177251,-0.999977,0.00654814,1.57334,0,0,0,1],"included":true,"visible":[false,false,false,false,false,true,true,false,false,false,true,false,true,true,false,false,false,false,true,true,false,false,true,true,false,true,false,false,true,false,false],"unobstructed":[false,false,false,false,false,false,true,false,false,false,true,false,false,true,false,false,false,false,true,true,false,false,true,true,false,false,false,false,false,false,false],"height":1.5050461478205863},{"image_id":"a924a5855b954d68b26ebe82ab61c71d","pose":[-0.120428,-0.000846936,-0.992721,4.79789,0.992705,0.00559062,-0.12043,-2.05172,0.0056522,-0.999984,0.000168504,1.57612,0,0,0,1],"included":true,"visible":[false,true,true,false,false,false,true,false,true,false,true,false,true,false,false,false,true,false,false,false,false,true,true,true,false,false,true,false,false,true,false],"unobstructed":[false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false],"height":1.5244946264278192}] -------------------------------------------------------------------------------- /simulator/connectivity/scans.txt: -------------------------------------------------------------------------------- 1 | 17DRP5sb8fy 2 | 1LXtFkjw3qL 3 | 1pXnuDYAj8r 4 | 29hnd4uzFmX 5 | 2azQ1b91cZZ 6 | 2n8kARJN3HM 7 | 2t7WUuJeko7 8 | 5LpN3gDmAk7 9 | 5q7pvUzZiYa 10 | 5ZKStnWn8Zo 11 | 759xd9YjKW5 12 | 7y3sRwLe3Va 13 | 8194nk5LbLH 14 | 82sE5b5pLXE 15 | 8WUmhLawc2A 16 | aayBHfsNo7d 17 | ac26ZMwG7aT 18 | ARNzJeq3xxb 19 | B6ByNegPMKs 20 | b8cTxDM8gDG 21 | cV4RVeZvu5T 22 | D7G3Y4RVNrH 23 | D7N2EKCX4Sj 24 | dhjEzFoUFzH 25 | E9uDoFAP3SH 26 | e9zR4mvMWw7 27 | EDJbREhghzL 28 | EU6Fwq7SyZv 29 | fzynW3qQPVF 30 | GdvgFV5R1Z5 31 | gTV8FGcVJC9 32 | gxdoqLR6rwA 33 | gYvKGZ5eRqb 34 | gZ6f7yhEvPG 35 | HxpKQynjfin 36 | i5noydFURQK 37 | JeFG25nYj2p 38 | JF19kD82Mey 39 | jh4fc5c5qoQ 40 | JmbYfDe2QKZ 41 | jtcxE69GiFV 42 | kEZ7cmS4wCh 43 | mJXqzFtmKg4 44 | oLBMNvg9in8 45 | p5wJjkQkbXX 46 | pa4otMbVnkk 47 | pLe4wQe7qrG 48 | Pm6F8kyY3z2 49 | pRbA3pwrgk9 50 | PuKPg4mmafe 51 | PX4nDJXEHrG 52 | q9vSo1VnCiC 53 | qoiz87JEwZ2 54 | QUCTc6BB5sX 55 | r1Q1Z4BcV1o 56 | r47D5H71a5s 57 | rPc6DW4iMge 58 | RPmz2sHmrrY 59 | rqfALeAoiTq 60 | s8pcmisQ38h 61 | S9hNv5qa7GM 62 | sKLMLpTHeUy 63 | SN83YJsR3w2 64 | sT4fr6TAbpF 65 | TbHJrupSAjP 66 | ULsKaCPVFJR 67 | uNb9QFRL6hY 68 | ur6pFq6Qu1A 69 | UwV83HsGsw3 70 | Uxmj2M2itWa 71 | V2XKFyX4ASd 72 | VFuaQ6m2Qom 73 | VLzqgDo317F 74 | Vt2qJdWjCF2 75 | VVfe2KiqLaN 76 | Vvot9Ly1tCj 77 | vyrNrziPKCB 78 | VzqfbhrpDEA 79 | wc2JMjhGNzB 80 | WYY7iVyf5p8 81 | X7HyMhZNoso 82 | x8F5xyUWy9e 83 | XcA2TqTSSAj 84 | YFuZgdQ5vWj 85 | YmJkqBEsHnH 86 | yqstnuAEVhm 87 | YVUC4YcDtcY 88 | Z6MFQCViBuw 89 | ZMojNkEp431 90 | zsNo4HB9uLZ 91 | -------------------------------------------------------------------------------- /simulator/envs/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 
sys.path.append('simulator/envs') 4 | -------------------------------------------------------------------------------- /simulator/envs/env.py: -------------------------------------------------------------------------------- 1 | ''' Batched Room-to-Room navigation environment ''' 2 | 3 | import numpy as np 4 | import json 5 | import networkx as nx 6 | import os 7 | import torch 8 | 9 | from collections import namedtuple 10 | from envs_utils import load_nav_graphs, structured_map 11 | from model.cuda import try_cuda 12 | 13 | ANGLE_INC = np.pi / 6. 14 | WorldState = namedtuple( 15 | "WorldState", 16 | ["scan_id", "viewpoint_id", "view_index", "heading", "elevation"] 17 | ) 18 | 19 | 20 | class EnvBatch(): 21 | ''' A simple wrapper for a batch of MatterSim environments, 22 | using discretized viewpoints and pretrained features, 23 | using an adjacency dictionary to replace the MatterSim simulator 24 | ''' 25 | 26 | def __init__(self, adj_dict=None): 27 | self.adj_dict = adj_dict 28 | assert adj_dict is not None, "Error! No adjacency dictionary!" 29 | 30 | def get_start_state(self, scan_ids, viewpoint_ids, headings): 31 | def f(scan_id, viewpoint_id, heading): 32 | elevation = 0 33 | view_index = (12 * round(elevation / ANGLE_INC + 1) 34 | + round(heading / ANGLE_INC) % 12) 35 | return WorldState(scan_id=scan_id, 36 | viewpoint_id=viewpoint_id, 37 | view_index=view_index, 38 | heading=heading, 39 | elevation=elevation) 40 | 41 | return structured_map(f, scan_ids, viewpoint_ids, headings) 42 | 43 | def get_adjs(self, world_states): 44 | def f(world_state): 45 | query = '_'.join([world_state.scan_id, 46 | world_state.viewpoint_id, 47 | str(world_state.view_index)]) 48 | return self.adj_dict[query] 49 | 50 | return structured_map(f, world_states) 51 | 52 | def make_actions(self, world_states, actions, attrs): 53 | def f(world_state, action, loc_attrs): 54 | if action == 0: 55 | return world_state 56 | else: 57 | loc_attr = loc_attrs[action] 58 | return WorldState(scan_id=world_state.scan_id, 59 | viewpoint_id=loc_attr['nextViewpointId'], 60 | view_index=loc_attr['absViewIndex'], 61 | heading=(loc_attr['absViewIndex'] % 12) * ANGLE_INC, 62 | elevation=(loc_attr['absViewIndex'] // 12 - 1) 63 | * ANGLE_INC) 64 | 65 | return structured_map(f, world_states, actions, attrs) 66 | 67 | 68 | class RoomEnv(): 69 | ''' Implements the R2R (R4R, R6R, etc.) navigation task, 70 | using discretized viewpoints and pretrained features. 71 | ''' 72 | 73 | @staticmethod 74 | def load_adj_feature(adj_list_file): 75 | with open(adj_list_file, 'r') as f: 76 | adj_dict = json.load(f) 77 | return adj_dict 78 | 79 | @staticmethod 80 | def load_graphs(): 81 | ''' Load connectivity graph for each scan. 
''' 82 | scans = [] 83 | for file in os.listdir("simulator/connectivity"): 84 | if file.endswith(".json"): 85 | scans.append(file.split('_')[0]) 86 | print('Loading navigation graphs for %d scans' % len(scans)) 87 | graphs = load_nav_graphs(scans) 88 | paths = {} 89 | matrix = {} 90 | states_map = {} 91 | distances = {} 92 | for scan, G in graphs.items(): # compute all shortest paths 93 | paths[scan] = dict(nx.all_pairs_dijkstra_path(G)) 94 | matrix[scan] = nx.to_numpy_matrix(G) 95 | states_map[scan] = list(G.nodes) 96 | distances[scan] = dict(nx.all_pairs_dijkstra_path_length(G)) 97 | return paths, states_map, distances 98 | 99 | @staticmethod 100 | def make_state_embeddings(args, states_map, image_features_list): 101 | state_embedding = {} 102 | for scan, state_list in states_map.items(): 103 | embedding = np.zeros((len(state_list), args.num_views, 104 | args.mean_pooled_dim)) 105 | for i, state in enumerate(state_list): 106 | fake_state = {'scan_id': scan, 107 | 'viewpoint_id': state} 108 | feature = [featurizer.get_features(fake_state) 109 | for featurizer in image_features_list][0] 110 | embedding[i] = feature 111 | state_embedding[scan] = torch.from_numpy(embedding).float() 112 | return state_embedding 113 | 114 | @staticmethod 115 | def build_viewpoint_loc_embedding(args, view_index): 116 | """ 117 | Position embedding: heading 64D + elevation 64D 118 | 1) heading: [sin(heading) for _ in range(1, 33)] + 119 | [cos(heading) for _ in range(1, 33)] 120 | 2) elevation: [sin(elevation) for _ in range(1, 33)] + 121 | [cos(elevation) for _ in range(1, 33)] 122 | """ 123 | embedding = np.zeros((args.num_views, 128), np.float32) 124 | for abs_view_index in range(args.num_views): 125 | rel_view_index = (abs_view_index - view_index) % 12 \ 126 | + (abs_view_index // 12) * 12 127 | rel_heading = (rel_view_index % 12) * ANGLE_INC 128 | rel_elevation = (rel_view_index // 12 - 1) * ANGLE_INC 129 | embedding[abs_view_index, 0:32] = np.sin(rel_heading) 130 | embedding[abs_view_index, 32:64] = np.cos(rel_heading) 131 | embedding[abs_view_index, 64:96] = np.sin(rel_elevation) 132 | embedding[abs_view_index, 96:] = np.cos(rel_elevation) 133 | return torch.from_numpy(embedding).float() 134 | 135 | def __init__(self, args, paths, states_map, distances, state_embedding, 136 | loc_embeddings, adj_dict): 137 | self.env = EnvBatch(adj_dict=adj_dict) 138 | self.margin = 3.0 139 | self.paths = paths 140 | self.states_map = states_map 141 | self.distances = distances 142 | self.state_embedding = state_embedding 143 | self.loc_embeddings = loc_embeddings 144 | self.padding_action = try_cuda(torch.zeros(args.action_embed_size)) 145 | self.padding_feature = try_cuda(torch.zeros(args.num_views, 146 | args.action_embed_size)) 147 | self.shrink = 10 # shrink distance 10 times 148 | 149 | def _build_action_embedding(self, adj_loc_list, feature): 150 | feature_adj = feature[[adj_dict['absViewIndex'] 151 | for adj_dict in adj_loc_list]] 152 | feature_adj[0] = 0 153 | embedding = np.zeros((len(adj_loc_list), 128), np.float32) 154 | for a, adj_dict in enumerate(adj_loc_list): 155 | if a == 0: 156 | continue 157 | else: 158 | rel_heading = adj_dict['rel_heading'] 159 | rel_elevation = adj_dict['rel_elevation'] 160 | embedding[a][0:32] = np.sin(rel_heading) 161 | embedding[a][32:64] = np.cos(rel_heading) 162 | embedding[a][64:96] = np.sin(rel_elevation) 163 | embedding[a][96:] = np.cos(rel_elevation) 164 | angle_embed = torch.from_numpy(embedding).float() 165 | return try_cuda(torch.cat((feature_adj, angle_embed), dim=-1)) 
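The two methods above (build_viewpoint_loc_embedding and _build_action_embedding) share the same 128-dimensional angle layout: four 32-dimensional blocks holding sin/cos of the relative heading and elevation, with ANGLE_INC = pi/6 (30 degrees per discrete step, 12 headings x 3 elevations = 36 views). Below is a minimal standalone sketch of that layout; it is not part of env.py, and the name angle_embedding is illustrative only.

import numpy as np

ANGLE_INC = np.pi / 6.  # 30 degrees between neighbouring discrete headings/elevations

def angle_embedding(rel_heading, rel_elevation):
    # Same block layout as build_viewpoint_loc_embedding / _build_action_embedding:
    # [sin(h)]*32 + [cos(h)]*32 + [sin(e)]*32 + [cos(e)]*32
    emb = np.zeros(128, np.float32)
    emb[0:32] = np.sin(rel_heading)
    emb[32:64] = np.cos(rel_heading)
    emb[64:96] = np.sin(rel_elevation)
    emb[96:] = np.cos(rel_elevation)
    return emb

# e.g. a neighbouring location two heading steps to the right and one elevation step up
e = angle_embedding(2 * ANGLE_INC, 1 * ANGLE_INC)
assert e.shape == (128,)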
166 | 167 | def _build_feature_embedding(self, view_index, feature): 168 | angle_embed = self.loc_embeddings[view_index] 169 | return try_cuda(torch.cat((feature, angle_embed), dim=-1)) 170 | 171 | def _shortest_path_action(self, state, adj_loc_list, goal_id): 172 | ''' Determine next action on the shortest path to goal, for supervised training. ''' 173 | if state.viewpoint_id == goal_id: 174 | return 0 175 | for n_a, loc_attr in enumerate(adj_loc_list): 176 | if loc_attr['nextViewpointId'] == goal_id: 177 | return n_a 178 | path = self.paths[state.scan_id][state.viewpoint_id][goal_id] 179 | next_viewpoint_id = path[1] 180 | for n_a, loc_attr in enumerate(adj_loc_list): 181 | if loc_attr['nextViewpointId'] == next_viewpoint_id: 182 | return n_a 183 | 184 | # Next viewpoint_id not found! This should not happen! 185 | print('adj_loc_list:', adj_loc_list) 186 | print('next_viewpoint_id:', next_viewpoint_id) 187 | print('longId:', '{}_{}'.format(state.scan_id, state.viewpoint_id)) 188 | raise Exception('Error: next_viewpoint_id not in adj_loc_list') 189 | 190 | def _observe(self, world_states, include_feature=True, 191 | include_teacher=True, step=None): 192 | """ 193 | Return the observations of a batch of states 194 | :param world_states: states defined as a namedtuple 195 | :param done: has done, no need to provide ob 196 | :param include_feature: whether or not to return the pretrained features 197 | :param include_teacher: whether or not to return a teacher viewpoint and 198 | teacher action (for supervision) 199 | :param step: step number in the gold trajectory 200 | :return: a list of observations, each is a dictionary 201 | """ 202 | obs = [] 203 | for i, adj_loc_list in enumerate(self.env.get_adjs(world_states)): 204 | item = self.batch[i] 205 | state = self.world_states[i] 206 | ob = { 207 | 'scan': state.scan_id, 208 | 'viewpoint': state.viewpoint_id, 209 | 'view_index': state.view_index, 210 | 'heading': state.heading, 211 | 'elevation': state.elevation, 212 | 'adj_loc_list': adj_loc_list, 213 | 'instr_id': item['instr_id'] 214 | } 215 | if include_feature: 216 | idx = self.states_map[state.scan_id].index(state.viewpoint_id) 217 | feature = self.state_embedding[state.scan_id][idx] 218 | feature_with_loc = self._build_feature_embedding(state.view_index, 219 | feature) 220 | action_embedding = self._build_action_embedding(adj_loc_list, feature) 221 | ob['feature'] = [feature_with_loc] 222 | ob['action_embedding'] = action_embedding 223 | if include_teacher: 224 | ob['goal'] = item['path'][-1] 225 | if step is not None and (step + 1) < len(item['path']): 226 | ob['teacher'] = item['path'][step + 1] 227 | else: 228 | ob['teacher'] = item['path'][-1] 229 | ob['teacher_action'] = self._shortest_path_action(state, adj_loc_list, 230 | ob['teacher']) 231 | obs.append(ob) 232 | return obs 233 | 234 | def reset(self, next_batch, step=None): 235 | ''' Load a new mini-batch and return the initial observation''' 236 | self.batch = next_batch 237 | scan_ids = [item['scan'] for item in next_batch] 238 | viewpoint_ids = [item['path'][0] for item in next_batch] 239 | headings = [item['heading'] for item in next_batch] 240 | self.world_states = self.env.get_start_state(scan_ids, viewpoint_ids, 241 | headings) 242 | obs = self._observe(self.world_states, step=step) 243 | return obs 244 | 245 | def step(self, obs, actions, step=None): 246 | ''' Take one step from the current state 247 | :param obs: last observations 248 | :param actions: current actions 249 | :param step: step information for teacher 
action supervision 250 | :return: current observations, and "done" (finish or not) 251 | ''' 252 | attrs = [ob['adj_loc_list'] for ob in obs] 253 | self.world_states = self.env.make_actions(self.world_states, actions, 254 | attrs) 255 | obs = self._observe(self.world_states, step=step) 256 | done = (np.array(actions) == 0).astype(np.uint8) 257 | return obs, done 258 | 259 | def _paths_to_goals(self, obs, max_steps): 260 | all_obs = [[ob] for ob in obs] 261 | all_actions = [[] for _ in obs] 262 | ended = np.zeros(len(obs)) 263 | for t in range(max_steps): 264 | actions = [ob['teacher_action'] for ob in obs] 265 | for i, a in enumerate(actions): 266 | if not ended[i]: 267 | all_actions[i].append(a) 268 | obs, ended = self.step(obs, actions, step=t + 1) 269 | for i, ob in enumerate(obs): 270 | if not ended[i] and t < max_steps - 1: 271 | all_obs[i].append(ob) 272 | if ended.all(): 273 | break 274 | return all_obs, all_actions 275 | 276 | def gold_obs_actions_and_instructions(self, batch, max_steps=100): 277 | obs = self.reset(batch, step=0) 278 | path_obs, path_actions = self._paths_to_goals(obs, max_steps) 279 | encoded_instructions = [item['instr_encoding'] for item in batch] 280 | return path_obs, path_actions, encoded_instructions 281 | 282 | def length(self, scan, nodes): 283 | return float(np.sum([self.distances[scan][edge[0]][edge[1]] 284 | for edge in zip(nodes[:-1], nodes[1:])])) 285 | 286 | def get_mix(self, scan, prediction, reference): 287 | success = self.distances[scan][prediction[-1]][reference[-1]] < self.margin 288 | pad = [0] * (len(prediction) - 1) 289 | final = self.ndtw(scan, prediction, reference) * success \ 290 | + self.cls(scan, prediction, reference) 291 | return pad + [final] 292 | 293 | def get_ndtw(self, scan, prediction, reference): 294 | success = self.distances[scan][prediction[-1]][reference[-1]] < self.margin 295 | pad = [0] * (len(prediction) - 2) 296 | return pad + [self.ndtw(scan, prediction, reference) + success] 297 | 298 | def ndtw(self, scan, prediction, reference): 299 | dtw_matrix = np.inf * np.ones((len(prediction) + 1, len(reference) + 1)) 300 | dtw_matrix[0][0] = 0 301 | for i in range(1, len(prediction) + 1): 302 | for j in range(1, len(reference) + 1): 303 | best_previous_cost = min(dtw_matrix[i - 1][j], 304 | dtw_matrix[i][j - 1], 305 | dtw_matrix[i - 1][j - 1]) 306 | cost = self.distances[scan][prediction[i - 1]][reference[j - 1]] 307 | dtw_matrix[i][j] = cost + best_previous_cost 308 | dtw = dtw_matrix[len(prediction)][len(reference)] 309 | ndtw = np.exp(-dtw / (self.margin * len(reference))) 310 | return ndtw 311 | 312 | def get_cls(self, scan, prediction, reference): 313 | success = self.distances[scan][reference[-1]][prediction[-1]] < self.margin 314 | pad = [0] * (len(prediction) - 2) 315 | return pad + [self.cls(scan, prediction, reference) + success] 316 | 317 | def cls(self, scan, prediction, reference): 318 | coverage = np.mean([np.exp( 319 | -np.min([self.distances[scan][u][v] for v in prediction]) / self.margin 320 | ) for u in reference]) 321 | expected = coverage * self.length(scan, reference) 322 | score = expected \ 323 | / (expected + np.abs(expected - self.length(scan, prediction))) 324 | return coverage * score 325 | 326 | def get_dis(self, scan, prediction, reference): 327 | goal = reference[-1] 328 | success = self.distances[scan][goal][prediction[-1]] < self.margin 329 | dis = [(self.distances[scan][goal][prediction[i]] 330 | - self.distances[scan][goal][prediction[i + 1]]) / self.shrink 331 | for i in 
range(len(prediction) - 1)] 332 | return dis[:-1] + [success] 333 | -------------------------------------------------------------------------------- /simulator/envs/envs_utils.py: -------------------------------------------------------------------------------- 1 | ''' Utils for the environments ''' 2 | 3 | import sys 4 | import json 5 | import numpy as np 6 | import networkx as nx 7 | import base64 8 | 9 | 10 | def load_nav_graphs(scans): 11 | ''' Load connectivity graph for each scan ''' 12 | 13 | def distance(pose1, pose2): 14 | ''' Euclidean distance between two graph poses ''' 15 | return ((pose1['pose'][3] - pose2['pose'][3]) ** 2 16 | + (pose1['pose'][7] - pose2['pose'][7]) ** 2 17 | + (pose1['pose'][11] - pose2['pose'][11]) ** 2) ** 0.5 18 | 19 | graphs = {} 20 | for scan in scans: 21 | with open('simulator/connectivity/%s_connectivity.json' % scan) as f: 22 | G = nx.Graph() 23 | positions = {} 24 | data = json.load(f) 25 | for i, item in enumerate(data): 26 | if item['included']: 27 | for j, conn in enumerate(item['unobstructed']): 28 | if conn and data[j]['included']: 29 | positions[item['image_id']] = np.array([item['pose'][3], 30 | item['pose'][7], 31 | item['pose'][11]]) 32 | assert data[j]['unobstructed'][i], 'Graph should be undirected' 33 | G.add_edge(item['image_id'], data[j]['image_id'], 34 | weight=distance(item, data[j])) 35 | nx.set_node_attributes(G, values=positions, name='position') 36 | graphs[scan] = G 37 | return graphs 38 | 39 | 40 | def decode_base64(string): 41 | if sys.version_info[0] == 2: 42 | return base64.decodestring(bytearray(string)) 43 | elif sys.version_info[0] == 3: 44 | return base64.decodebytes(bytearray(string, 'utf-8')) 45 | else: 46 | raise ValueError("decode_base64 can't handle python version {}".format( 47 | sys.version_info[0])) 48 | 49 | 50 | def structured_map(function, *args, **kwargs): 51 | nested = kwargs.get('nested', False) 52 | acc = [] 53 | for t in zip(*args): 54 | if nested: 55 | mapped = [function(*inner_t) for inner_t in zip(*t)] 56 | else: 57 | mapped = function(*t) 58 | acc.append(mapped) 59 | return acc 60 | -------------------------------------------------------------------------------- /simulator/envs/image_feature.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import paths 4 | import csv 5 | 6 | from collections import defaultdict 7 | from envs_utils import decode_base64 8 | 9 | csv.field_size_limit(sys.maxsize) 10 | 11 | 12 | class ImageFeatures(object): 13 | NUM_VIEWS = 36 14 | MEAN_POOLED_DIM = 2048 15 | IMAGE_W = 640 16 | IMAGE_H = 480 17 | VFOV = 60 18 | 19 | @staticmethod 20 | def from_args(args): 21 | for image_feature_type in sorted(args.image_feature_type): 22 | assert image_feature_type == "mean_pooled" 23 | return [MeanPooledImageFeatures(args.image_feature_datasets)] 24 | 25 | @staticmethod 26 | def add_args(args): 27 | args.add_argument("--num_views", type=int, 28 | default=ImageFeatures.NUM_VIEWS) 29 | args.add_argument("--mean_pooled_dim", type=int, 30 | default=ImageFeatures.MEAN_POOLED_DIM) 31 | args.add_argument("--image_feature_type", nargs="+", 32 | default=["mean_pooled"]) 33 | args.add_argument("--image_feature_datasets", nargs="+", 34 | default=["imagenet"]) 35 | 36 | def get_features(self, state): 37 | raise NotImplementedError("get_features") 38 | 39 | 40 | class MeanPooledImageFeatures(ImageFeatures): 41 | def __init__(self, image_feature_datasets): 42 | image_feature_datasets = sorted(image_feature_datasets) 43 | 
self.image_feature_datasets = image_feature_datasets 44 | self.mean_pooled_feature_stores = [ 45 | paths.MEAN_POOLED_FEATURE_STORE_PATHS[dataset] 46 | for dataset in image_feature_datasets] 47 | self.feature_dim = MeanPooledImageFeatures.MEAN_POOLED_DIM \ 48 | * len(image_feature_datasets) 49 | print('Loading image features from %s' 50 | % ', '.join(self.mean_pooled_feature_stores)) 51 | tsv_fieldnames = ['scanId', 'viewpointId', 'image_w', 'image_h', 'vfov', 52 | 'features'] 53 | self.features = defaultdict(list) 54 | for mpfs in self.mean_pooled_feature_stores: 55 | with open(mpfs, "rt") as tsv_in_file: 56 | reader = csv.DictReader(tsv_in_file, delimiter='\t', 57 | fieldnames=tsv_fieldnames) 58 | for item in reader: 59 | assert int(item['image_h']) == ImageFeatures.IMAGE_H 60 | assert int(item['image_w']) == ImageFeatures.IMAGE_W 61 | assert int(item['vfov']) == ImageFeatures.VFOV 62 | long_id = self._make_id(item['scanId'], item['viewpointId']) 63 | features = np.frombuffer(decode_base64(item['features']), 64 | dtype=np.float32) 65 | features = features.reshape((ImageFeatures.NUM_VIEWS, 66 | ImageFeatures.MEAN_POOLED_DIM)) 67 | self.features[long_id].append(features) 68 | assert all( 69 | len(feats) == len(self.mean_pooled_feature_stores) 70 | for feats in self.features.values() 71 | ) 72 | self.features = { 73 | long_id: np.concatenate(feats, axis=1) 74 | for long_id, feats in self.features.items() 75 | } 76 | 77 | def _make_id(self, scan_id, viewpoint_id): 78 | return scan_id + '_' + viewpoint_id 79 | 80 | def get_features(self, state): 81 | long_id = self._make_id(state['scan_id'], state['viewpoint_id']) 82 | return self.features[long_id] 83 | -------------------------------------------------------------------------------- /simulator/envs/paths.py: -------------------------------------------------------------------------------- 1 | MEAN_POOLED_FEATURE_STORE_PATHS = { 2 | 'imagenet': 'simulator/resnet_feature/ResNet-152-imagenet.tsv', 3 | } 4 | ADJ_LIST_FILE = 'simulator/total_adj_list.json' 5 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sha-Lab/babywalk/4265340d5e521b59030bc50ef71f01b0d3d3ba63/src/__init__.py -------------------------------------------------------------------------------- /src/eval_follower.py: -------------------------------------------------------------------------------- 1 | ''' Evaluation of agent trajectories ''' 2 | 3 | from collections import defaultdict 4 | import numpy as np 5 | import copy 6 | from collections import namedtuple 7 | 8 | EvalResult = namedtuple("EvalResult", "nav_error, oracle_error, " 9 | "trajectory_steps, trajectory_length, " 10 | "sr, osr, spl, cls, ndtw, sdtw") 11 | 12 | 13 | class FollowerEvaluation(object): 14 | ''' Results submission format: 15 | [{'instr_id': string, 16 | 'trajectory':[viewpoint_id]}] ''' 17 | 18 | def __init__(self, env, data): 19 | self.margin = 3.0 20 | self.gt = {} 21 | self.instr_ids = [] 22 | for item in data: 23 | if item['path_id'] not in self.gt: 24 | self.gt[item['path_id']] = copy.deepcopy(item) 25 | self.gt[item['path_id']]['instructions'] = [item['instructions']] 26 | else: 27 | self.gt[item['path_id']]['instructions'].append(item['instructions']) 28 | self.instr_ids.append(item['instr_id']) 29 | self.instr_ids = set(self.instr_ids) 30 | self.distances = env.distances 31 | self.env = env 32 | 33 | def _get_nearest(self, scan, goal_id, 
path): 34 | near_id = path[0] 35 | near_d = self.distances[scan][near_id][goal_id] 36 | for item in path: 37 | d = self.distances[scan][item][goal_id] 38 | if d < near_d: 39 | near_id = item 40 | near_d = d 41 | return near_id 42 | 43 | def _score_item(self, gt, path): 44 | ''' Calculate error based on the final position in trajectory, and also 45 | the closest position (oracle stopping rule). ''' 46 | goal = gt['path'][-1] 47 | final_position = path[-1] 48 | nearest_position = self._get_nearest(gt['scan'], goal, path) 49 | dis = self.distances[gt['scan']][path[0]][goal] 50 | nav_error = self.distances[gt['scan']][final_position][goal] 51 | oracle_error = self.distances[gt['scan']][nearest_position][goal] 52 | trajectory_steps = len(path) - 1 53 | trajectory_length = self.env.length(gt['scan'], path) 54 | sr = nav_error < self.margin 55 | osr = oracle_error < self.margin 56 | spl = sr * dis / max(trajectory_length, dis) if dis > 0 else sr 57 | cls = self.env.cls(gt['scan'], path, gt['path']) 58 | ndtw = self.env.ndtw(gt['scan'], path, gt['path']) 59 | sdtw = ndtw * sr 60 | return EvalResult(nav_error=nav_error, oracle_error=oracle_error, 61 | trajectory_steps=trajectory_steps, 62 | trajectory_length=trajectory_length, 63 | sr=sr, osr=osr, spl=spl, cls=cls, ndtw=ndtw, sdtw=sdtw) 64 | 65 | def score_results(self, results, update_results=False): 66 | ''' 67 | evaluation on different metrics 68 | :param results: results should be a dictionary mapping instr_ids to 69 | dictionaries, with each dictionary containing (at least) 70 | a 'trajectory' field 71 | :param update_results: update the result dictionary for saving result files 72 | :return: 73 | ''' 74 | self.scores = defaultdict(list) 75 | model_scores = [] 76 | instr_ids = set(self.instr_ids) 77 | 78 | instr_count = 0 79 | for instr_id, result in results.items(): 80 | if instr_id in instr_ids: 81 | instr_count += 1 82 | instr_ids.remove(instr_id) 83 | 84 | gt = self.gt[int(instr_id.split('_')[0])] 85 | eval_result = self._score_item(gt, result['trajectory']) 86 | self.scores['nav_errors'].append(eval_result.nav_error) 87 | self.scores['oracle_errors'].append(eval_result.oracle_error) 88 | self.scores['trajectory_steps'].append(eval_result.trajectory_steps) 89 | self.scores['trajectory_lengths'].append(eval_result.trajectory_length) 90 | self.scores['sr'].append(eval_result.sr) 91 | self.scores['cls'].append(eval_result.cls) 92 | self.scores['osr'].append(eval_result.osr) 93 | self.scores['spl'].append(eval_result.spl) 94 | self.scores['ndtw'].append(eval_result.ndtw) 95 | self.scores['sdtw'].append(eval_result.sdtw) 96 | if 'score' in result: 97 | model_scores.append(result['score']) 98 | if update_results: 99 | result['nav_errors'] = eval_result.nav_error 100 | result['oracle_errors'] = eval_result.oracle_error 101 | result['trajectory_steps'] = eval_result.trajectory_steps 102 | result['trajectory_lengths'] = eval_result.trajectory_length 103 | result['sr'] = eval_result.sr 104 | result['osr'] = eval_result.osr 105 | result['spl'] = eval_result.spl 106 | result['cls'] = eval_result.cls 107 | result['ndtw'] = eval_result.ndtw 108 | result['sdtw'] = eval_result.sdtw 109 | result['expert_trajectory'] = gt['path'] 110 | result['distance'] = gt['distance'] 111 | result['scan'] = gt['scan'] 112 | result['instruction'] = \ 113 | gt['instructions'][int(instr_id.split('_')[1])] 114 | 115 | score_summary = { 116 | 'nav_error': np.average(self.scores['nav_errors']), 117 | 'oracle_error': np.average(self.scores['oracle_errors']), 118 | 'steps': 
np.average(self.scores['trajectory_steps']), 119 | 'lengths': np.average(self.scores['trajectory_lengths']), 120 | 'cls': np.average(self.scores['cls']), 121 | 'sr': float(sum(self.scores['sr']) / len(self.scores['sr'])), 122 | 'osr': float(sum(self.scores['osr']) / len(self.scores['osr'])), 123 | 'spl': float(sum(self.scores['spl']) / len(self.scores['spl'])), 124 | 'ndtw': float(sum(self.scores['ndtw']) / len(self.scores['ndtw'])), 125 | 'sdtw': float(sum(self.scores['sdtw']) / len(self.scores['sdtw'])), 126 | } 127 | if len(model_scores) > 0: 128 | score_summary['model_score'] = np.average(model_scores) 129 | if update_results: 130 | score_summary['sr_std'] = np.std(self.scores['sr']) 131 | score_summary['cls_std'] = np.std(self.scores['cls']) 132 | score_summary['spl_std'] = np.std(self.scores['spl']) 133 | 134 | return score_summary 135 | -------------------------------------------------------------------------------- /src/params.py: -------------------------------------------------------------------------------- 1 | RESULT_DIR = 'follower/results/' 2 | SNAPSHOT_DIR = 'follower/snapshots/' 3 | PLOT_DIR = 'follower/plots/' 4 | SUMMARY_DIR = 'follower/summary/' 5 | FOLLOWER_PATH = None 6 | SPEAKER_PATH = None 7 | ACTION_EMBEDDING_SIZE = 2048 + 128 8 | HIDDEN_SIZE = 512 9 | WEIGHT_DECAY = 0.00005 10 | FEATURE_SIZE = 2048 + 128 11 | LOG_EVERY = 500 12 | SAVE_EVERY = 5000 13 | 14 | 15 | def add_general_args(parser): 16 | # data 17 | parser.add_argument("--split_postfix", type=str, default='', 18 | help="The postfix of datasets, " 19 | "for landmark datasets it should be '_landmark', " 20 | "otherwise it should be ''") 21 | parser.add_argument("--add_augment", action='store_true') 22 | parser.add_argument("--augment_data", type=str, 23 | default='literal_speaker_data_augmentation', 24 | help="The augmentation dataset, " 25 | "only useful if --add_augment is on") 26 | parser.add_argument("--task_name", type=str, default='R2R') 27 | 28 | # learning algorithm 29 | parser.add_argument("--reward", action='store_true', 30 | help="Use RL if on") 31 | parser.add_argument("--curriculum_rl", action='store_true', 32 | help="Use CRL if on, set --reward on first") 33 | parser.add_argument("--count_curriculum", type=int, default=0, 34 | help="Set the start curriculum") 35 | parser.add_argument("--max_curriculum", type=int, default=4, 36 | help="Set the maximum curriculum") 37 | parser.add_argument("--curriculum_iters", type=int, default=10000, 38 | help="Set the # of iterations to increase curriculum") 39 | parser.add_argument("--learning_method", type=str, default="adam") 40 | parser.add_argument("--feedback_method", type=str, default="sample", 41 | help="Choose from teacher, argmax or sample") 42 | parser.add_argument("--il_mode", type=str, default=None, 43 | help="Choose from None, period_split or landmark_split") 44 | 45 | # learning params 46 | parser.add_argument("--n_iters", type=int, default=20000, 47 | help="Total training iterations") 48 | parser.add_argument("--batch_size", type=int, default=100, 49 | help="Choose carefully based on gpu memory, " 50 | "be small in large curriculum") 51 | parser.add_argument("--lr", type=float, default=0.0001) 52 | parser.add_argument("--max_ins_len", type=int, default=100, 53 | help="Max instruction length, " 54 | "for long sentences like in R8R, set it larger") 55 | parser.add_argument("--max_steps", type=int, default=10, 56 | help="Max step size, " 57 | "for long trajectories like in R8R, set it larger") 58 | parser.add_argument("--beam_size", type=int, 
default=8, 59 | help="Choose carefully based on gpu memory, " 60 | "be small in large curriculum") 61 | parser.add_argument("--action_embed_size", type=int, 62 | default=ACTION_EMBEDDING_SIZE) 63 | parser.add_argument("--feature_size", type=int, default=FEATURE_SIZE) 64 | parser.add_argument("--weight_decay", type=float, default=WEIGHT_DECAY) 65 | parser.add_argument("--hidden_size", type=int, default=HIDDEN_SIZE) 66 | 67 | # network 68 | parser.add_argument("--coground", action='store_true', 69 | help="Use cogrounding decoder if on") 70 | parser.add_argument("--wemb", type=int, default=300, 71 | help="Word embedding size") 72 | parser.add_argument("--dropout", type=float, default=0.5) 73 | parser.add_argument("--reward_type", type=str, default='dis', 74 | help="Choose from dis, cls, dtw and mix") 75 | parser.add_argument("--history", action='store_true', 76 | help="Use memory buffer if on") 77 | parser.add_argument("--exp_forget", type=float, default=0.5, 78 | help="Exponential forgetting ratio, for simplicity," 79 | "here -1 mean to use LSTM memory buffer") 80 | 81 | # load model 82 | parser.add_argument("--no_speaker", action='store_true', 83 | help="Use speaker to provide internal reward if on," 84 | "if not on, must provide the speaker prefix") 85 | parser.add_argument("--load_opt", action='store_true', 86 | help="When continue training, load previous optimizer") 87 | parser.add_argument("--speaker_prefix", type=str, default=SPEAKER_PATH) 88 | parser.add_argument("--follower_prefix", type=str, default=FOLLOWER_PATH) 89 | 90 | # save and log in training 91 | parser.add_argument("--no_save", action='store_true') 92 | parser.add_argument("--model_name", type=str, default="follower") 93 | parser.add_argument("--result_dir", default=RESULT_DIR) 94 | parser.add_argument("--snapshot_dir", default=SNAPSHOT_DIR) 95 | parser.add_argument("--plot_dir", default=PLOT_DIR) 96 | parser.add_argument("--summary_dir", default=SUMMARY_DIR) 97 | parser.add_argument("--log_every", type=int, default=LOG_EVERY) 98 | parser.add_argument("--save_every", type=int, default=SAVE_EVERY) 99 | 100 | # evaluation 101 | parser.add_argument("--use_test", action='store_true') 102 | parser.add_argument("--one_by_one", action='store_true', 103 | help="Evaluate one long instruction as " 104 | "a sequence of shorter instructions if on") 105 | parser.add_argument("--one_by_one_mode", type=str, default=None, 106 | help="Choose from splitting long instructions as " 107 | "period or landmark") 108 | -------------------------------------------------------------------------------- /src/process_data.py: -------------------------------------------------------------------------------- 1 | import math 2 | import copy 3 | import json 4 | from collections import defaultdict 5 | from src.vocab.vocab_path import TRAIN_VOCAB 6 | from src.vocab.tokenizer import Tokenizer, read_vocab 7 | 8 | 9 | def make_data(args): 10 | # determine splits 11 | if args.use_test: 12 | train_splits = [] 13 | val_splits = ['test'] 14 | elif args.task_name == 'R2T8': 15 | train_splits = [] 16 | val_splits = ['R2R_val_unseen', 'R4R_val_unseen', 'R6R_val_unseen', 17 | 'R8R_val_unseen'] 18 | elif args.task_name == 'R2R' or args.task_name == 'R4R': 19 | train_splits = ['train'] 20 | val_splits = ['val_seen', 'val_unseen'] 21 | else: 22 | train_splits = ['train'] 23 | val_splits = ['val_unseen'] 24 | 25 | if args.add_augment: 26 | train_splits.append(args.augment_data) 27 | vocab = read_vocab(TRAIN_VOCAB) 28 | tok = Tokenizer(vocab=vocab) 29 | 30 | # get datasets 
from file 31 | train_data = load_task_datasets(train_splits, args.task_name, 32 | postfix=args.split_postfix, 33 | tokenizer=tok, 34 | one_by_one_mode=args.one_by_one_mode) 35 | val_data = load_task_datasets(val_splits, args.task_name, 36 | postfix=args.split_postfix, 37 | tokenizer=tok, 38 | one_by_one_mode=args.one_by_one_mode) 39 | 40 | # split for training 41 | if len(train_data) > 0: 42 | assert len(train_data['train']) >= args.batch_size, \ 43 | "data not enough for one batch, reduce the batch size" 44 | if args.curriculum_rl: 45 | if args.one_by_one_mode == 'period': 46 | train_data = period_split_curriculum(train_data, tok, args.history) 47 | elif args.one_by_one_mode == 'landmark': 48 | train_data = landmark_split_curriculum(train_data, tok, args.history) 49 | else: 50 | raise ValueError("Error! One by one mode is not implemented.") 51 | elif args.il_mode is not None: 52 | if args.il_mode == 'period_split': 53 | train_data, counter = period_split(train_data, tok, 0, args.history) 54 | val_data, _ = period_split(val_data, tok, counter, args.history) \ 55 | if not args.one_by_one else (val_data, None) 56 | elif args.il_mode == 'landmark_split': 57 | train_data, counter = landmark_split(train_data, tok, 0, args.history) 58 | val_data, _ = landmark_split(val_data, tok, counter, args.history) \ 59 | if not args.one_by_one else (val_data, None) 60 | else: 61 | raise ValueError("Error! Training mode not available.") 62 | 63 | # make it together for evaluator 64 | train_tag = '-'.join(train_splits) 65 | train_data = merge_data(train_data) 66 | all_val_data = merge_data(val_data) if args.one_by_one_mode != 'landmark' \ 67 | else merge_data_landmark(val_data) 68 | 69 | # make single data splitted sentence by sentence for evaluation 70 | if args.one_by_one: 71 | if args.one_by_one_mode == 'period': 72 | val_data = period_split_curriculum(val_data, tok, args.history, 73 | use_test=args.use_test) 74 | elif args.one_by_one_mode == 'landmark': 75 | val_data = landmark_split_curriculum(val_data, tok, args.history, 76 | use_test=args.use_test) 77 | else: 78 | print("Error! 
Not implemented one by one mode!") 79 | exit(0) 80 | val_data = {tag: sorted(data, key=lambda x: len(x)) 81 | for tag, data in val_data.items()} 82 | return train_data, val_data, all_val_data, vocab, tok, train_tag 83 | 84 | 85 | def merge_data(data): 86 | total_val = [] 87 | for tag, d in data.items(): 88 | total_val += d 89 | return total_val 90 | 91 | 92 | def merge_data_landmark(data): 93 | total_val = [] 94 | for tag, data in data.items(): 95 | for d in data: 96 | new_d = dict(d) 97 | new_d['path'] = [d['path'][0][0]] 98 | for i in range(len(d['path'])): 99 | new_d['path'].extend(d['path'][i][1:]) 100 | new_d['instructions'] = ' '.join(new_d['instructions']) 101 | total_val.append(new_d) 102 | return total_val 103 | 104 | 105 | def load_dataset(split, task, postfix): 106 | data = [] 107 | with open('tasks/%s/data/%s_%s%s.json' % (task, task, split, postfix)) as f: 108 | data += json.load(f) 109 | print("Load dataset %s_%s%s" % (task, split, postfix)) 110 | return data 111 | 112 | 113 | def load_task_datasets(splits, task, postfix='', tokenizer=None, 114 | one_by_one_mode=None): 115 | dataset = {} 116 | id_list = defaultdict(lambda: 0) 117 | for split in splits: 118 | data = [] 119 | for item in load_dataset(split, task, postfix): 120 | if one_by_one_mode == "landmark": 121 | new_item = dict(item) 122 | new_item['instr_id'] = '%s_%d' \ 123 | % (item['path_id'], id_list[item['path_id']]) 124 | id_list[item['path_id']] += 1 125 | data.append(new_item) 126 | else: 127 | instructions = item['instructions'] 128 | for j, instr in enumerate(instructions): 129 | new_item = dict(item) 130 | new_item['instr_id'] = '%s_%d' % (item['path_id'], j) 131 | new_item['instructions'] = instr 132 | if tokenizer: 133 | new_item['instr_encoding'], new_item[ 134 | 'instr_length'] = tokenizer.encode_sentence(instr) 135 | data.append(new_item) 136 | dataset[split] = data 137 | return dataset 138 | 139 | 140 | def add_history_to_data(data, history_heading, history_path, history_instr, 141 | history_instr_encoding): 142 | data['history_heading'] = copy.deepcopy(history_heading) 143 | data['history_path'] = copy.deepcopy(history_path) 144 | data['history_instr'] = copy.deepcopy(history_instr) 145 | data['history_instr_encoding'] = copy.deepcopy(history_instr_encoding) 146 | history_heading.append(data['heading']) 147 | history_path.append(data['path']) 148 | history_instr.append(data['instructions']) 149 | history_instr_encoding.append(data['instr_encoding']) 150 | 151 | 152 | def period_split(datasets, tok, counter, history): 153 | splited = {} 154 | for tag, data in datasets.items(): 155 | new_data = [] 156 | for d in data: 157 | history_heading, history_path = [], [] 158 | history_instr, history_instr_encoding = [], [] 159 | ins = d['instructions'] 160 | ins_splited = ins.split('.') 161 | if not ins_splited[0]: 162 | ins_splited = ins_splited[1:-1] 163 | else: 164 | ins_splited = ins_splited[:-1] 165 | ratio = 0 166 | last_path_split_point = 1 167 | ins_len = sum([len(ins_sp.split()) for ins_sp in ins_splited]) 168 | for i, ins_sp in enumerate(ins_splited): 169 | ratio += len(ins_sp.split()) / ins_len 170 | if ratio > 1: 171 | ratio = 1 172 | path_split_point = math.ceil(len(d['path']) * ratio) 173 | 174 | new_d = copy.deepcopy(d) 175 | new_d['path_id'] = counter 176 | new_d['instr_id'] = str(counter) + '_0' 177 | new_d['path'] = d['path'][last_path_split_point - 1:path_split_point] 178 | new_d['instructions'] = ins_sp + '.' 
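        # The slice above gives this sentence a piece of the path proportional to its
        # share of the instruction's words: `ratio` accumulates the word-count fraction
        # and path_split_point = ceil(len(path) * ratio), so consecutive sub-paths
        # overlap only at their boundary viewpoint and together cover the whole trajectory.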
179 | new_d['instr_encoding'], new_d['instr_length'] = tok.encode_sentence( 180 | ins_sp + '.') 181 | new_d['heading'] = d['headings'][last_path_split_point - 1] 182 | new_d['remain_split'] = len(ins_splited) - i 183 | if history: 184 | add_history_to_data(new_d, history_heading, history_path, 185 | history_instr, history_instr_encoding) 186 | 187 | last_path_split_point = path_split_point 188 | new_data.append(new_d) 189 | counter += 1 190 | assert new_d['path'][-1] == d['path'][-1] 191 | splited[tag] = new_data 192 | return splited, counter 193 | 194 | 195 | def landmark_split(datasets, tok, counter, history): 196 | splited = {} 197 | for tag, data in datasets.items(): 198 | new_data = [] 199 | for d in data: 200 | history_heading, history_path = [], [] 201 | history_instr, history_instr_encoding = [], [] 202 | for i, ins in enumerate(d['instructions']): 203 | new_d = copy.deepcopy(d) 204 | new_d['path_id'] = counter 205 | new_d['instr_id'] = str(counter) + '_0' 206 | new_d['path'] = d['path'][i] 207 | new_d['instructions'] = ins 208 | new_d['instr_encoding'], new_d['instr_length'] = tok.encode_sentence( 209 | ins) 210 | new_d['heading'] = float(d['headings'][i]) 211 | new_d['remain_split'] = len(d['instructions']) - i 212 | if history: 213 | add_history_to_data(new_d, history_heading, history_path, 214 | history_instr, history_instr_encoding) 215 | 216 | new_data.append(new_d) 217 | counter += 1 218 | splited[tag] = new_data 219 | return splited, counter 220 | 221 | 222 | def period_split_curriculum(datasets, tok, history, use_test=False): 223 | splited = {} 224 | for tag, data in datasets.items(): 225 | new_data = [] 226 | for d in data: 227 | history_heading, history_path = [], [] 228 | history_instr, history_instr_encoding = [], [] 229 | new_d_list = [] 230 | ins = d['instructions'] 231 | ins_splited = ins.split('.') 232 | if not ins_splited[0]: 233 | ins_splited = ins_splited[1:-1] 234 | else: 235 | ins_splited = ins_splited[:-1] 236 | ratio = 0 237 | last_path_split_point = 1 238 | ins_len = sum([len(ins_sp.split()) for ins_sp in ins_splited]) 239 | for i, ins_sp in enumerate(ins_splited): 240 | ratio += len(ins_sp.split()) / ins_len 241 | if ratio > 1: 242 | ratio = 1 243 | path_split_point = math.ceil(len(d['path']) * ratio) 244 | 245 | new_d = copy.deepcopy(d) 246 | new_d['instructions'] = ins_sp + '.' 
247 | new_d['instr_encoding'], new_d['instr_length'] = \ 248 | tok.encode_sentence(ins_sp + '.') 249 | if use_test: 250 | new_d['path'] = d['path'] * 2 251 | new_d['heading'] = float(d['heading']) 252 | else: 253 | new_d['path'] = d['path'][last_path_split_point - 1:path_split_point] 254 | new_d['heading'] = d['headings'][last_path_split_point - 1] 255 | if history: 256 | add_history_to_data(new_d, history_heading, history_path, 257 | history_instr, history_instr_encoding) 258 | 259 | last_path_split_point = path_split_point 260 | new_d_list.append(new_d) 261 | assert new_d['path'][-1] == d['path'][-1] 262 | new_data.append(new_d_list) 263 | splited[tag] = new_data 264 | return splited 265 | 266 | 267 | def landmark_split_curriculum(datasets, tok, history, use_test=False): 268 | splited = {} 269 | for tag, data in datasets.items(): 270 | new_data = [] 271 | for d in data: 272 | history_heading, history_path = [], [] 273 | history_instr, history_instr_encoding = [], [] 274 | new_d_list = [] 275 | for i, ins in enumerate(d['instructions']): 276 | new_d = copy.deepcopy(d) 277 | new_d['instructions'] = ins 278 | new_d['instr_encoding'], new_d['instr_length'] = \ 279 | tok.encode_sentence(ins) 280 | if use_test: 281 | new_d['path'] = d['path'] * 2 282 | new_d['heading'] = float(d['heading']) 283 | else: 284 | new_d['path'] = d['path'][i] 285 | new_d['heading'] = float(d['headings'][i]) 286 | if history: 287 | add_history_to_data(new_d, history_heading, history_path, 288 | history_instr, history_instr_encoding) 289 | 290 | new_d_list.append(new_d) 291 | new_data.append(new_d_list) 292 | splited[tag] = new_data 293 | return splited 294 | -------------------------------------------------------------------------------- /src/speaker.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | import numpy as np 4 | import itertools 5 | import torch 6 | import copy 7 | import torch.nn.functional as F 8 | import torch.distributions as D 9 | 10 | from src.vocab.tokenizer import VOCAB_PAD_IDX, VOCAB_EOS_IDX 11 | from src.utils import batch_instructions_from_encoded, \ 12 | batch_observations_and_actions 13 | from model.cuda import try_cuda 14 | 15 | 16 | class Seq2SeqSpeaker(object): 17 | def __init__(self, env, results_path, args, encoder, decoder, tok): 18 | self.env = env 19 | self.tok = tok 20 | self.results_path = results_path 21 | self.results = {} 22 | self.encoder = encoder 23 | self.decoder = decoder 24 | self.max_instruction_length = args.max_ins_len 25 | 26 | def write_results(self): 27 | with open(self.results_path, 'w') as f: 28 | json.dump(self.results, f) 29 | 30 | def _score_obs_actions_and_instructions(self, path_obs, path_actions, 31 | encoded_instructions, feedback, 32 | train): 33 | batch_size = len(path_obs) 34 | instr_seq, _, _ = \ 35 | batch_instructions_from_encoded(encoded_instructions, 36 | self.max_instruction_length, cut=False) 37 | batched_image_features, batched_action_embeddings, path_mask, seq_len = \ 38 | batch_observations_and_actions(path_obs, path_actions, 39 | self.env.padding_feature, 40 | self.env.padding_action) 41 | 42 | ctx = self.encoder(batched_image_features, batched_action_embeddings, 43 | seq_len) 44 | h_t = try_cuda(torch.zeros(batch_size, ctx.size(-1))) 45 | c_t = try_cuda(torch.zeros(batch_size, ctx.size(-1))) 46 | ended = np.array([False] * batch_size) 47 | 48 | outputs = [{ 49 | 'instr_id': path_obs[i][0]['instr_id'], 50 | 'word_indices': [], 51 | 'scores': [] 52 | } for i in range(batch_size)] 
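    # Two decoding regimes follow: with train=True the gold instruction is teacher-forced
    # through the decoder in a single pass and scored with cross-entropy; otherwise the
    # decoder is unrolled one token at a time, picking the next input word via the
    # 'teacher', 'argmax' or 'sample' feedback rule and accumulating per-word
    # log-probabilities into each output's sequence score.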
53 | 54 | # Do a sequence rollout and calculate the loss 55 | loss = 0 56 | w_t = try_cuda(torch.from_numpy( 57 | np.full((batch_size, 1), self.tok.vocab_bos_idx, 58 | dtype='int64')).long()) 59 | 60 | if train: 61 | w_t = torch.cat([w_t, instr_seq], dim=1) 62 | logits, _, _ = self.decoder(w_t, ctx, path_mask, h_t, c_t) 63 | logits = logits.permute(0, 2, 1).contiguous() 64 | loss = F.cross_entropy( 65 | input=logits[:, :, :-1], # -1 for aligning 66 | target=instr_seq, # "1:" to ignore the word 67 | ignore_index=VOCAB_PAD_IDX 68 | ) 69 | else: 70 | sequence_scores = try_cuda(torch.zeros(batch_size)) 71 | for t in range(self.max_instruction_length): 72 | logit, h_t, c_t = self.decoder(w_t.view(-1, 1), ctx, path_mask, h_t, 73 | c_t) 74 | logit = logit.squeeze(1) 75 | 76 | logit[:, VOCAB_PAD_IDX] = -float('inf') 77 | target = instr_seq[:, t].contiguous() 78 | 79 | if torch.isnan(logit).sum(): 80 | print("Error: network produce nan result!") 81 | exit(0) 82 | 83 | # Determine next model inputs 84 | if feedback == 'teacher': 85 | w_t = target 86 | elif feedback == 'argmax': 87 | _, w_t = logit.max(1) 88 | w_t = w_t.detach() 89 | elif feedback == 'sample': 90 | probs = F.softmax(logit, dim=1) 91 | probs[:, VOCAB_PAD_IDX] = 0 92 | m = D.Categorical(probs) 93 | w_t = m.sample() 94 | else: 95 | sys.exit('Invalid feedback option') 96 | 97 | log_probs = F.log_softmax(logit, dim=1) 98 | word_scores = -F.nll_loss(log_probs, w_t, ignore_index=VOCAB_PAD_IDX, 99 | reduction='none') 100 | sequence_scores += word_scores 101 | loss += F.nll_loss(log_probs, target, ignore_index=VOCAB_PAD_IDX) 102 | 103 | for i in range(batch_size): 104 | word_idx = w_t[i].item() 105 | if not ended[i]: 106 | outputs[i]['word_indices'].append(int(word_idx)) 107 | outputs[i]['scores'].append(word_scores[i].item()) 108 | if word_idx == VOCAB_EOS_IDX: 109 | ended[i] = True 110 | 111 | # Early exit if all ended 112 | if ended.all(): 113 | break 114 | 115 | for i, item in enumerate(outputs): 116 | item['score'] = float(sequence_scores[i].item()) / len( 117 | item['word_indices']) 118 | item['words'] = self.tok.decode_sentence(item['word_indices'], 119 | break_on_eos=True, join=False) 120 | 121 | return outputs, loss 122 | 123 | def _rollout(self, batch, train=True): 124 | path_obs, path_actions, encoded_instructions = \ 125 | self.env.gold_obs_actions_and_instructions(batch) 126 | outputs, loss = \ 127 | self._score_obs_actions_and_instructions(path_obs, path_actions, 128 | encoded_instructions, 129 | self.feedback, train) 130 | return outputs 131 | 132 | def query(self, batch, follower_results, feedback='argmax', 133 | curriculum=False): 134 | self.feedback = feedback 135 | if not curriculum: 136 | next_batch = [copy.deepcopy(b) for b in batch] 137 | else: 138 | next_batch = [copy.deepcopy(b[-1]) for b in batch] 139 | for i, b in enumerate(next_batch): 140 | if 'history_path' in b and len(b['history_path']) > 0: 141 | b['path'] = follower_results[i]['trajectory'] 142 | b['heading'] = b['history_heading'][0] 143 | b['instr_encoding'] \ 144 | = list(itertools.chain.from_iterable(b['history_instr_encoding'])) \ 145 | + list(b['instr_encoding']) 146 | else: 147 | b['path'] = follower_results[i]['trajectory'] 148 | with torch.no_grad(): 149 | self.encoder.eval() 150 | self.decoder.eval() 151 | results = self._rollout(next_batch, train=False) 152 | return results 153 | 154 | def test(self, batch, feedback='argmax'): 155 | ''' Evaluate once on each instruction in the current environment ''' 156 | with torch.no_grad(): 157 | self.feedback = 
feedback 158 | self.encoder.eval() 159 | self.decoder.eval() 160 | self.results = {} 161 | 162 | count = 0 163 | looped = False 164 | while True: 165 | rollout_results = self._rollout(batch[count], train=False) 166 | count += 1 167 | 168 | for result in rollout_results: 169 | if result['instr_id'] in self.results: 170 | looped = True 171 | else: 172 | self.results[result['instr_id']] = result 173 | if looped or len(batch) == count: 174 | break 175 | return self.results 176 | 177 | def _encoder_and_decoder_paths(self, base_path): 178 | return base_path + "_enc", base_path + "_dec" 179 | 180 | def save(self, path): 181 | ''' Snapshot models ''' 182 | encoder_path, decoder_path = self._encoder_and_decoder_paths(path) 183 | torch.save(self.encoder.state_dict(), encoder_path) 184 | torch.save(self.decoder.state_dict(), decoder_path) 185 | 186 | def load(self, path, **kwargs): 187 | ''' Loads parameters (but not training state) ''' 188 | encoder_path, decoder_path = self._encoder_and_decoder_paths(path) 189 | self.encoder.load_my_state_dict(torch.load(encoder_path, **kwargs)) 190 | self.decoder.load_my_state_dict(torch.load(decoder_path, **kwargs)) 191 | -------------------------------------------------------------------------------- /src/train_follower.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import numpy as np 4 | import pandas as pd 5 | import argparse 6 | import math 7 | import sys 8 | 9 | sys.path.append('.') 10 | 11 | from torch import optim 12 | from collections import defaultdict 13 | from tensorboardX import SummaryWriter 14 | from src.vocab.vocab_path import GLOVE_PATH 15 | from src.vocab.tokenizer import VOCAB_PAD_IDX 16 | from src.utils import time_since, check_dir, make_batch, get_model_prefix, \ 17 | make_data_and_env, run 18 | from src.params import add_general_args 19 | from simulator.envs.image_feature import ImageFeatures 20 | from model.speaker_lstm import SpeakerEncoderLSTM, SpeakerDecoderLSTM 21 | from src.speaker import Seq2SeqSpeaker 22 | from model.follower_coattend import EncoderLSTM, AttnDecoderLSTM 23 | from model.follower_coground import CogroundDecoderLSTM 24 | from model.cuda import try_cuda 25 | from src.follower import Seq2SeqFollower 26 | from src.eval_follower import FollowerEvaluation 27 | 28 | 29 | def train(args, agent, train_data, val_data, evaluator, speaker, train_tag): 30 | def make_path(dir, n_iter): 31 | return os.path.join(dir, '%s_%s_iter_%d' % ( 32 | get_model_prefix(args.model_name, args.feedback_method), 33 | train_tag, n_iter)) 34 | 35 | # check result directories 36 | task_prefix = os.path.join('tasks', args.task_name) 37 | result_dir = os.path.join(task_prefix, args.result_dir) 38 | snapshot_dir = os.path.join(task_prefix, args.snapshot_dir) 39 | plot_dir = os.path.join(task_prefix, args.plot_dir) 40 | summary_dir = os.path.join(task_prefix, args.summary_dir, 41 | time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())) 42 | check_dir([result_dir, snapshot_dir, plot_dir, summary_dir]) 43 | 44 | # initialize 45 | data_log = defaultdict(list) 46 | n_iters = args.n_iters 47 | log_every = args.log_every 48 | best_metrics = {} 49 | last_model_saved = {} 50 | writer = SummaryWriter(log_dir=summary_dir) 51 | train_ix = 0 52 | 53 | print('Training with %s feedback' % args.feedback_method) 54 | start = time.time() 55 | for idx in range(0, n_iters, log_every): 56 | interval = min(log_every, n_iters - idx) 57 | iter = idx + interval 58 | data_log['iteration'].append(iter) 59 | 60 | # 
make mini-batch 61 | if args.curriculum_rl: 62 | train_batch, train_ix = make_batch(train_data, train_ix, interval, 63 | args.batch_size, sort_instr_len=False) 64 | else: 65 | train_batch, train_ix = make_batch(train_data, train_ix, interval, 66 | args.batch_size, sort_instr_len=True) 67 | 68 | # train 69 | if args.curriculum_rl: 70 | assert args.reward == True 71 | agent.train_crl(train_batch, interval, speaker, 72 | curriculum=args.count_curriculum, history=args.history, 73 | reward_type=args.reward_type, exp_forget=args.exp_forget, 74 | beam_size=args.beam_size, feedback=args.feedback_method) 75 | elif args.reward: 76 | agent.train_reward(train_batch, interval, speaker, 77 | history=args.history, reward_type=args.reward_type, 78 | exp_forget=args.exp_forget, beam_size=args.beam_size, 79 | feedback=args.feedback_method) 80 | else: 81 | agent.train(train_batch, interval, 82 | history=args.history, exp_forget=args.exp_forget, 83 | feedback=args.feedback_method) 84 | 85 | # output loss / reward 86 | train_loss_avg = np.array(agent.losses).mean() 87 | data_log['train loss'].append(train_loss_avg) 88 | loss_str = 'train loss: %.4f' % train_loss_avg 89 | writer.add_scalar('data/train_loss', train_loss_avg, iter) 90 | 91 | if args.reward: 92 | int_reward_avg = np.array(agent.int_rewards).mean() 93 | data_log['int_reward'].append(int_reward_avg) 94 | loss_str += ', internal reward: %.4f' % int_reward_avg 95 | writer.add_scalar('data/int_reward', int_reward_avg, iter) 96 | 97 | ext_reward_avg = np.array(agent.ext_rewards).mean() 98 | data_log['ext_reward'].append(ext_reward_avg) 99 | loss_str += ', external reward: %.4f' % ext_reward_avg 100 | writer.add_scalar('data/ext_reward', ext_reward_avg, iter) 101 | 102 | # run validation 103 | save_log = [] 104 | for tag, d in val_data.items(): 105 | it = math.ceil(len(d) / args.batch_size) 106 | test_batch, _ = make_batch(d, 0, it, args.batch_size, 107 | shuffle=False, sort_instr_len=False) 108 | agent.test(test_batch, history=args.history, one_by_one=args.one_by_one, 109 | exp_forget=args.exp_forget) 110 | agent.results_path = make_path(result_dir, iter) + '_' + tag + '.json' 111 | 112 | # evaluate results 113 | print("evaluating on {}".format(tag)) 114 | score_summary = evaluator.score_results(agent.results) 115 | 116 | loss_str += '\n%s' % (tag) 117 | for metric, val in sorted(score_summary.items()): 118 | data_log['%s %s' % (tag, metric)].append(val) 119 | writer.add_scalar('data/' + tag + '_' + metric, val, iter) 120 | if metric in ['sr', 'cls', 'sdtw']: 121 | print("%s on %s: %.3f" % (metric, tag, val)) 122 | 123 | # save model 124 | key = (tag, metric) 125 | if key not in best_metrics or best_metrics[key] < val: 126 | best_metrics[key] = val 127 | if not args.no_save: 128 | model_path = make_path(snapshot_dir, iter) \ 129 | + "_%s-%s=%.3f" % (tag, metric, val) 130 | save_log.append("new best, saved model to %s" % model_path) 131 | agent.save(model_path) 132 | if key in last_model_saved: 133 | for old_model_path in \ 134 | agent.encoder_and_decoder_paths(last_model_saved[key]): 135 | os.remove(old_model_path) 136 | last_model_saved[key] = model_path 137 | loss_str += ', %s: %.3f' % (metric, val) 138 | 139 | # report training process 140 | print(('%s (%d %d%%) %s' 141 | % (time_since(start, float(iter) / n_iters), iter, 142 | float(iter) / n_iters * 100, loss_str))) 143 | for s in save_log: 144 | print(s) 145 | if not args.no_save: 146 | if args.save_every and iter % args.save_every == 0: 147 | agent.save(make_path(snapshot_dir, iter)) 148 | df = 
pd.DataFrame(data_log) 149 | df.set_index('iteration') 150 | df_path = '%s%s_%s_log.csv' \ 151 | % (plot_dir, 152 | get_model_prefix(args.model_name, args.feedback_method), 153 | train_tag) 154 | df.to_csv(df_path) 155 | 156 | # update curriculum 157 | if args.curriculum_rl \ 158 | and iter % args.curriculum_iters == 0 \ 159 | and args.count_curriculum < args.max_curriculum: 160 | args.count_curriculum += 1 161 | agent.reset_rav() 162 | agent.encoder_optimizer, agent.decoder_optimizer = \ 163 | reset_optimizer(args, agent.encoder, agent.decoder) 164 | 165 | 166 | def reset_optimizer(args, encoder, decoder): 167 | def filter_param(param_list): 168 | return [p for p in param_list if p.requires_grad] 169 | 170 | enc_para = encoder.parameters() 171 | dec_para = decoder.parameters() 172 | if args.learning_method == "adam": 173 | encoder_optimizer = optim.Adam(filter_param(enc_para), lr=args.lr, 174 | weight_decay=args.weight_decay) 175 | decoder_optimizer = optim.Adam(filter_param(dec_para), lr=args.lr, 176 | weight_decay=args.weight_decay) 177 | elif args.learning_method == "sgd": 178 | encoder_optimizer = optim.SGD(filter_param(enc_para), lr=args.lr, 179 | momentum=0.9, nesterov=True, 180 | weight_decay=args.weight_decay) 181 | decoder_optimizer = optim.SGD(filter_param(dec_para), lr=args.lr, 182 | momentum=0.9, nesterov=True, 183 | weight_decay=args.weight_decay) 184 | elif args.learning_method == "rms": 185 | encoder_optimizer = optim.RMSprop(filter_param(enc_para), lr=args.lr, 186 | weight_decay=args.weight_decay) 187 | decoder_optimizer = optim.RMSprop(filter_param(dec_para), lr=args.lr, 188 | weight_decay=args.weight_decay) 189 | else: 190 | raise ValueError("Error! Learning method not correct") 191 | 192 | return encoder_optimizer, decoder_optimizer 193 | 194 | 195 | def make_speaker_models(args, vocab_size, env, tok): 196 | glove = np.load(GLOVE_PATH) 197 | encoder = SpeakerEncoderLSTM(args.feature_size, args.hidden_size, 198 | args.dropout) 199 | decoder = SpeakerDecoderLSTM(vocab_size, args.wemb, VOCAB_PAD_IDX, 200 | args.hidden_size, args.dropout, glove=glove) 201 | encoder = try_cuda(encoder) 202 | decoder = try_cuda(decoder) 203 | 204 | agent = Seq2SeqSpeaker(env, "", args, encoder, decoder, tok) 205 | return agent 206 | 207 | 208 | def make_follower_models(args, vocab_size, all_val_data, env): 209 | glove = np.load(GLOVE_PATH) 210 | encoder = EncoderLSTM(vocab_size, args.wemb, args.hidden_size, VOCAB_PAD_IDX, 211 | args.dropout, glove=glove) 212 | if args.coground: 213 | decoder = CogroundDecoderLSTM(args.action_embed_size, args.hidden_size, 214 | args.dropout, args.feature_size, 215 | args.max_ins_len, history=args.history) 216 | else: 217 | decoder = AttnDecoderLSTM(args.action_embed_size, args.hidden_size, 218 | args.dropout, args.feature_size, 219 | history=args.history, 220 | lstm_mem=args.exp_forget < 0) 221 | 222 | encoder = try_cuda(encoder) 223 | decoder = try_cuda(decoder) 224 | encoder_optimizer, decoder_optimizer = \ 225 | reset_optimizer(args, encoder, decoder) 226 | 227 | agent = Seq2SeqFollower(env, "", args, encoder, decoder, encoder_optimizer, 228 | decoder_optimizer) 229 | evaluator = FollowerEvaluation(env, all_val_data) 230 | return agent, evaluator 231 | 232 | 233 | def train_setup(args): 234 | ''' Load data, setup environment and setup agent ''' 235 | train_data, val_data, all_val_data, env, vocab, tok, train_tag = \ 236 | make_data_and_env(args) 237 | agent, evaluator = make_follower_models(args, len(vocab), all_val_data, env) 238 | if args.reward and not 
args.no_speaker: 239 | speaker = make_speaker_models(args, len(vocab), env, tok) 240 | speaker.load(args.speaker_prefix, **{}) 241 | print("Load speaker model %s" % args.speaker_prefix) 242 | else: 243 | speaker = None 244 | if args.follower_prefix is not None: 245 | agent.load(args.follower_prefix, args.load_opt, **{}) 246 | print("Load follower model %s" % args.follower_prefix) 247 | return agent, train_data, val_data, evaluator, speaker, train_tag 248 | 249 | 250 | def train_val(args): 251 | ''' Train on the training set, and validate on validation (seen/unseen) set. ''' 252 | agent, train_data, val_data, evaluator, speaker, train_tag = \ 253 | train_setup(args) 254 | train(args, agent, train_data, val_data, evaluator, speaker, train_tag) 255 | 256 | 257 | def make_arg_parser(): 258 | parser = argparse.ArgumentParser() 259 | ImageFeatures.add_args(parser) 260 | add_general_args(parser) 261 | return parser 262 | 263 | 264 | if __name__ == "__main__": 265 | run(make_arg_parser(), train_val) 266 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | ''' Utils for training and evaluation ''' 2 | 3 | import os 4 | import json 5 | import time 6 | import math 7 | import random 8 | import torch 9 | import numpy as np 10 | 11 | from model.cuda import try_cuda 12 | from src.process_data import make_data 13 | from src.vocab.tokenizer import VOCAB_PAD_IDX, VOCAB_EOS_IDX 14 | from simulator.envs.image_feature import ImageFeatures 15 | from simulator.envs.env import RoomEnv 16 | from simulator.envs.paths import ADJ_LIST_FILE 17 | 18 | 19 | def random_seed(): 20 | torch.manual_seed(1) 21 | torch.cuda.manual_seed(1) 22 | 23 | 24 | def batch_observations_and_actions(path_obs, path_actions, padding_feature, 25 | padding_action): 26 | batch_size = len(path_obs) 27 | seq_lengths = np.array([len(a) for a in path_actions]) 28 | max_path_length = seq_lengths.max() 29 | mask = np.ones((batch_size, max_path_length), np.uint8) 30 | image_features = [[] for _ in range(batch_size)] 31 | action_embeddings = [[] for _ in range(batch_size)] 32 | for i in range(batch_size): 33 | assert len(path_obs[i]) == len(path_actions[i]) 34 | mask[i, :len(path_actions[i])] = 0 35 | image_features[i] = [ob['feature'][0] for ob in path_obs[i]] 36 | action_embeddings[i] = [ob['action_embedding'][path_actions[i][j]] 37 | for j, ob in enumerate(path_obs[i])] 38 | image_features[i].extend([padding_feature] 39 | * (max_path_length - len(path_actions[i]))) 40 | action_embeddings[i].extend([padding_action] 41 | * (max_path_length - len(path_actions[i]))) 42 | image_features[i] = torch.stack(image_features[i], dim=0) 43 | action_embeddings[i] = torch.stack(action_embeddings[i], dim=0) 44 | batched_image_features = torch.stack(image_features, dim=0) 45 | batched_action_embeddings = torch.stack(action_embeddings, dim=0) 46 | mask = try_cuda(torch.from_numpy(mask).byte()) 47 | return batched_image_features, batched_action_embeddings, mask, seq_lengths 48 | 49 | 50 | def batch_instructions_from_encoded(encoded_instructions, max_length, 51 | reverse=False, cut=True): 52 | num_instructions = len(encoded_instructions) 53 | seq_tensor = np.full((num_instructions, max_length), VOCAB_PAD_IDX) 54 | seq_lengths = [] 55 | for i, inst in enumerate(encoded_instructions): 56 | if len(inst) > 0 and inst[-1] == VOCAB_EOS_IDX: 57 | inst = inst[:-1] 58 | if reverse: 59 | inst = inst[::-1] 60 | inst = np.concatenate((inst, 
[VOCAB_EOS_IDX])) 61 | inst = inst[:max_length] 62 | seq_tensor[i, :len(inst)] = inst 63 | seq_lengths.append(len(inst)) 64 | 65 | if cut: 66 | seq_tensor = torch.from_numpy(seq_tensor)[:, :max(seq_lengths)] 67 | mask = (seq_tensor == VOCAB_PAD_IDX)[:, :max(seq_lengths)] 68 | else: 69 | seq_tensor = torch.from_numpy(seq_tensor) 70 | mask = (seq_tensor == VOCAB_PAD_IDX) 71 | 72 | return try_cuda(seq_tensor.long()), try_cuda(mask.byte()), seq_lengths 73 | 74 | 75 | def make_data_and_env(args): 76 | # make data 77 | train_data, val_data, all_val_data, vocab, tok, train_tag = make_data(args) 78 | 79 | # make env 80 | random_seed() 81 | image_features_list = ImageFeatures.from_args(args) 82 | paths, states_map, distances = RoomEnv.load_graphs() 83 | state_embedding = RoomEnv.make_state_embeddings(args, states_map, 84 | image_features_list) 85 | loc_embeddings = [RoomEnv.build_viewpoint_loc_embedding(args, viewIndex) 86 | for viewIndex in range(args.num_views)] 87 | adj_dict = RoomEnv.load_adj_feature(ADJ_LIST_FILE) 88 | env = RoomEnv(args, paths, states_map, distances, state_embedding, 89 | loc_embeddings, adj_dict) 90 | return train_data, val_data, all_val_data, env, vocab, tok, train_tag 91 | 92 | 93 | def make_batch(data, ix, n_iter, batch_size, shuffle=True, 94 | sort_instr_len=True): 95 | batches = [] 96 | new_ix = ix 97 | for i in range(n_iter): 98 | batch = data[new_ix:new_ix + batch_size] 99 | if len(batch) < batch_size: 100 | random.shuffle(data) if shuffle else None 101 | new_ix = batch_size - len(batch) 102 | batch += data[:new_ix] 103 | else: 104 | new_ix += batch_size 105 | if sort_instr_len: 106 | batch = sorted(batch, key=lambda item: item['instr_length'], 107 | reverse=True) 108 | batches.append(batch) 109 | return batches, new_ix 110 | 111 | 112 | def get_model_prefix(model_name, feedback_method): 113 | model_prefix = '{}_{}'.format(model_name, feedback_method) 114 | return model_prefix 115 | 116 | 117 | def pretty_json_dump(obj, fp): 118 | json.dump(obj, fp, sort_keys=True, indent=4, separators=(',', ':')) 119 | 120 | 121 | def as_minutes(s): 122 | m = math.floor(s / 60) 123 | s -= m * 60 124 | return '%dm %ds' % (m, s) 125 | 126 | 127 | def time_since(since, percent): 128 | now = time.time() 129 | s = now - since 130 | es = s / (percent) 131 | rs = es - s 132 | return '%s (- %s)' % (as_minutes(s), as_minutes(rs)) 133 | 134 | 135 | def run(arg_parser, entry_function): 136 | arg_parser.add_argument("--pdb", action='store_true') 137 | arg_parser.add_argument("--ipdb", action='store_true') 138 | arg_parser.add_argument("--no_cuda", action='store_true') 139 | 140 | args = arg_parser.parse_args() 141 | 142 | import torch.cuda 143 | torch.cuda.disabled = args.no_cuda 144 | 145 | if args.ipdb: 146 | import ipdb 147 | ipdb.runcall(entry_function, args) 148 | elif args.pdb: 149 | import pdb 150 | pdb.runcall(entry_function, args) 151 | else: 152 | entry_function(args) 153 | 154 | 155 | def check_dir(dir_list): 156 | for dir in dir_list: 157 | if not os.path.exists(dir): 158 | os.makedirs(dir) 159 | -------------------------------------------------------------------------------- /src/val_follower.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import numpy as np 4 | import argparse 5 | import math 6 | import sys 7 | 8 | sys.path.append('.') 9 | 10 | from src.params import add_general_args 11 | from src.utils import check_dir, make_batch, get_model_prefix, run 12 | from src.train_follower import train_setup 13 | 
from simulator.envs.image_feature import ImageFeatures 14 | 15 | 16 | def val(args, agent, val_data, evaluator): 17 | task_prefix = os.path.join('tasks', args.task_name) 18 | result_dir = os.path.join(task_prefix, args.result_dir) 19 | check_dir([result_dir]) 20 | 21 | def make_path(dir): 22 | return os.path.join(dir, '%s_%s' 23 | % (get_model_prefix(args.model_name, args.task_name), 24 | 'validation')) 25 | 26 | # run validation 27 | loss_str = '' 28 | ratio_number = [] 29 | metric_dict = { 30 | 'lengths': [], 'nav_error': [], 'sr': [], 'spl': [], 'cls': [], 'ndtw': [], 31 | 'sdtw': [], 'sr_std': [], 'spl_std': [], 'cls_std': [] 32 | } 33 | for tag, d in val_data.items(): 34 | ratio_number.append(len(d)) 35 | it = math.ceil(len(d) / args.batch_size) 36 | test_batch, _ = make_batch(d, 0, it, args.batch_size, 37 | shuffle=False, sort_instr_len=False) 38 | agent.test(test_batch, one_by_one=args.one_by_one, history=args.history, 39 | exp_forget=args.exp_forget) 40 | agent.results_path = make_path(result_dir) + '_' + tag + '.json' 41 | 42 | print("evaluating on {}".format(tag)) 43 | score_summary = evaluator.score_results(agent.results, update_results=True) 44 | loss_str += '\n%s' % (tag) 45 | for metric, val in sorted(score_summary.items()): 46 | if metric in metric_dict: 47 | metric_dict[metric].append(val) 48 | loss_str += ', %s: %.3f' % (metric, val) 49 | agent.write_results() 50 | print("PL: %.2f, NE: %.2f, SR: %.1f, SPL: %.1f, " 51 | "CLS: %.1f, NDTW: %.1f, SDTW %.1f" 52 | % (metric_dict['lengths'][-1], metric_dict['nav_error'][-1], 53 | metric_dict['sr'][-1] * 100, metric_dict['spl'][-1] * 100, 54 | metric_dict['cls'][-1] * 100, metric_dict['ndtw'][-1] * 100, 55 | metric_dict['sdtw'][-1] * 100)) 56 | 57 | print("Average\nPL: %.2f, NE: %.2f, SR: %.1f, SPL: %.1f, " 58 | "CLS: %.1f, NDTW: %.1f, SDTW %.1f" 59 | % (np.array(metric_dict['lengths']).mean(), 60 | np.array(metric_dict['nav_error']).mean(), 61 | np.array(metric_dict['sr']).mean() * 100, 62 | np.array(metric_dict['spl']).mean() * 100, 63 | np.array(metric_dict['cls']).mean() * 100, 64 | np.array(metric_dict['ndtw']).mean() * 100, 65 | np.array(metric_dict['sdtw']).mean() * 100)) 66 | print('%s' % (loss_str)) 67 | 68 | 69 | def test(args, agent, val_data): 70 | task_prefix = os.path.join('tasks', args.task_name) 71 | result_dir = os.path.join(task_prefix, args.result_dir) 72 | check_dir([result_dir]) 73 | 74 | def make_path(dir): 75 | return os.path.join(dir, '%s_%s' 76 | % (get_model_prefix(args.model_name, args.task_name), 77 | 'test')) 78 | 79 | # test 80 | for _, d in val_data.items(): 81 | it = math.ceil(len(d) / args.batch_size) 82 | test_batch, _ = make_batch(d, 0, it, args.batch_size, 83 | shuffle=False, sort_instr_len=False) 84 | agent.test(test_batch, one_by_one=args.one_by_one, history=args.history, 85 | exp_forget=args.exp_forget) 86 | agent.results_path = make_path(result_dir) + '.json' 87 | 88 | # reformat 89 | reformat_results = [] 90 | for id, r in agent.results.items(): 91 | reformat_results.append({ 92 | "instr_id": id, 93 | "trajectory": [[r["trajectory"][i]] + list(r["trajectory_radians"][i]) 94 | for i in range(len(r["trajectory"]))] 95 | }) 96 | agent.results = reformat_results 97 | agent.write_results() 98 | 99 | 100 | def train_val(args): 101 | ''' Validate on seen and unseen splits. 
''' 102 | follower, _, val_data, evaluator, _, _ = train_setup(args) 103 | if args.use_test: 104 | test(args, follower, val_data) 105 | else: 106 | val(args, follower, val_data, evaluator) 107 | 108 | 109 | def make_arg_parser(): 110 | parser = argparse.ArgumentParser() 111 | ImageFeatures.add_args(parser) 112 | add_general_args(parser) 113 | return parser 114 | 115 | 116 | if __name__ == "__main__": 117 | run(make_arg_parser(), train_val) 118 | -------------------------------------------------------------------------------- /src/vocab/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sha-Lab/babywalk/4265340d5e521b59030bc50ef71f01b0d3d3ba63/src/vocab/__init__.py -------------------------------------------------------------------------------- /src/vocab/tokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import sys 4 | import numpy as np 5 | 6 | # padding, unknown word, end of sentence, beginning of sentence 7 | BASE_VOCAB = ['<PAD>', '<UNK>', '<EOS>', '<BOS>'] 8 | VOCAB_PAD_IDX = BASE_VOCAB.index('<PAD>') 9 | VOCAB_UNK_IDX = BASE_VOCAB.index('<UNK>') 10 | VOCAB_EOS_IDX = BASE_VOCAB.index('<EOS>') 11 | 12 | 13 | def read_vocab(path): 14 | with open(path) as f: 15 | vocab = [word.strip() for word in f.readlines()] 16 | return vocab 17 | 18 | 19 | class Tokenizer(object): 20 | ''' Class to tokenize and encode a sentence. ''' 21 | # Split on any non-alphanumeric character 22 | SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)') 23 | 24 | def __init__(self, vocab=None, no_glove=False): 25 | self.vocab = vocab 26 | self.word_to_index = {} 27 | 28 | if no_glove: 29 | self.vocab_bos_idx = len(vocab) 30 | self.add_word('<BOS>', len(vocab)) 31 | else: 32 | self.vocab_bos_idx = BASE_VOCAB.index('<BOS>') 33 | if vocab: 34 | for i, word in enumerate(vocab): 35 | self.word_to_index[word] = i 36 | 37 | def add_word(self, word, place): 38 | assert word not in self.word_to_index 39 | self.vocab.insert(place, word) 40 | 41 | @staticmethod 42 | def split_sentence(sentence): 43 | ''' Break sentence into a list of words and punctuation ''' 44 | toks = [] 45 | for word in [s.strip().lower() for s in 46 | Tokenizer.SENTENCE_SPLIT_REGEX.split(sentence.strip()) 47 | if len(s.strip()) > 0]: 48 | # Break up any words containing punctuation only, e.g. '!?', unless it is multiple full stops e.g. '..' 49 | if all(c in string.punctuation for c in word) \ 50 | and not all(c in '.' 
for c in word): 51 | toks += list(word) 52 | else: 53 | toks.append(word) 54 | return toks 55 | 56 | def encode_sentence(self, sentence): 57 | if len(self.word_to_index) == 0: 58 | sys.exit('Tokenizer has no vocab') 59 | encoding = [] 60 | for word in Tokenizer.split_sentence(sentence): 61 | if word in self.word_to_index: 62 | encoding.append(self.word_to_index[word]) 63 | else: 64 | encoding.append(VOCAB_UNK_IDX) 65 | arr = np.array(encoding) 66 | return arr, len(encoding) 67 | 68 | def decode_sentence(self, encoding, break_on_eos=False, join=True): 69 | sentence = [] 70 | for ix in encoding: 71 | if ix == (VOCAB_EOS_IDX if break_on_eos else VOCAB_PAD_IDX): 72 | break 73 | else: 74 | sentence.append(self.vocab[ix]) 75 | if join: 76 | return " ".join(sentence) 77 | return sentence 78 | -------------------------------------------------------------------------------- /src/vocab/vocab_path.py: -------------------------------------------------------------------------------- 1 | SUBTRAIN_VOCAB = 'src/vocab/vocab_data/sub_train_vocab.txt' 2 | TRAIN_VOCAB = 'src/vocab/vocab_data/train_vocab.txt' 3 | TRAINVAL_VOCAB = 'src/vocab/vocab_data/trainval_vocab.txt' 4 | NOUN_TRAIN = 'src/vocab/vocab_data/train_noun.txt' 5 | NOUN_PHRASE_TRAIN = 'src/vocab/vocab_data/train_spacy_noun_phrase.txt' 6 | GLOVE_PATH = 'src/vocab/vocab_data/train_glove.npy' 7 | -------------------------------------------------------------------------------- /tasks/R2R/README.md: -------------------------------------------------------------------------------- 1 | # Room-to-Room (R2R) Navigation Task 2 | 3 | 4 | ## Download Data 5 | 6 | Data consists of train/val-seen/val-unseen/test splits. There are two validation sets to better understand generalization performance between buildings that are in the training set (val-seen) and unseen buildings. The test set consists entirely of unseen buildings. 7 | 8 | To download, from the top level directory, run: 9 | ``` 10 | ./tasks/R2R/data/download.sh 11 | ``` 12 | 13 | Data is formatted as follows: 14 | ``` 15 | { 16 | "distance": float, 17 | "scan": str, 18 | "path_id": int, 19 | "path": [str x num_steps], 20 | "heading": float, 21 | "instructions": [str x 3], 22 | } 23 | ``` 24 | - `distance`: length of the path in meters. 25 | - `scan`: Matterport scan id. 26 | - `path_id`: Unique id for this path. 27 | - `path`: List of viewpoint ids (the first is the start location, the last is the goal location) 28 | - `heading`: Agent's initial heading in radians (elevation is always assumed to be zero). 29 | - `instructions`: Three unique natural language strings describing how to find the goal given the start pose. 30 | 31 | For the test set, only the first path_id (starting location) is included. We will provide a test server for scoring uploaded trajectories according to the metrics in the [paper](https://arxiv.org/abs/1711.07280). 32 | 33 | ## Directory Structure 34 | 35 | - `env.py`: Wraps the simulator and adds language instructions, with several simplifications -- namely discretized heading / elevation and pre-cached image features. This is not intended to be a standard component, or to preclude the use of continuous camera actions, end-to-end training etc. Use the simulator and the data as you see fit, but this can provide a starting point. 36 | - `utils.py`: Text pre-processing, navigation graph loading etc. 37 | - `eval.py`: Evaluation script. 38 | - `model.py`: PyTorch seq2seq model with attention. 39 | - `agent.py`: Various implementations of an agent. 
40 | - `train.py`: Training entrypoint, parameter settings etc. 41 | - `plot.py`: Figures from the arXiv paper. 42 | 43 | ## Prerequisites 44 | 45 | Python 2, [PyTorch](http://pytorch.org/), [NetworkX](https://networkx.github.io/). Install python dependencies by running: 46 | ``` 47 | pip install -r tasks/R2R/requirements.txt 48 | ``` 49 | 50 | ## Training and Evaluation 51 | 52 | To train the seq2seq model with student-forcing: 53 | ``` 54 | python tasks/R2R/train.py 55 | ``` 56 | 57 | To run some simple baselines: 58 | ``` 59 | python tasks/R2R/eval.py 60 | ``` 61 | 62 | Generate figures from the paper: 63 | ``` 64 | python tasks/R2R/plot.py 65 | ``` 66 | 67 | The simple baselines include: 68 | - `ShortestAgent`: Agent that always follows the shortest path to goal (foundation for supervised training). 69 | - `RandomAgent`: Agent that randomly picks a direction, then tries to go straight for 5 viewpoints. 70 | - `StopAgent`: Agent that remains at the starting position. 71 | 72 | ![Navigation Error](plots/error.png) 73 | -------------------------------------------------------------------------------- /tasks/R2R/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sha-Lab/babywalk/4265340d5e521b59030bc50ef71f01b0d3d3ba63/tasks/R2R/__init__.py -------------------------------------------------------------------------------- /tasks/R2R/requirements.txt: -------------------------------------------------------------------------------- 1 | backports.functools-lru-cache==1.4 2 | cycler==0.10.0 3 | decorator==4.1.2 4 | matplotlib==2.1.0 5 | networkx==2.0 6 | numpy==1.13.3 7 | olefile==0.44 8 | pandas==0.21.0 9 | pillow>=6.2.2 10 | pyparsing==2.2.0 11 | python-dateutil==2.6.1 12 | pytz==2017.3 13 | pytorch=1.1.0 14 | PyYAML==5.1 15 | six==1.11.0 16 | subprocess32==3.2.7 17 | torch==0.2.0.post3 18 | torchvision==0.3.0 19 | -------------------------------------------------------------------------------- /tasks/R4R/README.md: -------------------------------------------------------------------------------- 1 | # R4R: Instruction and Path Composition for VLN 2 | 3 | [Room-to-Room](https://bringmeaspoon.org/) (R2R) is a pioneering dataset in 4 | visually-grounded natural language navigation with photo-realistic environments 5 | ([Anderson et al., 2018](https://arxiv.org/abs/1711.07280)). R2R consists of an 6 | environment and language instructions paired to reference paths. Due to the 7 | process by which the data are generated, all R2R reference paths are 8 | shortest-to-goal paths by construction. As such, they capture only a small 9 | subset of the richness of navigation. 10 | 11 | To address the lack of variety in path configurations, we propose a simple yet 12 | effective data augmentation strategy that increases the number of training 13 | examples and introduces paths that twist and turn, without additional human or 14 | low-fidelity machine annotations. Quite simply, the existing paths in the 15 | dataset can be extended by joining them with other paths that start within some 16 | threshold $d_{th}$ of where they end. We name this the Room-for-Room (R4R) dataset. 
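To make the joining rule concrete, here is a minimal sketch of the threshold check, under a few assumptions: it is run from `tasks/R4R`, a connectivity file for the chosen scan is available locally, and `first` and `second` stand for two R2R entries from that scan. It reuses `graph_utils.load` and the `weight3d` edge attribute defined in this directory, while `can_join` itself is only an illustrative helper, not part of the released tooling.

```
import networkx as nx

import graph_utils

# Load the navigation graph of one scan and precompute shortest-path
# distances in meters (graph_utils.load attaches 'weight3d' edge weights).
scan = '17DRP5sb8fy'  # any Matterport scan id
graph = graph_utils.load('path/to/connections/{}_connectivity.json'.format(scan))
distance = dict(nx.all_pairs_dijkstra_path_length(graph, weight='weight3d'))


def can_join(first, second, distance_threshold=3.0):
  """True if `second` may be appended to `first` under the R4R distance rule."""
  gap = distance[first['path'][-1]][second['path'][0]]
  return gap <= distance_threshold
```

`r4r_generate_data.py` (included below) applies this test to every ordered pair of paths within a scan, inserts the connecting shortest path between them, and takes the cross product of the two instruction sets.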
17 | 18 | For further details, see the accompanying paper: 19 | [Stay on the Path: Instruction Fidelity in Vision-and-Language Navigation](https://arxiv.org/abs/1905.12255) 20 | 21 | ## Documentation 22 | 23 | The R4R dataset is created by joining together paths in the R2R dataset, for 24 | which the first path ends within a thresholded distance from the start of the 25 | second. We do not distribute the original R2R data here, and instead provide 26 | code that constructs R4R from it. The original R2R data can be downloaded 27 | [here](https://niessner.github.io/Matterport/). 28 | 29 | Example usage: 30 | 31 | ``` 32 | python r4r_generate_data.py \ 33 | --input_file_path="path/to/R2R_train.json" \ 34 | --output_file_path="path/to/R4R_train.json" \ 35 | --connections_dir="path/to/connections" \ 36 | --distance_threshold="3.0" 37 | ``` 38 | 39 | Command line arguments for `r4r_generate_data.py`: 40 | 41 | * `--output_file_path`: Path to the R4R data JSON file you are generating. 42 | * `--input_file_path`: Path to the original R2R data JSON file, which can be 43 | downloaded 44 | [here](https://github.com/peteanderson80/Matterport3DSimulator/blob/master/tasks/R2R/data/download.sh). 45 | * `--connections_dir`: Path to a directory containing graph connectivity 46 | files, which can be downloaded 47 | [here](https://github.com/peteanderson80/Matterport3DSimulator/tree/master/connectivity). 48 | * `--distance_threshold`: The maximum shortest-path distance between the final 49 | node of the first path and the first node of the second path for the two paths 50 | to be joined. Conventionally this is 3.0 meters 51 | ([Anderson et al., 2018](https://arxiv.org/abs/1711.07280)). 52 | * `--heading_threshold`: The maximum absolute heading angle difference in 53 | radians between the final connection of the first path and the initial heading 54 | of the second path for the two paths to be joined. Conventionally this check 55 | is disabled. 56 | 57 | Running this script on the standard R2R training and validation data with a 58 | distance threshold of 3.0 meters and no heading threshold: 59 | 60 | ``` 61 | ### R2R_train.json 62 | 63 | ******Final Results******** 64 | Total instructions generated: 233613 65 | Average path distance (meters): 20.5901583255 66 | Average shortest path distance: 10.5022469844 67 | Average path length (steps): 12.0681064404 68 | Average shortest path length: 6.4874662553 69 | Total paths generated: 25930 70 | Total distance filtered paths: 381581 71 | Total heading filtered paths: 0 72 | 73 | ### R2R_val_seen.json 74 | 75 | ******Final Results******** 76 | Total instructions generated: 1035 77 | Average path distance (meters): 20.3605171182 78 | Average shortest path distance: 11.1137253455 79 | Average path length (steps): 12.2173913043 80 | Average shortest path length: 7.0 81 | Total paths generated: 115 82 | Total distance filtered paths: 2269 83 | Total heading filtered paths: 0 84 | 85 | ### R2R_val_unseen.json 86 | 87 | ******Final Results******** 88 | Total instructions generated: 45162 89 | Average path distance (meters): 20.222094624 90 | Average shortest path distance: 10.057187751 91 | Average path length (steps): 12.147070546 92 | Average shortest path length: 6.40294938222 93 | Total paths generated: 5018 94 | Total distance filtered paths: 63401 95 | Total heading filtered paths: 0 96 | ``` 97 | 98 | Note: this script requires NetworkX and was tested on version 2.3. 
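As a quick sanity check on a generated split, the short snippet below reloads the output JSON and recomputes a few of the summary statistics reported above. It is only a sketch: the field names (`instructions`, `distance`, `path`, `shortest_path_distance`) follow the records written by `r4r_generate_data.py`, and the file path is a placeholder for whatever you passed as `--output_file_path`.

```
import json

import numpy as np

with open('path/to/R4R_train.json') as f:
  data = json.load(f)

print('Total paths generated: {}'.format(len(data)))
print('Total instructions generated: {}'.format(
    sum(len(x['instructions']) for x in data)))
print('Average path distance (meters): {}'.format(
    np.mean([x['distance'] for x in data])))
print('Average shortest path distance: {}'.format(
    np.mean([x['shortest_path_distance'] for x in data])))
print('Average path length (steps): {}'.format(
    np.mean([len(x['path']) for x in data])))
```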
99 | 100 | ## Reference 101 | 102 | If you use or discuss this dataset in your work, please cite our paper: 103 | 104 | ``` 105 | @InProceedings{sotp2019acl, 106 | title = {{Stay on the Path: Instruction Fidelity in Vision-and-Language Navigation}}, 107 | author = {Jain, Vihan and Magalhaes, Gabriel and Ku, Alexander and Vaswani, Ashish and Ie, Eugene and Baldridge, Jason}, 108 | booktitle = {Proc. of ACL}, 109 | year = {2019} 110 | } 111 | ``` 112 | 113 | ## Contact 114 | 115 | If you have a technical question regarding the dataset or publication, please 116 | create an issue in this repository. 117 | -------------------------------------------------------------------------------- /tasks/R4R/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /tasks/R4R/cls.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Coverage weighted by length score (CLS). 17 | 18 | Link to the original paper: 19 | https://arxiv.org/abs/1905.12255 20 | """ 21 | 22 | from __future__ import print_function 23 | 24 | import networkx as nx 25 | import numpy as np 26 | 27 | 28 | class CLS(object): 29 | """Coverage weighted by length score (CLS). 30 | 31 | Python doctest: 32 | 33 | >>> cls = CLS(nx.grid_graph([3, 4])) 34 | >>> reference = [(0, 0), (1, 0), (1, 1), (2, 1), (2, 2), (3, 2)] 35 | >>> assert np.isclose(cls(reference, reference), 1.0) 36 | >>> prediction = [(0, 0), (0, 1), (1, 1), (2, 1), (3, 1), (3, 2)] 37 | >>> assert np.isclose(cls(reference, prediction), 0.81994915125863865) 38 | >>> prediction = [(0, 1), (1, 1), (2, 1), (3, 1)] 39 | >>> assert np.isclose(cls(reference, prediction), 0.44197196102702557) 40 | 41 | Link to the original paper: 42 | https://arxiv.org/abs/1905.12255 43 | """ 44 | 45 | def __init__(self, graph, weight='weight', threshold=3.0): 46 | """Initializes a CLS object. 47 | 48 | Args: 49 | graph: networkx graph for the environment. 50 | weight: networkx edge weight key (str). 51 | threshold: distance threshold $d_{th}$ (float). 
52 | """ 53 | self.graph = graph 54 | self.weight = weight 55 | self.threshold = threshold 56 | self.distance = dict( 57 | nx.all_pairs_dijkstra_path_length( 58 | self.graph, weight=self.weight)) 59 | 60 | def __call__(self, prediction, reference): 61 | """Computes the CLS metric. 62 | 63 | Args: 64 | prediction: list of nodes (str), path predicted by agent. 65 | reference: list of nodes (str), the ground truth path. 66 | 67 | Returns: 68 | the CLS between the prediction and reference path (float). 69 | """ 70 | 71 | def length(nodes): 72 | return np.sum([ 73 | self.graph.edges[edge].get(self.weight, 1.0) 74 | for edge in zip(nodes[:-1], nodes[1:]) 75 | ]) 76 | 77 | coverage = np.mean([ 78 | np.exp(-np.min([ # pylint: disable=g-complex-comprehension 79 | self.distance[u][v] for v in prediction 80 | ]) / self.threshold) for u in reference 81 | ]) 82 | expected = coverage * length(reference) 83 | score = expected / (expected + np.abs(expected - length(prediction))) 84 | return coverage * score -------------------------------------------------------------------------------- /tasks/R4R/dtw.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Dynamic Time Warping based evaluation metrics for VLN.""" 17 | 18 | from __future__ import print_function 19 | 20 | import networkx as nx 21 | import numpy as np 22 | 23 | 24 | class DTW(object): 25 | """Dynamic Time Warping (DTW) evaluation metrics. 26 | 27 | Python doctest: 28 | 29 | >>> graph = nx.grid_graph([3, 4]) 30 | >>> prediction = [(0, 0), (1, 0), (2, 0), (3, 0)] 31 | >>> reference = [(0, 0), (1, 0), (2, 1), (3, 2)] 32 | >>> dtw = DTW(graph) 33 | >>> assert np.isclose(dtw(prediction, reference, 'dtw'), 3.0) 34 | >>> assert np.isclose(dtw(prediction, reference, 'ndtw'), 0.77880078307140488) 35 | >>> assert np.isclose(dtw(prediction, reference, 'sdtw'), 0.77880078307140488) 36 | >>> assert np.isclose(dtw(prediction[:2], reference, 'sdtw'), 0.0) 37 | """ 38 | 39 | def __init__(self, graph, weight='weight', threshold=3.0): 40 | """Initializes a DTW object. 41 | 42 | Args: 43 | graph: networkx graph for the environment. 44 | weight: networkx edge weight key (str). 45 | threshold: distance threshold $d_{th}$ (float). 46 | """ 47 | self.graph = graph 48 | self.weight = weight 49 | self.threshold = threshold 50 | self.distance = dict( 51 | nx.all_pairs_dijkstra_path_length(self.graph, weight=self.weight)) 52 | 53 | def __call__(self, prediction, reference, metric='sdtw'): 54 | """Computes DTW metrics. 55 | 56 | Args: 57 | prediction: list of nodes (str), path predicted by agent. 58 | reference: list of nodes (str), the ground truth path. 59 | metric: one of ['ndtw', 'sdtw', 'dtw']. 60 | 61 | Returns: 62 | the DTW between the prediction and reference path (float). 
63 | """ 64 | assert metric in ['ndtw', 'sdtw', 'dtw'] 65 | 66 | dtw_matrix = np.inf * np.ones((len(prediction) + 1, len(reference) + 1)) 67 | dtw_matrix[0][0] = 0 68 | for i in range(1, len(prediction)+1): 69 | for j in range(1, len(reference)+1): 70 | best_previous_cost = min( 71 | dtw_matrix[i-1][j], dtw_matrix[i][j-1], dtw_matrix[i-1][j-1]) 72 | cost = self.distance[prediction[i-1]][reference[j-1]] 73 | dtw_matrix[i][j] = cost + best_previous_cost 74 | dtw = dtw_matrix[len(prediction)][len(reference)] 75 | 76 | if metric == 'dtw': 77 | return dtw 78 | 79 | ndtw = np.exp(-dtw/(self.threshold * len(reference))) 80 | if metric == 'ndtw': 81 | return ndtw 82 | 83 | success = self.distance[prediction[-1]][reference[-1]] <= self.threshold 84 | return success * ndtw -------------------------------------------------------------------------------- /tasks/R4R/graph_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Utils for loading and drawing graphs of the houses.""" 17 | 18 | from __future__ import print_function 19 | 20 | import json 21 | import matplotlib.pyplot as plt 22 | 23 | import networkx as nx 24 | import numpy as np 25 | from numpy.linalg import norm 26 | 27 | 28 | def load(connections_file): 29 | """Loads a networkx graph for a given scan. 30 | 31 | Args: 32 | connections_file: A string with the path to the .json file with the 33 | connectivity information. 34 | Returns: 35 | A networkx graph. 36 | """ 37 | with open(connections_file) as f: 38 | lines = json.load(f) 39 | nodes = np.array([x['image_id'] for x in lines]) 40 | matrix = np.array([x['unobstructed'] for x in lines]) 41 | mask = [x['included'] for x in lines] 42 | matrix = matrix[mask][:, mask] 43 | nodes = nodes[mask] 44 | pos2d = {x['image_id']: np.array(x['pose'])[[3, 7]] for x in lines} 45 | pos3d = {x['image_id']: np.array(x['pose'])[[3, 7, 11]] for x in lines} 46 | 47 | graph = nx.from_numpy_matrix(matrix) 48 | graph = nx.relabel.relabel_nodes(graph, dict(enumerate(nodes))) 49 | nx.set_node_attributes(graph, pos2d, 'pos2d') 50 | nx.set_node_attributes(graph, pos3d, 'pos3d') 51 | 52 | weight2d = {(u, v): norm(pos2d[u] - pos2d[v]) for u, v in graph.edges} 53 | weight3d = {(u, v): norm(pos3d[u] - pos3d[v]) for u, v in graph.edges} 54 | nx.set_edge_attributes(graph, weight2d, 'weight2d') 55 | nx.set_edge_attributes(graph, weight3d, 'weight3d') 56 | 57 | return graph 58 | 59 | 60 | def draw(graph, predicted_path, reference_path, output_filename, **kwargs): 61 | """Generates a plot showing the graph, predicted and reference paths. 62 | 63 | Args: 64 | graph: A networkx graph. 65 | predicted_path: A list with the ids of the nodes in the predicted path. 66 | reference_path: A list with the ids of the nodes in the reference path. 67 | output_filename: A string with the path where to store the generated image. 
68 | **kwargs: Key-word arguments for aesthetic control. 69 | """ 70 | plt.figure(figsize=(10, 10)) 71 | ax = plt.gca() 72 | pos = nx.get_node_attributes(graph, 'pos2d') 73 | 74 | # Zoom in. 75 | xs = [pos[node][0] for node in predicted_path + reference_path] 76 | ys = [pos[node][1] for node in predicted_path + reference_path] 77 | min_x, max_x, min_y, max_y = min(xs), max(xs), min(ys), max(ys) 78 | center_x, center_y = (min_x + max_x)/2, (min_y + max_y)/2 79 | zoom_margin = kwargs.get('zoom_margin', 1.3) 80 | max_range = zoom_margin * max(max_x - min_x, max_y - min_y) 81 | half_range = max_range / 2 82 | ax.set_xlim(center_x - half_range, center_x + half_range) 83 | ax.set_ylim(center_y - half_range, center_y + half_range) 84 | 85 | # Background graph. 86 | nx.draw(graph, 87 | pos, 88 | edge_color=kwargs.get('background_edge_color', 'lightgrey'), 89 | node_color=kwargs.get('background_node_color', 'lightgrey'), 90 | node_size=kwargs.get('background_node_size', 60), 91 | width=kwargs.get('background_edge_width', 0.5)) 92 | 93 | # Prediction graph. 94 | predicted_path_graph = nx.DiGraph() 95 | predicted_path_graph.add_nodes_from(predicted_path) 96 | predicted_path_graph.add_edges_from( 97 | zip(predicted_path[:-1], predicted_path[1:])) 98 | nx.draw(predicted_path_graph, 99 | pos, 100 | arrowsize=kwargs.get('prediction_arrowsize', 15), 101 | edge_color=kwargs.get('prediction_edge_color', 'red'), 102 | node_color=kwargs.get('prediction_node_color', 'red'), 103 | node_size=kwargs.get('prediction_node_size', 130), 104 | width=kwargs.get('prediction_edge_width', 2.0)) 105 | 106 | # Reference graph. 107 | reference_path_graph = nx.DiGraph() 108 | reference_path_graph.add_nodes_from(reference_path) 109 | reference_path_graph.add_edges_from( 110 | zip(reference_path[:-1], reference_path[1:])) 111 | nx.draw(reference_path_graph, 112 | pos, 113 | arrowsize=kwargs.get('reference_arrowsize', 15), 114 | edge_color=kwargs.get('reference_edge_color', 'dodgerblue'), 115 | node_color=kwargs.get('reference_node_color', 'dodgerblue'), 116 | node_size=kwargs.get('reference_node_size', 130), 117 | width=kwargs.get('reference_edge_width', 2.0)) 118 | 119 | # Intersection graph. 120 | intersection_path_graph = nx.DiGraph() 121 | common_nodes = set(predicted_path_graph.nodes.keys()).intersection( 122 | set(reference_path_graph.nodes.keys())) 123 | intersection_path_graph.add_nodes_from(common_nodes) 124 | common_edges = set(predicted_path_graph.edges.keys()).intersection( 125 | set(reference_path_graph.edges.keys())) 126 | intersection_path_graph.add_edges_from(common_edges) 127 | nx.draw(intersection_path_graph, 128 | pos, 129 | arrowsize=kwargs.get('intersection_arrowsize', 15), 130 | edge_color=kwargs.get('intersection_edge_color', 'limegreen'), 131 | node_color=kwargs.get('intersection_node_color', 'limegreen'), 132 | node_size=kwargs.get('intersection_node_size', 130), 133 | width=kwargs.get('intersection_edge_width', 2.0)) 134 | 135 | plt.savefig(output_filename) 136 | plt.close() 137 | -------------------------------------------------------------------------------- /tasks/R4R/r4r_generate_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Script to build R4R data from the original R2R data. 17 | 18 | Link to the original R2R: 19 | https://niessner.github.io/Matterport/ 20 | """ 21 | 22 | from __future__ import print_function 23 | 24 | import argparse 25 | import collections 26 | import json 27 | import os 28 | 29 | import graph_utils 30 | 31 | import networkx as nx 32 | import numpy as np 33 | 34 | 35 | def main(args): 36 | """Generate R4R data from the original R2R data. 37 | 38 | Args: 39 | args: argparse containing paths to input and output files. 40 | """ 41 | print('******Generating R4R Data********') 42 | print(' Distance threshold: {} meters'.format(args.distance_threshold)) 43 | print(' Heading threshold: {} radians'.format(args.heading_threshold)) 44 | 45 | def _connections_file_path(scan): 46 | return os.path.join( 47 | args.connections_dir, '{}_connectivity.json'.format(scan)) 48 | 49 | inputs = json.load(open(args.input_file_path)) 50 | outputs = list() 51 | filtered = collections.Counter() 52 | 53 | # Group by scan to save memory. 54 | scans = dict() 55 | for value in inputs: 56 | scan = value['scan'] 57 | if scan not in scans: 58 | scans[scan] = [] 59 | scans[scan].append(value) 60 | 61 | for scan, values in scans.items(): 62 | print('Loading graph for scan {}.'.format(scan)) 63 | graph = graph_utils.load(_connections_file_path(scan)) 64 | pos2d = nx.get_node_attributes(graph, 'pos2d') 65 | 66 | # Cache format: (node, (distance, path)) ((node obj, (dict, dict))) 67 | cache = dict(nx.all_pairs_dijkstra(graph, weight='weight3d')) 68 | shortest_distance = {k: v[0] for k, v in cache.items()} 69 | shortest_path = {k: v[1] for k, v in cache.items()} 70 | 71 | for first in values: 72 | for second in values: 73 | first_target = first['path'][-1] 74 | second_source = second['path'][0] 75 | 76 | # Compute the end-start distance (meters). 77 | distance = shortest_distance[first_target][second_source] 78 | 79 | # Compute the absolute end-start heading difference (radians). 80 | x, y = pos2d[first['path'][-1]] - pos2d[first['path'][-2]] 81 | heading = abs(second['heading'] - np.arctan2(y, x) % (2 * np.pi)) 82 | 83 | if (args.distance_threshold is not None 84 | and distance > args.distance_threshold): 85 | filtered['distance'] += 1 86 | elif (args.heading_threshold is not None 87 | and heading > args.heading_threshold): 88 | filtered['heading'] += 1 89 | else: 90 | value = dict() 91 | value['path'] = ( 92 | first['path'][:-1] 93 | + shortest_path[first_target][second_source] 94 | + second['path'][1:]) 95 | value['distance'] = ( 96 | first['distance'] 97 | + shortest_distance[first_target][second_source] 98 | + second['distance']) 99 | value['instructions'] = [ 100 | x + y # pylint: disable=g-complex-comprehension 101 | for x in first['instructions'] 102 | for y in second['instructions']] 103 | value['heading'] = first['heading'] 104 | value['path_id'] = len(outputs) 105 | value['scan'] = scan 106 | 107 | # Additional data. 
108 | path_source = first['path'][0] 109 | path_target = second['path'][-1] 110 | value['shortest_path_distance'] = cache[path_source][0][path_target] 111 | value['shortest_path'] = cache[path_source][1][path_target] 112 | value['first_path_id'] = first['path_id'] 113 | value['second_path_id'] = second['path_id'] 114 | 115 | outputs.append(value) 116 | 117 | with open(args.output_file_path, 'w') as f: 118 | json.dump(outputs, f, indent=2, sort_keys=True, separators=(',', ': ')) 119 | 120 | # Dataset summary metrics. 121 | tot_instructions = np.sum([len(x['instructions']) for x in outputs]) 122 | avg_distance = np.mean([x['distance'] for x in outputs]) 123 | avg_path_len = np.mean([len(x['path']) for x in outputs]) 124 | avg_sp_distance = np.mean([x['shortest_path_distance'] for x in outputs]) 125 | avg_sp_path_len = np.mean([len(x['shortest_path']) for x in outputs]) 126 | 127 | print('******Final Results********') 128 | print(' Total instructions generated: {}'.format(tot_instructions)) 129 | print(' Average path distance (meters): {}'.format(avg_distance)) 130 | print(' Average shortest path distance: {}'.format(avg_sp_distance)) 131 | print(' Average path length (steps): {}'.format(avg_path_len)) 132 | print(' Average shortest path length: {}'.format(avg_sp_path_len)) 133 | print(' Total paths generated: {}'.format(len(outputs))) 134 | print(' Total distance filtered paths: {}'.format(filtered['distance'])) 135 | print(' Total heading filtered paths: {}'.format(filtered['heading'])) 136 | 137 | 138 | if __name__ == '__main__': 139 | parser = argparse.ArgumentParser() 140 | parser.add_argument( 141 | '--connections_dir', 142 | dest='connections_dir', 143 | required=True, 144 | help='Path to the Matterport simulator connection data.') 145 | parser.add_argument( 146 | '--input_file_path', 147 | dest='input_file_path', 148 | required=True, 149 | help='Path to read the R2R input data.') 150 | parser.add_argument( 151 | '--output_file_path', 152 | dest='output_file_path', 153 | required=True, 154 | help='Path to write the R4R output data.') 155 | parser.add_argument( 156 | '--distance_threshold', 157 | dest='distance_threshold', 158 | required=False, 159 | nargs='?', 160 | const=3.0, 161 | type=float, 162 | help='Maximum end-start distance (meters) to join R2R paths.') 163 | parser.add_argument( 164 | '--heading_threshold', 165 | dest='heading_threshold', 166 | required=False, 167 | nargs='?', 168 | const=None, 169 | type=float, 170 | help='Maximum end-start heading difference (radians) to join R2R paths.') 171 | main(parser.parse_args()) -------------------------------------------------------------------------------- /tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sha-Lab/babywalk/4265340d5e521b59030bc50ef71f01b0d3d3ba63/tasks/__init__.py -------------------------------------------------------------------------------- /teaser/babywalk_curriculum.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sha-Lab/babywalk/4265340d5e521b59030bc50ef71f01b0d3d3ba63/teaser/babywalk_curriculum.jpg -------------------------------------------------------------------------------- /teaser/pytorch-logo-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sha-Lab/babywalk/4265340d5e521b59030bc50ef71f01b0d3d3ba63/teaser/pytorch-logo-dark.png 
--------------------------------------------------------------------------------