├── .gitignore
├── .gitmodules
├── LICENSE
├── README.md
├── download.sh
├── download_model.sh
├── model
│   ├── __init__.py
│   ├── attentions.py
│   ├── context_encoder.py
│   ├── cuda.py
│   ├── follower_coattend.py
│   ├── follower_coground.py
│   └── speaker_lstm.py
├── requirements.txt
├── simulator
│   ├── connectivity
│   │   ├── 17DRP5sb8fy_connectivity.json
│   │   ├── 1LXtFkjw3qL_connectivity.json
│   │   ├── 1pXnuDYAj8r_connectivity.json
│   │   ├── 29hnd4uzFmX_connectivity.json
│   │   ├── 2azQ1b91cZZ_connectivity.json
│   │   ├── 2n8kARJN3HM_connectivity.json
│   │   ├── 2t7WUuJeko7_connectivity.json
│   │   ├── 5LpN3gDmAk7_connectivity.json
│   │   ├── 5ZKStnWn8Zo_connectivity.json
│   │   ├── 5q7pvUzZiYa_connectivity.json
│   │   ├── 759xd9YjKW5_connectivity.json
│   │   ├── 7y3sRwLe3Va_connectivity.json
│   │   ├── 8194nk5LbLH_connectivity.json
│   │   ├── 82sE5b5pLXE_connectivity.json
│   │   ├── 8WUmhLawc2A_connectivity.json
│   │   ├── ARNzJeq3xxb_connectivity.json
│   │   ├── B6ByNegPMKs_connectivity.json
│   │   ├── D7G3Y4RVNrH_connectivity.json
│   │   ├── D7N2EKCX4Sj_connectivity.json
│   │   ├── E9uDoFAP3SH_connectivity.json
│   │   ├── EDJbREhghzL_connectivity.json
│   │   ├── EU6Fwq7SyZv_connectivity.json
│   │   ├── GdvgFV5R1Z5_connectivity.json
│   │   ├── HxpKQynjfin_connectivity.json
│   │   ├── JF19kD82Mey_connectivity.json
│   │   ├── JeFG25nYj2p_connectivity.json
│   │   ├── JmbYfDe2QKZ_connectivity.json
│   │   ├── PX4nDJXEHrG_connectivity.json
│   │   ├── Pm6F8kyY3z2_connectivity.json
│   │   ├── PuKPg4mmafe_connectivity.json
│   │   ├── QUCTc6BB5sX_connectivity.json
│   │   ├── README.md
│   │   ├── RPmz2sHmrrY_connectivity.json
│   │   ├── S9hNv5qa7GM_connectivity.json
│   │   ├── SN83YJsR3w2_connectivity.json
│   │   ├── TbHJrupSAjP_connectivity.json
│   │   ├── ULsKaCPVFJR_connectivity.json
│   │   ├── UwV83HsGsw3_connectivity.json
│   │   ├── Uxmj2M2itWa_connectivity.json
│   │   ├── V2XKFyX4ASd_connectivity.json
│   │   ├── VFuaQ6m2Qom_connectivity.json
│   │   ├── VLzqgDo317F_connectivity.json
│   │   ├── VVfe2KiqLaN_connectivity.json
│   │   ├── Vt2qJdWjCF2_connectivity.json
│   │   ├── Vvot9Ly1tCj_connectivity.json
│   │   ├── VzqfbhrpDEA_connectivity.json
│   │   ├── WYY7iVyf5p8_connectivity.json
│   │   ├── X7HyMhZNoso_connectivity.json
│   │   ├── XcA2TqTSSAj_connectivity.json
│   │   ├── YFuZgdQ5vWj_connectivity.json
│   │   ├── YVUC4YcDtcY_connectivity.json
│   │   ├── YmJkqBEsHnH_connectivity.json
│   │   ├── Z6MFQCViBuw_connectivity.json
│   │   ├── ZMojNkEp431_connectivity.json
│   │   ├── aayBHfsNo7d_connectivity.json
│   │   ├── ac26ZMwG7aT_connectivity.json
│   │   ├── b8cTxDM8gDG_connectivity.json
│   │   ├── cV4RVeZvu5T_connectivity.json
│   │   ├── dhjEzFoUFzH_connectivity.json
│   │   ├── e9zR4mvMWw7_connectivity.json
│   │   ├── fzynW3qQPVF_connectivity.json
│   │   ├── gTV8FGcVJC9_connectivity.json
│   │   ├── gYvKGZ5eRqb_connectivity.json
│   │   ├── gZ6f7yhEvPG_connectivity.json
│   │   ├── gxdoqLR6rwA_connectivity.json
│   │   ├── i5noydFURQK_connectivity.json
│   │   ├── jh4fc5c5qoQ_connectivity.json
│   │   ├── jtcxE69GiFV_connectivity.json
│   │   ├── kEZ7cmS4wCh_connectivity.json
│   │   ├── mJXqzFtmKg4_connectivity.json
│   │   ├── oLBMNvg9in8_connectivity.json
│   │   ├── p5wJjkQkbXX_connectivity.json
│   │   ├── pLe4wQe7qrG_connectivity.json
│   │   ├── pRbA3pwrgk9_connectivity.json
│   │   ├── pa4otMbVnkk_connectivity.json
│   │   ├── q9vSo1VnCiC_connectivity.json
│   │   ├── qoiz87JEwZ2_connectivity.json
│   │   ├── r1Q1Z4BcV1o_connectivity.json
│   │   ├── r47D5H71a5s_connectivity.json
│   │   ├── rPc6DW4iMge_connectivity.json
│   │   ├── rqfALeAoiTq_connectivity.json
│   │   ├── s8pcmisQ38h_connectivity.json
│   │   ├── sKLMLpTHeUy_connectivity.json
│   │   ├── sT4fr6TAbpF_connectivity.json
│   │   ├── scans.txt
│   │   ├── uNb9QFRL6hY_connectivity.json
│   │   ├── ur6pFq6Qu1A_connectivity.json
│   │   ├── vyrNrziPKCB_connectivity.json
│   │   ├── wc2JMjhGNzB_connectivity.json
│   │   ├── x8F5xyUWy9e_connectivity.json
│   │   ├── yqstnuAEVhm_connectivity.json
│   │   └── zsNo4HB9uLZ_connectivity.json
│   ├── envs
│   │   ├── __init__.py
│   │   ├── env.py
│   │   ├── envs_utils.py
│   │   ├── image_feature.py
│   │   └── paths.py
│   └── resnet_feature
│       └── ResNet-152-deploy.prototxt
├── src
│   ├── __init__.py
│   ├── eval_follower.py
│   ├── follower.py
│   ├── params.py
│   ├── process_data.py
│   ├── speaker.py
│   ├── train_follower.py
│   ├── utils.py
│   ├── val_follower.py
│   └── vocab
│       ├── __init__.py
│       ├── tokenizer.py
│       └── vocab_path.py
├── tasks
│   ├── R2R
│   │   ├── README.md
│   │   ├── __init__.py
│   │   └── requirements.txt
│   ├── R4R
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── cls.py
│   │   ├── dtw.py
│   │   ├── graph_utils.py
│   │   └── r4r_generate_data.py
│   └── __init__.py
└── teaser
    ├── babywalk_curriculum.jpg
    └── pytorch-logo-dark.png
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | /data
3 | /build
4 | /doxygen
5 | *.tsv
6 | /sim_imgs
7 | *.so
8 | *kdev4*
9 | *.caffemodel
10 | *.caffemodel.h5
11 | *.pyc
12 | *.out
13 | *.zip
14 | plot*
15 |
16 | simulator/*~
17 | simulator/data
18 | simulator/build*
19 | simulator/doxygen
20 | simulator/*.tsv
21 | simulator/sim_imgs
22 | simulator/*.so
23 | simulator/*kdev4*
24 | simulator/*.caffemodel
25 | simulator/*.caffemodel.h5
26 | simulator/*.pyc
27 | simulator/*.out
28 | simulator/*.zip
29 | simulator/envs/*.swp
30 |
31 | /tasks/R2R/data/*.json
32 | /tasks/R2R/plot*
33 | /tasks/R2R/results/
34 | /tasks/R2R/snapshots/
35 | /tasks/R2R/data/v1
36 | /tasks/R2R/*.swp
37 | /tasks/R2R/follower/
38 | /tasks/R2R/speaker/
39 | /tasks/R2R/follower_traj/
40 |
41 | /tasks/R4R/data/*.json
42 | /tasks/R4R/plot*
43 | /tasks/R4R/results/
44 | /tasks/R4R/snapshots/
45 | /tasks/R4R/data/v1
46 | /tasks/R4R/*.swp
47 | /tasks/R4R/follower/
48 | /tasks/R4R/speaker/
49 | /tasks/R4R/follower_traj/
50 |
51 | /model/*.swp
52 | # intellij
53 | .idea/
54 | *.iml
55 |
56 | img_features
57 | /run_script.sh
58 | /val_all.bash
59 | /val_all.sh
60 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "simulator/pybind11"]
2 | path = simulator/pybind11
3 | url = https://github.com/pybind/pybind11.git
4 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Sha-Lab
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BabyWalk: Going Farther in Vision-and-Language Navigation by Taking Baby Steps
2 | [License: MIT](https://opensource.org/licenses/MIT)
3 |
4 | This is the PyTorch implementation of our paper:
5 |
6 | **BabyWalk: Going Farther in Vision-and-Language Navigation by Taking Baby Steps**
7 | Wang Zhu*, Hexiang Hu*, Jiacheng Chen, Zhiwei Deng, Vihan Jain, Eugene Ie, Fei Sha
8 | 2020 Annual Conference of the Association for Computational Linguistics (ACL 2020)
9 |
10 | [[arXiv](http://arxiv.org/abs/2005.04625)] [[GitHub](https://github.com/Sha-Lab/babywalk)]
11 |
12 | ## Abstract
13 | Learning to follow instructions is of fundamental importance to autonomous agents for vision-and-language navigation (VLN). In this paper, we study how an agent can navigate long paths when learning from a corpus that consists of shorter ones. We show that existing state-of-the-art agents do not generalize well. To this end, we propose BabyWalk, a new VLN agent that learns to navigate by decomposing long instructions into shorter ones (BabySteps) and completing them sequentially. A specially designed memory buffer is used by the agent to turn its past experiences into contexts for future steps. The learning process is composed of two phases. In the first phase, the agent uses imitation learning from demonstrations to accomplish BabySteps. In the second phase, the agent uses curriculum-based reinforcement learning to maximize rewards on navigation tasks with increasingly longer instructions. We create two new benchmark datasets (of long navigation tasks) and use them in conjunction with existing ones to examine BabyWalk's generalization ability. Empirical results show that BabyWalk achieves state-of-the-art results on several metrics; in particular, it is able to follow long instructions better.
14 |
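The loop described above (decompose the instruction, execute BabySteps one by one, and summarize past experience as context) can be pictured with a short sketch. This is illustrative only; the function and method names below are hypothetical and do not correspond to the repository's API (see `src/follower.py` and `src/train_follower.py` for the actual implementation).

```python
# Illustrative sketch only: a BabyWalk-style rollout. All names are hypothetical.
def babywalk_rollout(agent, env, instruction, split_into_babysteps, summarize):
    """Follow a long instruction by completing its BabySteps sequentially."""
    memory = []                          # past (sub-instruction, trajectory) pairs
    state = env.reset()
    full_trajectory = []
    for babystep in split_into_babysteps(instruction):
        # Condition on the current BabyStep plus a summary of past experience.
        context = summarize(memory)      # e.g. an LSTM/attention readout of the buffer
        trajectory, state = agent.follow(babystep, state, context)
        memory.append((babystep, trajectory))
        full_trajectory.extend(trajectory)
    return full_trajectory
```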
15 |
16 |
17 | ## Installation
18 |
19 | 1. Install Python 3.7 (Anaconda recommended: https://www.anaconda.com/distribution/).
20 | 2. Install PyTorch following the instructions on https://pytorch.org/ (we used PyTorch 1.1.0 in our experiments).
21 | 3. Download this repository or clone with Git, and then enter the root directory of the repository:
22 | ```
23 | git clone https://github.com/Sha-Lab/babywalk
24 | cd babywalk
25 | ```
26 | 4. Check that the required packages listed in requirements.txt are installed.
27 | 5. Download and preprocess the data
28 | ```
29 | chmod +x download.sh
30 | ./download.sh
31 | ```
32 | After this step, check the following (a quick sanity check is sketched right after this list):
33 | + `simulator/resnet_feature/` should contain `ResNet-152-imagenet.tsv`.
34 | + `simulator` should contain `total_adj_list.json`, which replaces the Matterport3D simulator.
35 | + `src/vocab/vocab_data` should contain the vocabulary and its GloVe embedding files `train_vocab.txt` and `train_glove.npy`.
36 | + `tasks/` should contain `R2R`, `R4R`, `R6R`, `R8R`, and `R2T8`, each with a `data` folder containing the training/evaluation data.
37 |
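A quick, hypothetical helper (not part of the repository) to verify the layout listed above:

```python
# Hypothetical helper: verify the data layout produced by ./download.sh.
import os

expected = [
    "simulator/resnet_feature/ResNet-152-imagenet.tsv",
    "simulator/total_adj_list.json",
    "src/vocab/vocab_data/train_vocab.txt",
    "src/vocab/vocab_data/train_glove.npy",
] + ["tasks/%s/data" % task for task in ("R2R", "R4R", "R6R", "R8R", "R2T8")]

missing = [path for path in expected if not os.path.exists(path)]
print("all data in place" if not missing else "missing: " + ", ".join(missing))
```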
38 | **Updates**: The old link for the ResNet feature has expired. Please see [here](https://drive.google.com/file/d/1HjEH3EQt-aHSjolg0VnX_YF1UEHiXLfT/view?usp=sharing) for the new link and the additional [**landmark alignment code**](https://drive.google.com/file/d/1soXYE-IMveMpvjAi6lNSpJURE3B4IYk3/view?usp=sharing).
39 |
40 | ## Training and evaluation
41 | As an example, we show how to train BABYWALK on R2R.
42 |
43 | ### Warmup with IL
44 | ```
45 | CUDA_VISIBLE_DEVICES=0 python src/train_follower.py \
46 | --split_postfix "_landmark" \
47 | --task_name R2R \
48 | --n_iters 50000 \
49 | --model_name "follower_bbw" \
50 | --il_mode "landmark_split" \
51 | --one_by_one \
52 | --one_by_one_mode "landmark" \
53 | --history \
54 | --log_every 100
55 | ```
56 |
57 | ### Training with CRL
58 | ```
59 | CUDA_VISIBLE_DEVICES=0 python src/train_follower.py \
60 | --split_postfix "_landmark" \
61 | --task_name R2R \
62 | --n_iters 30000 \
63 | --curriculum_iters 5000 \
64 | --model_name "follower_bbw_crl" \
65 | --one_by_one \
66 | --one_by_one_mode "landmark" \
67 | --history \
68 | --log_every 100 \
69 | --reward \
70 | --reward_type "cls" \
71 | --batch_size 64 \
72 | --curriculum_rl \
73 | --max_curriculum 4 \
74 | --no_speaker \
75 | --follower_prefix "tasks/R2R/follower/snapshots/follower_bbw_sample_train_iter_30000"
76 | ```
77 |
78 | ### Other baselines
79 | As further examples, we show how to train the Speaker-Follower (SF) and Reinforced Cross-modal Matching (RCM) baselines on R2R.
80 | + Speaker-Follower
81 | ```
82 | CUDA_VISIBLE_DEVICES=0 python src/train_follower.py \
83 | --task_name R2R \
84 | --n_iters 50000 \
85 | --model_name "follower_sf_aug" \
86 | --add_augment
87 | CUDA_VISIBLE_DEVICES=0 python src/train_follower.py \
88 | --task_name R2R \
89 | --n_iters 20000 \
90 | --model_name "follower_sf" \
91 | --follower_prefix "tasks/R2R/follower/snapshots/best_model"
92 | ```
93 | + Reinforced Cross-modal Matching
94 | ```
95 | CUDA_VISIBLE_DEVICES=0 python src/train_follower.py \
96 | --task_name R2R \
97 | --n_iters 20000 \
98 | --model_name "follower_rcm_cls" \
99 | --reward \
100 | --reward_type "cls" \
101 | --batch_size 64 \
102 | --no_speaker \
103 | --follower_prefix "tasks/R2R/follower/snapshots/follower_sf_aug_sample_train-literal_speaker_data_augmentation_iter_50000"
104 | ```
105 |
106 | ### Evaluation
107 | Here we take the BABYWALK model trained on R2R as an example.
108 | + Evaluate on the validation unseen data of Room 2-to-8.
109 | ```
110 | CUDA_VISIBLE_DEVICES=0 python src/val_follower.py \
111 | --task_name R2T8 \
112 | --split_postfix "_landmark" \
113 | --one_by_one \
114 | --one_by_one_mode "landmark" \
115 | --model_name "follower_bbw" \
116 | --history \
117 | --follower_prefix "tasks/R2R/follower/snapshots/best_model"
118 | ```
119 |
120 | + Evaluate on the validation seen / unseen data of R**x**R (**x**=2,4,6,8).
121 | + change ``` --task_name R2T8 ``` to ``` --task_name RxR ```
122 | + Evaluate on the test data of R2R.
123 | + set ``` --task_name R2R ```
124 | + add ``` --use test ```
125 | + For SF/RCM models, evaluate on R**x**R (**x**=2,4,6,8).
126 | + set ``` --task_name RxR ```
127 |   + set ``` --max_steps 5*x ``` and ``` --max_ins_len 50*x ``` (e.g., for R6R: ``` --max_steps 30 --max_ins_len 300 ```)
128 | ## Download the models reported in our paper
129 | ```
130 | chmod +x download_model.sh
131 | ./download_model.sh
132 | ```
133 | ### Performance comparison on SDTW
134 | **Models trained on R4R**
135 | | Model | Eval R2R | Eval R4R | Eval R6R | Eval R8R |
136 | |:-------------------:|:--------:|:--------:|:--------:|:--------:|
137 | | SF | 14.8 | 9.2 | 5.2 | 5.0 |
138 | | RCM(FIDELITY) | 18.3 | 13.7 | 7.9 | 6.1 |
139 | | REGRETFUL | 13.4 | 13.5 | 7.5 | 5.6 |
140 | | FAST | 14.2 | 15.5 | 7.7 | 6.3 |
141 | | BABYWALK | 27.8 | 17.3 | 13.1 | 11.5 |
142 | | BABYWALK(COGROUND) | ***31.6*** | ***20.0*** | ***15.9*** | ***13.9*** |
143 |
144 |
145 | **Models trained on R2R**
146 | | Model | Eval R2R | Eval R4R | Eval R6R | Eval R8R |
147 | |:-------------------:|:--------:|:--------:|:--------:|:--------:|
148 | | SF | 27.2 | 6.7 | 7.2 | 3.8 |
149 | | RCM(FIDELITY) | 34.4 | 7.2 | 8.4 | 4.3 |
150 | | REGRETFUL | 40.6 | 9.8 | 6.8 | 2.4 |
151 | | FAST | ***45.4*** | 7.2 | 8.5 | 2.4 |
152 | | BABYWALK | 36.9 | ***13.8*** | ***11.2*** | ***9.8*** |
153 |
154 |
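SDTW (success weighted by normalized dynamic time warping) scores how closely the executed path follows the reference path, gated by whether the agent actually reaches the goal; the repository ships metric code under `tasks/R4R/dtw.py` and `tasks/R4R/cls.py`. Below is a rough sketch of the idea only (not the repository's implementation); in practice `dist` would be the shortest-path distance in the navigation graph and the success threshold is about 3 meters.

```python
# Rough sketch of SDTW; see tasks/R4R/dtw.py for the reference implementation.
import numpy as np

def dtw(prediction, reference, dist):
    """Classic O(n*m) dynamic-time-warping distance between two paths."""
    n, m = len(prediction), len(reference)
    cost = np.full((n + 1, m + 1), np.inf)
    cost[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            d = dist(prediction[i - 1], reference[j - 1])
            cost[i, j] = d + min(cost[i - 1, j], cost[i, j - 1], cost[i - 1, j - 1])
    return cost[n, m]

def sdtw(prediction, reference, dist, success_threshold=3.0):
    """nDTW gated by success (the agent stops within the threshold of the goal)."""
    ndtw = np.exp(-dtw(prediction, reference, dist)
                  / (len(reference) * success_threshold))
    success = dist(prediction[-1], reference[-1]) <= success_threshold
    return ndtw if success else 0.0
```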
155 | ## Citation
156 |
157 | Please cite the following BibTeX entry if you use any content from this repository:
158 | ```
159 | @inproceedings{zhu2020babywalk,
160 | title = "{B}aby{W}alk: Going Farther in Vision-and-Language Navigation by Taking Baby Steps",
161 | author = "Zhu, Wang and Hu, Hexiang and Chen, Jiacheng and Deng, Zhiwei and Jain, Vihan and Ie, Eugene and Sha, Fei",
162 | booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
163 | year = "2020",
164 | publisher = "Association for Computational Linguistics",
165 | pages = "2539--2556",
166 | }
167 | ```
168 |
169 |
--------------------------------------------------------------------------------
/download.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # vocab data
4 | mkdir -p src/vocab/vocab_data
5 | wget https://www.dropbox.com/s/r71i31xpm1zy3oy/sub_train_vocab.txt?dl=0 -O src/vocab/vocab_data/sub_train_vocab.txt
6 | wget https://www.dropbox.com/s/xqt6et0i1g41t88/train_glove.npy?dl=0 -O src/vocab/vocab_data/train_glove.npy
7 | wget https://www.dropbox.com/s/l7dee5fls07t9q0/train_vocab.txt?dl=0 -O src/vocab/vocab_data/train_vocab.txt
8 | wget https://www.dropbox.com/s/cjapgv3rpxrq1ie/trainval_glove.npy?dl=0 -O src/vocab/vocab_data/trainval_glove.npy
9 | wget https://www.dropbox.com/s/3s2plada1vttxuv/trainval_vocab.txt?dl=0 -O src/vocab/vocab_data/trainval_vocab.txt
10 |
11 | # resnet feature
12 | mkdir -p simulator/resnet_feature/
13 | curl -c /tmp/cookies "https://drive.google.com/uc?export=download&id=1HjEH3EQt-aHSjolg0VnX_YF1UEHiXLfT" > /tmp/intermezzo.html
14 | curl -L -b /tmp/cookies "https://drive.google.com$(cat /tmp/intermezzo.html | grep -Po 'uc-download-link" [^>]* href="\K[^"]*' | sed 's/\&/\&/g')" > simulator/resnet_feature/ResNet-152-imagenet.zip
15 | unzip simulator/resnet_feature/ResNet-152-imagenet.zip -d simulator/resnet_feature
16 |
17 | # adjacency dict
18 | wget https://www.dropbox.com/s/6a076293c3o77gi/total_adj_list.json?dl=0 -O simulator/total_adj_list.json
19 |
20 | # training/eval data
21 | mkdir -p tasks/R2R/data
22 | mkdir -p tasks/R4R/data
23 | mkdir -p tasks/R6R/data
24 | mkdir -p tasks/R8R/data
25 | mkdir -p tasks/R2T8/data
26 | wget https://www.dropbox.com/s/2v3f72vpoj53r6d/R2R_data.zip?dl=0 -O tasks/R2R/data/R2R_data.zip
27 | wget https://www.dropbox.com/s/7n7ptzkjr601dq9/R4R_data.zip?dl=0 -O tasks/R4R/data/R4R_data.zip
28 | wget https://www.dropbox.com/s/bjqwu9tn0t6f50r/R6R_data.zip?dl=0 -O tasks/R6R/data/R6R_data.zip
29 | wget https://www.dropbox.com/s/kdid25goi88sgxo/R8R_data.zip?dl=0 -O tasks/R8R/data/R8R_data.zip
30 | wget https://www.dropbox.com/s/aswlh36v68x3al0/R2T8_data.zip?dl=0 -O tasks/R2T8/data/R2T8_data.zip
31 | unzip tasks/R2R/data/R2R_data.zip -d tasks/R2R/data
32 | unzip tasks/R4R/data/R4R_data.zip -d tasks/R4R/data
33 | unzip tasks/R6R/data/R6R_data.zip -d tasks/R6R/data
34 | unzip tasks/R8R/data/R8R_data.zip -d tasks/R8R/data
35 | unzip tasks/R2T8/data/R2T8_data.zip -d tasks/R2T8/data
36 |
37 | # download speaker model
38 | mkdir -p tasks/R2R/speaker/snapshots
39 | mkdir -p tasks/R4R/speaker/snapshots
40 | mkdir -p tasks/R6R/speaker/snapshots
41 | mkdir -p tasks/R8R/speaker/snapshots
42 | wget https://www.dropbox.com/s/65z90zktd7w6dtz/speaker.zip?dl=0 -O tasks/R2R/speaker/snapshots/speaker.zip
43 | wget https://www.dropbox.com/s/q223j0vn1ofd89z/speaker.zip?dl=0 -O tasks/R4R/speaker/snapshots/speaker.zip
44 | unzip tasks/R2R/speaker/snapshots/speaker.zip -d tasks/R2R/speaker/snapshots
45 | unzip tasks/R4R/speaker/snapshots/speaker.zip -d tasks/R4R/speaker/snapshots
46 | cp tasks/R4R/speaker/snapshots/* tasks/R6R/speaker/snapshots
47 | cp tasks/R6R/speaker/snapshots/* tasks/R8R/speaker/snapshots
--------------------------------------------------------------------------------
/download_model.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # babywalk models
4 | wget https://www.dropbox.com/s/buxgob1xusp4401/follower_r2r_bbw.zip?dl=0 -O tasks/R2R/follower/follower_r2r_bbw.zip
5 | wget https://www.dropbox.com/s/okpjhgmdvzrkwiq/follower_r4r_bbw.zip?dl=0 -O tasks/R4R/follower/follower_r4r_bbw.zip
6 | wget https://www.dropbox.com/s/mc8iuav1g5buqfp/follower_r6r_bbw.zip?dl=0 -O tasks/R6R/follower/follower_r6r_bbw.zip
7 | wget https://www.dropbox.com/s/ttwff1nv5sthd9t/follower_r8r_bbw.zip?dl=0 -O tasks/R8R/follower/follower_r8r_bbw.zip
8 |
9 | # babywalk(coground) models
10 | wget https://www.dropbox.com/s/l2tnmb0ej6y5l0y/follower_coground_r4r_bbw.zip?dl=0 -O tasks/R4R/follower/follower_coground_r4r_bbw.zip
11 |
--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | sys.path.append("model")
4 |
--------------------------------------------------------------------------------
/model/attentions.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 |
5 | class SoftDotAttention(nn.Module):
6 | '''Soft Dot Attention.
7 |
8 | Ref: http://www.aclweb.org/anthology/D15-1166
9 | Adapted from PyTorch OPEN NMT.
10 | '''
11 |
12 | def __init__(self, dim, ctx_dim=None):
13 | '''Initialize layer.'''
14 | super(SoftDotAttention, self).__init__()
15 | if ctx_dim is None:
16 | ctx_dim = dim
17 | self.linear_in = nn.Linear(dim, ctx_dim, bias=False)
18 | self.sm = nn.Softmax(dim=1)
19 | self.linear_out = nn.Linear(dim + ctx_dim, dim, bias=False)
20 | self.tanh = nn.Tanh()
21 |
22 | def forward(self, h, context, mask=None):
23 | '''Propagate h through the network.
24 |
25 | h: batch x dim
26 | context: batch x seq_len x dim
27 | mask: batch x seq_len indices to be masked
28 | '''
29 | target = self.linear_in(h).unsqueeze(2) # batch x dim x 1
30 |
31 | # Get attention
32 | attn = torch.bmm(context, target).squeeze(2) # batch x seq_len
33 | if mask is not None:
34 | # -Inf masking prior to the softmax
35 | attn.data.masked_fill_(mask, -float('inf'))
36 | attn = self.sm(attn)
37 | attn3 = attn.view(attn.size(0), 1, attn.size(1)) # batch x 1 x seq_len
38 |
39 | weighted_context = torch.bmm(attn3, context).squeeze(1) # batch x dim
40 | h_tilde = torch.cat((weighted_context, h), 1)
41 |
42 | h_tilde = self.tanh(self.linear_out(h_tilde))
43 | return h_tilde, attn
44 |
45 |
46 | class WhSoftDotAttention(nn.Module):
47 | ''' Visual Dot Attention Layer. '''
48 |
49 | def __init__(self, h_dim, v_dim=None):
50 | '''Initialize layer.'''
51 | super(WhSoftDotAttention, self).__init__()
52 | if v_dim is None:
53 | v_dim = h_dim
54 | self.h_dim = h_dim
55 | self.v_dim = v_dim
56 | self.linear_in_h = nn.Linear(h_dim, v_dim, bias=True)
57 | self.sm = nn.Softmax(dim=1)
58 |
59 | def forward(self, h, k, mask=None, v=None):
60 | '''Propagate h through the network.
61 | h: batch x h_dim
62 | k: batch x v_num x v_dim
63 | '''
64 | target = self.linear_in_h(h).unsqueeze(2) # batch x dot_dim x 1
65 | attn = torch.bmm(k, target).squeeze(2) # batch x v_num
66 | if mask is not None:
67 | attn.data.masked_fill_(mask, -float('inf'))
68 | attn_sm = self.sm(attn)
69 | attn3 = attn_sm.view(attn.size(0), 1, attn.size(1)) # batch x 1 x v_num
70 | ctx = v if v is not None else k
71 | weighted_context = torch.bmm(attn3, ctx).squeeze(1) # batch x v_dim
72 | return weighted_context, attn
73 |
74 |
75 | class TextDotAttention(nn.Module):
76 | '''Soft Dot Attention.
77 |
78 | Ref: http://www.aclweb.org/anthology/D15-1166
79 | Adapted from PyTorch OPEN NMT.
80 | '''
81 |
82 | def __init__(self, dim):
83 | '''Initialize layer.'''
84 | super(TextDotAttention, self).__init__()
85 | self.linear_in = nn.Linear(dim * 2, dim, bias=False)
86 | self.sm = nn.Softmax(dim=1)
87 | self.linear_out = nn.Linear(dim * 2, dim, bias=False)
88 | self.tanh = nn.Tanh()
89 |
90 | def forward(self, h, c, context, mask=None):
91 | '''Propagate h through the network.
92 |
93 | h: batch x dim
94 | context: batch x seq_len x dim
95 | mask: batch x seq_len indices to be masked
96 | '''
97 | target = self.linear_in(torch.cat((h, c), -1)).unsqueeze(
98 | 2) # batch x dim x 1
99 |
100 | # Get attention
101 | attn = torch.bmm(context, target).squeeze(2) # batch x seq_len
102 | if mask is not None:
103 | # -Inf masking prior to the softmax
104 | attn.data.masked_fill_(mask, -float('inf'))
105 | attn = self.sm(attn)
106 | attn3 = attn.view(attn.size(0), 1, attn.size(1)) # batch x 1 x seq_len
107 |
108 | weighted_context = torch.bmm(attn3, context).squeeze(1) # batch x dim
109 | h_tilde = torch.cat((weighted_context, h), 1)
110 |
111 | h_tilde = self.tanh(self.linear_out(h_tilde))
112 | return h_tilde, attn
113 |
114 |
115 | class VisualSoftDotAttention(nn.Module):
116 | ''' Visual Dot Attention Layer. '''
117 |
118 | def __init__(self, h_dim, v_dim, dot_dim=256):
119 | '''Initialize layer.'''
120 | super(VisualSoftDotAttention, self).__init__()
121 | self.linear_in_h = nn.Linear(h_dim, dot_dim, bias=True)
122 | self.linear_in_v = nn.Linear(v_dim, dot_dim, bias=True)
123 | self.sm = nn.Softmax(dim=1)
124 |
125 | def forward(self, h, visual_context, mask=None):
126 | '''Propagate h through the network.
127 |
128 | h: batch x h_dim
129 | visual_context: batch x v_num x v_dim
130 | '''
131 | target = self.linear_in_h(h).unsqueeze(2) # batch x dot_dim x 1
132 | context = self.linear_in_v(visual_context) # batch x v_num x dot_dim
133 |
134 | # Get attention
135 | attn = torch.bmm(context, target).squeeze(2) # batch x v_num
136 | attn = self.sm(attn)
137 | attn3 = attn.view(attn.size(0), 1, attn.size(1)) # batch x 1 x v_num
138 |
139 | weighted_context = torch.bmm(
140 | attn3, visual_context).squeeze(1) # batch x v_dim
141 | return weighted_context, attn
142 |
--------------------------------------------------------------------------------
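A minimal shape check for `SoftDotAttention` (tensor sizes are illustrative; `model/` is put on `sys.path` the same way `model/__init__.py` does):

```python
import sys
sys.path.append("model")   # mirrors model/__init__.py
import torch
from attentions import SoftDotAttention

attn = SoftDotAttention(dim=512)    # ctx_dim defaults to dim
h = torch.randn(4, 512)             # batch x dim
ctx = torch.randn(4, 20, 512)       # batch x seq_len x dim
h_tilde, alpha = attn(h, ctx)       # optional mask: batch x seq_len, nonzero entries are masked
print(h_tilde.shape, alpha.shape)   # torch.Size([4, 512]) torch.Size([4, 20])
```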
/model/context_encoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
4 | from attentions import SoftDotAttention
5 | from cuda import try_cuda
6 |
7 |
8 | class ContextEncoder(nn.Module):
9 | def __init__(self, feature_size, hidden_size, dropout_ratio):
10 | ''' Bidirectional LSTM ContextEncoder. '''
11 | super().__init__()
12 | self.hidden_size = hidden_size
13 | self.feature_size = feature_size
14 | self.lstm = nn.LSTM(feature_size, self.hidden_size // 2, 1,
15 | batch_first=True, bidirectional=True)
16 | self.drop = nn.Dropout(p=dropout_ratio)
17 | self.attention_layer = SoftDotAttention(self.hidden_size,
18 | ctx_dim=feature_size)
19 | self.post_lstm = nn.LSTM(self.hidden_size, self.hidden_size // 2, 1,
20 | batch_first=True, bidirectional=True)
21 |
22 | def forward(self, feature, action_embeds, lengths):
23 | '''
24 |     :param action_embeds: (batch_size, length, 2048). The action taken (with the image feature)
25 |     :param feature: (batch_size, length, 36, 2048). The features of the panoramic views
26 |     :param lengths: list of per-sample trajectory lengths (a length of 0 yields a zero vector)
27 |     :return: context of shape (batch_size, hidden_size), taken at the last valid step
28 | '''
29 |
30 | # LSTM on the action embed
31 | new_lengths = [1 if l == 0 else l for l in lengths]
32 | packed_embeds = pack_padded_sequence(action_embeds, new_lengths,
33 | enforce_sorted=False,
34 | batch_first=True)
35 |
36 | enc_h, _ = self.lstm(packed_embeds)
37 | ctx, _ = pad_packed_sequence(enc_h, batch_first=True)
38 | ctx = self.drop(ctx)
39 |
40 | # Att and Handle with the shape
41 | batch_size, max_length, _ = ctx.size()
42 | x, _ = self.attention_layer( # Attend to the feature map
43 | ctx.contiguous().view(-1, self.hidden_size),
44 | # (batch, length, hidden) --> (batch x length, hidden)
45 | feature.view(batch_size * max_length, -1, self.feature_size),
46 | # (batch, length, # of images, feature_size) --> (batch x length, # of images, feature_size)
47 | )
48 | x = x.view(batch_size, max_length, -1)
49 | x = self.drop(x)
50 |
51 | # Post LSTM layer
52 | packed_x = pack_padded_sequence(x, new_lengths,
53 | enforce_sorted=False, batch_first=True)
54 | # self.post_lstm.flatten_parameters()
55 | enc_x, _ = self.post_lstm(packed_x)
56 | x, _ = pad_packed_sequence(enc_x, batch_first=True)
57 |
58 | out = torch.stack([x[i, l - 1, :] if l > 0
59 | else try_cuda(torch.zeros(self.hidden_size))
60 | for i, l in enumerate(lengths)], dim=0)
61 | return out
62 |
63 |
64 | class LSTMMemory(nn.Module):
65 | def __init__(self, hidden_size):
66 | '''Initialize layer.'''
67 | super().__init__()
68 | self.hidden_size = hidden_size
69 | self.lstm = nn.LSTM(self.hidden_size, self.hidden_size,
70 | batch_first=True, bidirectional=False)
71 |
72 | def forward(self, hidden_variables, lengths):
73 | if len(hidden_variables) != 0:
74 | x = hidden_variables # [batch * length * hidden]
75 | ctx, _ = self.lstm(x)
76 | out = torch.stack([ctx[i, l - 1, :] if l > 0
77 | else try_cuda(torch.zeros(self.hidden_size))
78 | for i, l in enumerate(lengths)], dim=0)
79 | return out
80 | else:
81 | return try_cuda(torch.zeros(len(lengths), self.hidden_size))
82 |
--------------------------------------------------------------------------------
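A shape sketch for `ContextEncoder`, which reduces a padded trajectory of visual observations to a single vector per sample (sizes are illustrative; 2048 matches a plain ResNet feature):

```python
import sys
sys.path.append("model")   # mirrors model/__init__.py
import torch
from context_encoder import ContextEncoder

enc = ContextEncoder(feature_size=2048, hidden_size=512, dropout_ratio=0.5)
batch, length = 4, 7
action_embeds = torch.randn(batch, length, 2048)   # action embedding per past step
feature = torch.randn(batch, length, 36, 2048)     # 36 panoramic views per past step
lengths = [7, 5, 3, 2]                             # per-sample trajectory lengths
out = enc(feature, action_embeds, lengths)
print(out.shape)                                   # torch.Size([4, 512])
```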
/model/cuda.py:
--------------------------------------------------------------------------------
1 | def try_cuda(pytorch_obj):
2 | import torch.cuda
3 | try:
4 | disabled = torch.cuda.disabled
5 |   except AttributeError:
6 | disabled = False
7 | if torch.cuda.is_available() and not disabled:
8 | return pytorch_obj.cuda()
9 | else:
10 | return pytorch_obj
11 |
--------------------------------------------------------------------------------
/model/follower_coattend.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
6 | from cuda import try_cuda
7 | from attentions import SoftDotAttention, VisualSoftDotAttention
8 | from context_encoder import ContextEncoder, LSTMMemory
9 |
10 |
11 | class EltwiseProdScoring(nn.Module):
12 | '''
13 |   Linearly map h and v to the same dimension, then do an elementwise
14 |   multiplication and a linear scoring.
15 | '''
16 |
17 | def __init__(self, h_dim, a_dim, dot_dim=256):
18 | '''Initialize layer.'''
19 | super(EltwiseProdScoring, self).__init__()
20 | self.linear_in_h = nn.Linear(h_dim, dot_dim, bias=True)
21 | self.linear_in_a = nn.Linear(a_dim, dot_dim, bias=True)
22 | self.linear_out = nn.Linear(dot_dim, 1, bias=True)
23 |
24 | def forward(self, h, all_u_t, mask=None):
25 | '''Propagate h through the network.
26 |
27 | h: batch x h_dim
28 | all_u_t: batch x a_num x a_dim
29 | '''
30 | target = self.linear_in_h(h).unsqueeze(1) # batch x 1 x dot_dim
31 | context = self.linear_in_a(all_u_t) # batch x a_num x dot_dim
32 | eltprod = torch.mul(target, context) # batch x a_num x dot_dim
33 | logits = self.linear_out(eltprod).squeeze(2) # batch x a_num
34 | return logits
35 |
36 |
37 | class EltwiseProdScoringWithContext(nn.Module):
38 | '''
39 |   Linearly map h and v to the same dimension, then do an elementwise
40 |   multiplication and a linear scoring.
41 | '''
42 |
43 | def __init__(self, h_dim, a_dim, dot_dim=512, dropout=0.5):
44 | '''Initialize layer.'''
45 | super(EltwiseProdScoringWithContext, self).__init__()
46 | self.linear_combine = nn.Sequential(
47 | nn.Linear(h_dim * 3, dot_dim, bias=True),
48 | nn.ReLU(),
49 | nn.Linear(dot_dim, dot_dim, bias=True)
50 | )
51 | self.linear_in_a = nn.Linear(a_dim, dot_dim, bias=True)
52 | self.linear_out = nn.Linear(dot_dim, 1, bias=True)
53 |
54 | def forward(self, h, context, text_context, all_u_t, mask=None):
55 | '''Propagate h through the network.
56 |
57 | h: batch x h_dim
58 | all_u_t: batch x a_num x a_dim
59 | '''
60 | combine = torch.cat([F.normalize(h),
61 | F.normalize(context),
62 | F.normalize(text_context)], dim=1)
63 | target = self.linear_combine(combine).unsqueeze(1) # batch x 1 x dot_dim
64 | actions = self.linear_in_a(all_u_t) # batch x a_num x dot_dim
65 | eltprod = torch.mul(target, actions) # batch x a_num x dot_dim
66 | logits = self.linear_out(eltprod).squeeze(2) # batch x a_num
67 | return logits
68 |
69 |
70 | class EncoderLSTM(nn.Module):
71 | ''' Encodes navigation instructions, returning hidden state context (for
72 | attention methods) and a decoder initial state.
73 | '''
74 |
75 | def __init__(self, vocab_size, embedding_size, hidden_size, padding_idx,
76 | dropout_ratio, glove=None):
77 | """ Simple LSTM encoder """
78 | super(EncoderLSTM, self).__init__()
79 | self.embedding_size = embedding_size
80 | self.hidden_size = hidden_size
81 | self.drop = nn.Dropout(p=dropout_ratio)
82 | self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx)
83 | self.use_glove = glove is not None
84 | if self.use_glove:
85 | print('Using GloVe embedding')
86 | self.embedding.weight.data[...] = torch.from_numpy(glove)
87 | self.embedding.weight.requires_grad = False
88 | self.lstm = nn.LSTM(embedding_size, hidden_size, 1,
89 | batch_first=True,
90 | bidirectional=False)
91 | self.encoder2decoder = nn.Linear(hidden_size, hidden_size)
92 |
93 | def load_my_state_dict(self, state_dict):
94 | own_state = self.state_dict()
95 | for name, param in state_dict.items():
96 | if name not in own_state:
97 | continue
98 | if isinstance(param, nn.Parameter):
99 | param = param.data
100 | own_state[name].copy_(param)
101 |
102 | def init_state(self, batch_size):
103 | ''' Initialize to zero cell states and hidden states.'''
104 | h0 = try_cuda(torch.zeros(1, batch_size, self.hidden_size))
105 | c0 = try_cuda(torch.zeros(1, batch_size, self.hidden_size))
106 | return h0, c0
107 |
108 | def forward(self, *args, **kwargs):
109 | '''Encode history instructions (text context) or encode current instructions.'''
110 | if 'context' in kwargs and kwargs['context'] == True:
111 | return self.forward_context(*args)
112 | else:
113 | return self.forward_current(*args)
114 |
115 | def forward_current(self, inputs, lengths):
116 | ''' Expects input vocab indices as (batch, seq_len). Also requires a
117 | list of lengths for dynamic batching.
118 | '''
119 | batch_size = inputs.size(0)
120 | embeds = self.embedding(inputs) # (batch, seq_len, embedding_size)
121 | if not self.use_glove:
122 | embeds = self.drop(embeds)
123 | h0, c0 = self.init_state(batch_size)
124 | packed_embeds = pack_padded_sequence(embeds, lengths,
125 | enforce_sorted=False,
126 | batch_first=True)
127 | enc_h, (enc_h_t, enc_c_t) = self.lstm(packed_embeds, (h0, c0))
128 | h_t = enc_h_t[-1]
129 | c_t = enc_c_t[-1] # (batch, hidden_size)
130 |
131 | ctx, lengths = pad_packed_sequence(enc_h, batch_first=True)
132 | decoder_init = nn.Tanh()(self.encoder2decoder(h_t))
133 | ctx = self.drop(ctx)
134 | return ctx, decoder_init, c_t
135 |
136 | def forward_context(self, inputs, lengths):
137 | ''' Expects input vocab indices as (batch, seq_len). Also requires a
138 | list of lengths for dynamic batching.
139 | '''
140 | batch_size = inputs.size(0)
141 | embeds = self.embedding(inputs) # (batch, seq_len, embedding_size)
142 | if not self.use_glove:
143 | embeds = self.drop(embeds)
144 | h0, c0 = self.init_state(batch_size)
145 | packed_embeds = pack_padded_sequence(embeds, lengths,
146 | enforce_sorted=False,
147 | batch_first=True)
148 | enc_h, (enc_h_t, enc_c_t) = self.lstm(packed_embeds, (h0, c0))
149 | h_t = enc_h_t[-1]
150 | return h_t
151 |
152 |
153 | class AttnDecoderLSTM(nn.Module):
154 | '''
155 | An unrolled LSTM with attention over instructions for decoding navigation
156 | actions.
157 | '''
158 |
159 | def __init__(self, embedding_size, hidden_size, dropout_ratio, feature_size,
160 | history=False, lstm_mem=False):
161 | super(AttnDecoderLSTM, self).__init__()
162 | self.embedding_size = embedding_size
163 | self.feature_size = feature_size
164 | self.hidden_size = hidden_size
165 | self.drop = nn.Dropout(p=dropout_ratio)
166 | self.lstm = nn.LSTMCell(embedding_size + feature_size, hidden_size)
167 | self.visual_attention_layer = \
168 | VisualSoftDotAttention(hidden_size, feature_size)
169 | self.text_attention_layer = SoftDotAttention(hidden_size)
170 | if history:
171 | self.linear_context_out = nn.Linear(hidden_size, hidden_size, bias=True)
172 | self.linear_text_out = nn.Linear(hidden_size, hidden_size, bias=True)
173 | self.context_encoder = \
174 | ContextEncoder(feature_size, hidden_size, dropout_ratio)
175 | self.decoder2action_text_context = \
176 | EltwiseProdScoringWithContext(hidden_size, embedding_size)
177 | if lstm_mem:
178 | self.context_lstm = LSTMMemory(hidden_size)
179 | self.text_context_lstm = LSTMMemory(hidden_size)
180 | else:
181 | self.decoder2action = EltwiseProdScoring(hidden_size, embedding_size)
182 |
183 | def load_my_state_dict(self, state_dict):
184 | own_state = self.state_dict()
185 | for name, param in state_dict.items():
186 | if name not in own_state:
187 | continue
188 | if isinstance(param, nn.Parameter):
189 | param = param.data
190 | own_state[name].copy_(param)
191 |
192 | def forward(self, *args, **kwargs):
193 | '''Encode history trajectories (visual context) or decode current trajectories.'''
194 | if 'context' in kwargs and kwargs['context'] == True:
195 | return self.forward_context(*args)
196 | else:
197 | return self.forward_current(*args,
198 | ctx_mask=kwargs['ctx_mask'],
199 | history_context=kwargs['history_context'])
200 |
201 | def forward_current(self, u_t_prev, all_u_t, visual_context, h_0, c_0, ctx,
202 | ctx_mask=None, history_context=None):
203 | ''' Takes a single step in the decoder LSTM (allowing sampling).
204 |
205 | u_t_prev: batch x embedding_size
206 | all_u_t: batch x a_num x embedding_size
207 | visual_context: batch x v_num x feature_size
208 | h_0: batch x hidden_size
209 | c_0: batch x hidden_size
210 | ctx: batch x seq_len x dim
211 | ctx_mask: batch x seq_len - indices to be masked
212 | history_context: None or [batch x hidden_size, batch x hidden_size]
213 | '''
214 | feature, alpha_v = self.visual_attention_layer(h_0, visual_context)
215 | concat_input = torch.cat((u_t_prev, feature), 1)
216 | concat_drop = self.drop(concat_input)
217 |
218 | h_1, c_1 = self.lstm(concat_drop, (h_0, c_0))
219 | h_1_drop = self.drop(h_1)
220 | h_tilde, alpha = self.text_attention_layer(h_1_drop, ctx, ctx_mask)
221 |
222 | if history_context is not None:
223 | context = self.linear_context_out(history_context[0])
224 | text = self.linear_text_out(history_context[1])
225 | logit = self.decoder2action_text_context(h_tilde, context, text, all_u_t)
226 | else:
227 | logit = self.decoder2action(h_tilde, all_u_t)
228 | return h_1, c_1, alpha, logit, alpha_v
229 |
230 | def forward_context(self, *args):
231 | return self.context_encoder(*args)
232 |
--------------------------------------------------------------------------------
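A single decoder step of `AttnDecoderLSTM` with illustrative sizes (the real sizes come from the training scripts under `src/`); with `history=False` the plain `EltwiseProdScoring` head produces one logit per candidate action:

```python
import sys
sys.path.append("model")   # mirrors model/__init__.py
import torch
from follower_coattend import AttnDecoderLSTM

dec = AttnDecoderLSTM(embedding_size=2048, hidden_size=512,
                      dropout_ratio=0.5, feature_size=2048, history=False)
batch, a_num, v_num, seq_len = 4, 6, 36, 20
u_t_prev = torch.randn(batch, 2048)              # embedding of the previous action
all_u_t = torch.randn(batch, a_num, 2048)        # embeddings of the candidate actions
visual_context = torch.randn(batch, v_num, 2048) # panoramic view features
h_0, c_0 = torch.randn(batch, 512), torch.randn(batch, 512)
ctx = torch.randn(batch, seq_len, 512)           # encoded instruction
h_1, c_1, alpha, logit, alpha_v = dec(u_t_prev, all_u_t, visual_context, h_0, c_0, ctx,
                                      ctx_mask=None, history_context=None)
print(logit.shape)                               # torch.Size([4, 6]): one score per action
```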
/model/follower_coground.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import torch.nn as nn
4 |
5 | from torch.autograd import Variable
6 | from cuda import try_cuda
7 | from attentions import WhSoftDotAttention
8 | from context_encoder import ContextEncoder
9 |
10 |
11 | class PositionalEncoding(nn.Module):
12 | def __init__(self, d_model, dropout, max_len):
13 | super(PositionalEncoding, self).__init__()
14 | self.dropout = nn.Dropout(p=dropout)
15 |
16 | # Compute the PE once
17 | pe = torch.zeros(max_len, d_model)
18 | position = torch.arange(0, max_len).unsqueeze(1).float()
19 | div_term = torch.exp(
20 | torch.arange(0, d_model, 2).float() / d_model * (-math.log(10000.0))
21 | )
22 | pe[:, 0::2] = torch.sin(position * div_term)
23 | pe[:, 1::2] = torch.cos(position * div_term)
24 | pe = pe.unsqueeze(0)
25 | self.register_buffer('pe', pe)
26 |
27 | def forward(self, x):
28 | x = x + Variable(self.pe[:, :x.size(1)], requires_grad=False)
29 | return self.dropout(x)
30 |
31 |
32 | class CogroundDecoderLSTM(nn.Module):
33 | def __init__(self, embedding_size, hidden_size, dropout_ratio, feature_size,
34 | max_len, history=False, visual_hidden_size=1024):
35 | super(CogroundDecoderLSTM, self).__init__()
36 | self.embedding_size = embedding_size
37 | self.feature_size = feature_size
38 | self.hidden_size = hidden_size
39 | self.u_begin = try_cuda(Variable(torch.zeros(embedding_size),
40 | requires_grad=False))
41 | self.drop = nn.Dropout(p=dropout_ratio)
42 | self.lstm = nn.LSTMCell(2 * embedding_size + hidden_size, hidden_size)
43 | self.text_attention_layer = WhSoftDotAttention(hidden_size, hidden_size)
44 | self.positional_encoding = PositionalEncoding(hidden_size,
45 | dropout=0, max_len=max_len)
46 | self.visual_attention_layer = WhSoftDotAttention(hidden_size,
47 | visual_hidden_size)
48 | self.visual_mlp = nn.Sequential(
49 | nn.BatchNorm1d(feature_size),
50 | nn.Linear(feature_size, visual_hidden_size),
51 | nn.BatchNorm1d(visual_hidden_size),
52 | nn.Dropout(dropout_ratio),
53 | nn.ReLU()
54 | )
55 | self.action_attention_layer = WhSoftDotAttention(hidden_size * 2,
56 | visual_hidden_size)
57 | self.sm = nn.Softmax(dim=1)
58 | if history:
59 | self.linear_context_out = nn.Linear(hidden_size, hidden_size, bias=True)
60 | self.linear_text_out = nn.Linear(hidden_size, hidden_size, bias=True)
61 | self.context_encoder = ContextEncoder(feature_size, hidden_size,
62 | dropout_ratio)
63 | self.linear_combine = nn.Sequential(
64 | nn.Linear(hidden_size * 4, hidden_size * 2, bias=True),
65 | nn.ReLU(),
66 | nn.Linear(hidden_size * 2, hidden_size * 2, bias=True)
67 | )
68 |
69 | def load_my_state_dict(self, state_dict):
70 | own_state = self.state_dict()
71 | for name, param in state_dict.items():
72 | if name not in own_state:
73 | continue
74 | if isinstance(param, nn.Parameter):
75 | param = param.data
76 | own_state[name].copy_(param)
77 |
78 | def forward(self, *args, **kwargs):
79 | if 'context' in kwargs and kwargs['context'] == True:
80 | return self.forward_context(*args)
81 | else:
82 | return self.forward_current(*args, ctx_mask=kwargs['ctx_mask'],
83 | history_context=kwargs['history_context'])
84 |
85 | def forward_current(self, u_t_prev, all_u_t, visual_context, h_0, c_0, ctx,
86 | ctx_mask=None, history_context=None):
87 | '''
88 | u_t_prev: batch x embedding_size
89 | all_u_t: batch x a_num x embedding_size
90 | visual_context: batch x v_num x feature_size => panoramic view, DEP
91 | h_0: batch x hidden_size
92 | c_0: batch x hidden_size
93 | ctx: batch x seq_len x dim
94 | ctx_mask: batch x seq_len - indices to be masked
95 | '''
96 | ctx_pos = self.positional_encoding(ctx)
97 | attn_text, _alpha_text = \
98 | self.text_attention_layer(h_0, ctx_pos, v=ctx, mask=ctx_mask)
99 | alpha_text = self.sm(_alpha_text)
100 |
101 | batch_size, a_size, _ = all_u_t.size()
102 | g_v = all_u_t.view(-1, self.feature_size)
103 | g_v = self.visual_mlp(g_v).view(batch_size, a_size, -1)
104 | attn_vision, _alpha_vision = \
105 | self.visual_attention_layer(h_0, g_v, v=all_u_t)
106 | alpha_vision = self.sm(_alpha_vision)
107 |
108 | concat_input = torch.cat((attn_text, attn_vision, u_t_prev), 1)
109 | drop = concat_input
110 | h_1, c_1 = self.lstm(drop, (h_0, c_0))
111 |
112 | if history_context is not None:
113 | context = self.linear_context_out(history_context[0])
114 | text = self.linear_text_out(history_context[1])
115 | action_selector = self.linear_combine(
116 | torch.cat((attn_text, h_1, context, text), 1))
117 | else:
118 | action_selector = torch.cat((attn_text, h_1), 1)
119 | _, alpha_action = self.action_attention_layer(action_selector, g_v)
120 | return h_1, c_1, alpha_text, alpha_action, alpha_vision
121 |
122 | def forward_context(self, *args):
123 | return self.context_encoder(*args)
124 |
--------------------------------------------------------------------------------
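`CogroundDecoderLSTM` adds fixed sine/cosine position signals to the encoded instruction (`ctx_pos = self.positional_encoding(ctx)`) before its text attention. A minimal shape check for `PositionalEncoding` with illustrative sizes:

```python
import sys
sys.path.append("model")   # mirrors model/__init__.py
import torch
from follower_coground import PositionalEncoding

pe = PositionalEncoding(d_model=512, dropout=0.0, max_len=80)
ctx = torch.randn(4, 20, 512)   # batch x seq_len x hidden
print(pe(ctx).shape)            # torch.Size([4, 20, 512]); only the first 20 positions are used
```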
/model/speaker_lstm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from attentions import SoftDotAttention
4 |
5 |
6 | class SpeakerEncoderLSTM(nn.Module):
7 | def __init__(self, feature_size, hidden_size, dropout_ratio):
8 | ''' Bidirectional LSTM Speaker'''
9 | super().__init__()
10 | self.hidden_size = hidden_size
11 | self.feature_size = feature_size
12 | self.lstm = nn.LSTM(feature_size, self.hidden_size // 2, 1,
13 | batch_first=True,
14 | bidirectional=True)
15 | self.drop = nn.Dropout(p=dropout_ratio)
16 | self.drop3 = nn.Dropout(p=0.3)
17 | self.attention_layer = SoftDotAttention(self.hidden_size, feature_size)
18 | self.post_lstm = nn.LSTM(self.hidden_size, self.hidden_size // 2, 1,
19 | batch_first=True,
20 | bidirectional=True)
21 |
22 | def load_my_state_dict(self, state_dict):
23 | own_state = self.state_dict()
24 | for name, param in state_dict.items():
25 | if name not in own_state:
26 | continue
27 | if isinstance(param, nn.Parameter):
28 | param = param.data
29 | own_state[name].copy_(param)
30 |
31 | def forward(self, feature, action_embeds, lengths):
32 | """
33 |     :param action_embeds: (batch_size, length, 2052). The action taken (with the image feature)
34 |     :param feature: (batch_size, length, 36, 2052). The features of the panoramic views
35 |     :param lengths: not used
36 | :return: context with shape (batch_size, length, hidden_size)
37 | """
38 | x = action_embeds
39 | x[..., :-128] = self.drop3(
40 | x[..., :-128]) # Do not dropout the spatial features
41 |
42 | # LSTM on the action embed
43 | ctx, _ = self.lstm(x)
44 | ctx = self.drop(ctx)
45 |
46 | # Att and Handle with the shape
47 | batch_size, max_length, _ = ctx.size()
48 | feature[..., :-128] = self.drop3(
49 | feature[..., :-128]) # Dropout the image feature
50 | x, _ = self.attention_layer( # Attend to the feature map
51 | ctx.contiguous().view(-1, self.hidden_size),
52 | # (batch, length, hidden) --> (batch x length, hidden)
53 | feature.view(batch_size * max_length, -1, self.feature_size),
54 | # (batch, length, # of images, feature_size) --> (batch x length, # of images, feature_size)
55 | )
56 | x = x.view(batch_size, max_length, -1)
57 | x = self.drop(x)
58 |
59 | # Post LSTM layer
60 | x, _ = self.post_lstm(x)
61 | x = self.drop(x)
62 |
63 | return x
64 |
65 |
66 | class SpeakerDecoderLSTM(nn.Module):
67 | def __init__(self, vocab_size, embedding_size, padding_idx, hidden_size,
68 | dropout_ratio, glove=None):
69 | super().__init__()
70 | self.hidden_size = hidden_size
71 | self.embedding = torch.nn.Embedding(vocab_size, embedding_size,
72 | padding_idx)
73 | self.use_glove = glove is not None
74 | if self.use_glove:
75 | print('Using GloVe embedding')
76 | self.embedding.weight.data[...] = torch.from_numpy(glove)
77 | self.embedding.weight.requires_grad = False
78 | self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
79 | self.drop = nn.Dropout(dropout_ratio)
80 | self.attention_layer = SoftDotAttention(hidden_size, hidden_size)
81 | self.projection = nn.Linear(hidden_size, vocab_size)
82 | self.baseline_projection = nn.Sequential(
83 | nn.Linear(hidden_size, 128),
84 | nn.ReLU(),
85 | nn.Dropout(dropout_ratio),
86 | nn.Linear(128, 1)
87 | )
88 |
89 | def load_my_state_dict(self, state_dict):
90 | own_state = self.state_dict()
91 | for name, param in state_dict.items():
92 | if name not in own_state:
93 | continue
94 | if isinstance(param, nn.Parameter):
95 | param = param.data
96 | own_state[name].copy_(param)
97 |
98 | def forward(self, words, ctx, ctx_mask, h0, c0):
99 | h0, c0 = h0.unsqueeze(0), c0.unsqueeze(0)
100 | embeds = self.embedding(words)
101 | embeds = self.drop(embeds)
102 | x, (h1, c1) = self.lstm(embeds, (h0, c0))
103 |
104 | x = self.drop(x)
105 |
106 | # Get the size
107 | batchXlength = words.size(0) * words.size(1)
108 | multiplier = batchXlength // ctx.size(0)
109 |
110 | # Att and Handle with the shape
111 | # Reshaping x --> (b(word)*l(word), r)
112 | # Expand the ctx from (b, a, r) --> (b(word)*l(word), a, r)
113 | # Expand the ctx_mask (b, a) --> (b(word)*l(word), a)
114 | x, _ = self.attention_layer(
115 | x.contiguous().view(batchXlength, self.hidden_size),
116 | ctx.unsqueeze(1).expand(-1, multiplier, -1, -1).contiguous().view(
117 | batchXlength, -1, self.hidden_size),
118 | mask=ctx_mask.unsqueeze(1).expand(-1, multiplier, -1).contiguous().view(
119 | batchXlength, -1)
120 | )
121 | x = x.view(words.size(0), words.size(1), self.hidden_size)
122 |
123 | # Output the prediction logit
124 | x = self.drop(x)
125 | logit = self.projection(x)
126 |
127 | return logit, h1.squeeze(0), c1.squeeze(0)
128 |
--------------------------------------------------------------------------------
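A shape sketch for the speaker encoder (sizes follow the docstring above; note that the encoder exempts the last 128 feature dimensions from dropout):

```python
import sys
sys.path.append("model")   # mirrors model/__init__.py
import torch
from speaker_lstm import SpeakerEncoderLSTM

enc = SpeakerEncoderLSTM(feature_size=2052, hidden_size=512, dropout_ratio=0.5)
batch, length = 2, 5
action_embeds = torch.randn(batch, length, 2052)   # action features along the trajectory
feature = torch.randn(batch, length, 36, 2052)     # 36 panoramic views per step
ctx = enc(feature, action_embeds, lengths=None)    # `lengths` is unused by this encoder
print(ctx.shape)                                   # torch.Size([2, 5, 512])
```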
/requirements.txt:
--------------------------------------------------------------------------------
1 | # This file may be used to create an environment using:
2 | # $ conda create --name <env> --file <this file>
3 | # platform: linux-64
4 | python=3.7.3
5 | pytorch=1.1.0
6 | torchvision=0.3.0
7 | numpy=1.16.2
8 | networkx=2.2
--------------------------------------------------------------------------------
/simulator/connectivity/8194nk5LbLH_connectivity.json:
--------------------------------------------------------------------------------
1 | [{"image_id":"c9e8dc09263e4d0da77d16de0ecddd39","pose":[-0.611043,-0.00396746,-0.791588,-0.213904,0.791585,-0.00882497,-0.610996,2.305,-0.00456166,-0.999953,0.00853306,1.56916,0,0,0,1],"included":true,"visible":[false,false,false,false,true,true,false,true,true,true,false,false,false,false,false,false,false,false,false,false],"unobstructed":[false,false,false,false,true,false,false,false,true,true,false,false,false,false,false,false,false,false,false,false],"height":1.5826326295962942},{"image_id":"286b0c2d9a46408ba80b6ccebb21e582","pose":[0.951596,0.00201098,0.307346,6.58012,-0.307351,0.00915895,0.951552,-2.96479,-0.000901435,-0.999956,0.00933374,4.36353,0,0,0,1],"included":true,"visible":[false,false,true,true,false,false,false,false,false,false,false,false,false,true,false,true,false,true,false,true],"unobstructed":[false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,true,true,false],"height":1.5712253956498747},{"image_id":"6776097c17ed4b93aee61704eb32f06c","pose":[-0.711582,-0.00419131,-0.702591,-1.68941,0.702575,0.00464776,-0.711594,-5.37908,0.00624796,-0.99998,-0.000362505,1.58622,0,0,0,1],"included":true,"visible":[false,false,false,false,false,true,true,true,false,true,false,true,false,false,true,false,true,false,false,false],"unobstructed":[false,false,false,false,false,true,true,false,false,false,false,true,false,false,false,false,false,false,false,true],"height":1.5804941871490743},{"image_id":"8c7e8da7d4a44ab695e6b3195eac0cf1","pose":[0.709879,0.011247,0.704234,8.62929,-0.70424,-0.00407304,0.70995,-1.77115,0.0108531,-0.999928,0.00502926,4.38556,0,0,0,1],"included":true,"visible":[false,true,false,false,false,false,false,false,false,false,true,false,true,true,false,false,false,true,true,false],"unobstructed":[false,true,false,false,false,false,false,false,false,false,true,false,false,true,false,false,false,true,true,false],"height":1.585645804390483},{"image_id":"f33c718aaf2c41469389a87944442c62","pose":[0.619478,0.0166688,0.784837,-3.88437,-0.784902,-0.00375152,0.619609,-0.528748,0.0132725,-0.999854,0.0107595,1.58368,0,0,0,1],"included":true,"visible":[true,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true],"unobstructed":[true,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true],"height":1.5829827809014503},{"image_id":"fcd90a404061413385286bef9662630e","pose":[-0.111393,0.00837906,0.993741,2.80245,-0.993773,-0.00348217,-0.111367,-3.78204,0.0025272,-0.999959,0.00871482,1.58057,0,0,0,1],"included":true,"visible":[true,false,true,false,false,false,false,true,false,false,false,false,false,false,false,true,false,false,false,false],"unobstructed":[false,false,true,false,false,false,true,true,true,false,false,false,false,false,false,false,false,false,false,false],"height":1.5763528408163245},{"image_id":"c07d4ae8330542a09cf8f8dddb9728ce","pose":[-0.985207,-0.0101267,0.171069,0.656519,-0.171094,0.00168538,-0.985253,-5.08928,0.00968898,-0.999947,-0.00339301,1.57611,0,0,0,1],"included":true,"visible":[true,false,true,false,false,true,false,true,false,false,false,false,false,false,true,false,true,false,false,true],"unobstructed":[false,false,true,false,false,true,false,true,false,false,false,true,false,false,false,false,false,false,false,false],"height":1.575276915205382},{"image_id":"2393bffb53fe4205bcc67796c6fb76e3","pose":[-0.241654,0.00228344,-0.97036,3.33582,0.970294,0.0124463,-0.241608,-5.90025,0.0115256,-0.99
992,-0.00522325,1.57791,0,0,0,1],"included":true,"visible":[false,false,true,false,false,true,true,false,false,false,false,false,false,false,true,false,true,false,false,false],"unobstructed":[false,false,false,false,false,true,true,false,false,false,false,false,false,false,true,false,false,false,false,false],"height":1.5730354249357412},{"image_id":"71bf74df73cd4e24a191ef4f2338ca22","pose":[0.906931,-0.00688335,-0.421222,0.122562,0.421182,-0.00662188,0.906952,-0.00319673,-0.00903217,-0.999954,-0.00310641,1.57207,0,0,0,1],"included":true,"visible":[true,false,false,false,true,true,true,true,false,true,false,false,false,false,false,false,false,false,false,false],"unobstructed":[true,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false],"height":1.570272020216938},{"image_id":"be8a2edacab34ec8887ba6a7b1e4945f","pose":[0.791463,0.0101015,0.611133,-3.50132,-0.611154,-0.00121731,0.791511,1.58103,0.00873934,-0.999948,0.00521015,1.56992,0,0,0,1],"included":true,"visible":[true,false,true,false,true,false,false,true,false,false,false,true,false,false,false,false,false,false,false,true],"unobstructed":[true,false,false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false],"height":1.577126892771864},{"image_id":"9bdde31adaa1443bb206b09bfa3c474c","pose":[0.799844,0.0047414,0.60019,8.67581,-0.600208,0.0075118,0.799809,-4.8108,-0.000716311,-0.99996,0.00885413,2.82261,0,0,0,1],"included":true,"visible":[false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,true,true,false,false],"unobstructed":[false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false],"height":1.58264400638767},{"image_id":"66d4adb61b57494aa2c1ad141a0fad9b","pose":[-0.34536,-0.0108675,-0.938407,-2.27885,0.938436,0.00459882,-0.345423,-3.2282,0.00806945,-0.99993,0.00861029,1.58739,0,0,0,1],"included":true,"visible":[false,false,true,false,false,true,true,true,false,true,false,false,false,false,true,true,false,false,false,true],"unobstructed":[false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true],"height":1.5705441219971223},{"image_id":"83ff709c0e3e46079836153ea5c7feac","pose":[0.68423,0.0137303,0.729137,3.42529,-0.729235,0.00364543,0.684254,1.65175,0.00673696,-0.999899,0.012507,4.37069,0,0,0,1],"included":true,"visible":[false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false],"height":1.578378655072358},{"image_id":"d9e325df2f3948679c78b93d8025e2da","pose":[0.826698,0.0192407,0.562317,8.49764,-0.562455,0.00220125,0.826825,-0.816805,0.0146709,-0.999812,0.0126418,4.38875,0,0,0,1],"included":true,"visible":[false,true,false,true,false,false,false,false,false,false,true,false,true,false,false,false,false,true,true,false],"unobstructed":[false,true,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true,false],"height":1.5865892751674604},{"image_id":"423efb97f77f4e7995f19c66fe82afbc","pose":[0.958879,0.00141119,0.283813,5.51819,-0.283808,0.0124035,0.958801,-5.67527,-0.00216725,-0.999922,0.012294,1.58856,0,0,0,1],"included":true,"visible":[false,false,true,false,false,false,true,true,false,false,false,false,false,false,false,false,true,fals
e,false,false],"unobstructed":[false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,true,false,false,false],"height":1.5784339701720043},{"image_id":"6c49579a5cd34df8acb7f790b74e9eae","pose":[-0.95716,-0.00676032,-0.289482,-6.48379,0.289538,-0.00977451,-0.957117,-2.57899,0.00364085,-0.999929,0.0113132,1.59886,0,0,0,1],"included":true,"visible":[false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,true],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true],"height":1.5798282335589897},{"image_id":"aeed67040d744240b188f66f17d87d43","pose":[0.132175,0.0257204,0.990893,7.67989,-0.991226,0.00381825,0.132121,-5.81072,-0.000385302,-0.999662,0.0259995,2.29866,0,0,0,1],"included":true,"visible":[false,false,true,false,false,false,true,true,false,false,true,false,false,false,true,false,false,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false],"height":1.6026680667792301},{"image_id":"aae01016bb354f78bd6db86e9d71af2b","pose":[0.0788252,0.00384462,0.996881,6.79041,-0.996887,0.00184069,0.0788186,-0.995862,-0.00153193,-0.999991,0.0039778,4.37219,0,0,0,1],"included":true,"visible":[false,true,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true,false],"unobstructed":[false,true,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true,false],"height":1.5770919536040346},{"image_id":"346b680ac5904359a1859c929ad312b6","pose":[-0.589008,0.00463239,0.808114,5.58585,-0.808123,0.00000695791,-0.589015,0.644327,-0.00273419,-0.999989,0.00373948,4.38174,0,0,0,1],"included":true,"visible":[false,true,false,true,false,false,false,false,false,false,false,false,true,true,false,false,false,true,false,false],"unobstructed":[false,true,false,true,false,false,false,false,false,false,false,false,true,true,false,false,false,true,false,false],"height":1.5707587596461066},{"image_id":"ae91518ed77047b3bdeeca864cd04029","pose":[0.310985,0.0070688,0.950389,-4.60607,-0.950392,-0.00460962,0.31102,-2.5949,0.00657945,-0.999964,0.00528466,1.58581,0,0,0,1],"included":true,"visible":[false,false,true,false,true,true,false,true,false,true,false,true,false,false,false,true,false,false,false,false],"unobstructed":[false,false,true,false,true,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false],"height":1.5747548700639524}]
--------------------------------------------------------------------------------
/simulator/connectivity/GdvgFV5R1Z5_connectivity.json:
--------------------------------------------------------------------------------
1 | [{"image_id":"0b02e18654324edd8d74c078b66bfb20","pose":[-0.057695,-0.000357129,0.998334,-2.46692,-0.998304,-0.00769199,-0.0576965,-3.15814,0.00770012,-0.99997,0.0000884733,1.5171,0,0,0,1],"included":true,"visible":[false,true,false,false,false,true,true,true,true,false,true,false],"unobstructed":[false,false,false,false,false,true,false,true,true,false,true,false],"height":1.51470410293751},{"image_id":"1db1c0a09ecf40d188197efc05ced3bb","pose":[-0.442443,0.0138817,0.896688,-4.03893,-0.89679,-0.0101225,-0.442338,-3.05434,0.00293664,-0.999852,0.0169288,0.974424,0,0,0,1],"included":true,"visible":[true,false,false,false,true,true,false,false,true,false,false,true],"unobstructed":[false,false,false,false,false,true,false,false,true,false,false,true],"height":0.9701803380402906},{"image_id":"6178647ca8d14dc09370f6c1b7ed2fd6","pose":[-0.870025,0.0056275,0.492973,-3.69279,-0.493005,-0.0105975,-0.869962,1.95433,0.000328893,-0.999927,0.0119957,1.51516,0,0,0,1],"included":true,"visible":[false,false,false,true,false,false,true,true,false,false,true,false],"unobstructed":[false,false,false,true,false,false,true,true,false,true,true,false],"height":1.517582101716661},{"image_id":"565cc21cd28b4ee6bb5ba83c5270c032","pose":[0.0242634,0.000986587,-0.999704,-3.91782,0.999699,0.00333371,0.024267,0.178675,0.00335701,-0.999993,-0.0009042,1.50868,0,0,0,1],"included":true,"visible":[false,false,true,false,false,false,true,false,false,true,true,false],"unobstructed":[false,false,true,false,false,false,false,false,false,true,true,false],"height":1.5114421933143356},{"image_id":"ef638e508e054c4aabd49b38d1b88fc7","pose":[0.0820523,0.0151057,0.996513,-4.61631,-0.995947,-0.0356725,0.0825462,-2.18899,0.0367954,-0.999249,0.0121187,1.52757,0,0,0,1],"included":true,"visible":[false,true,false,false,false,true,false,false,true,false,false,true],"unobstructed":[false,false,false,false,false,true,false,false,true,false,false,true],"height":1.5162868543024455},{"image_id":"97ed68de989e44fdaf2d9b949898fab6","pose":[0.0900997,0.0149714,0.99582,-3.64126,-0.995713,-0.0195971,0.0903844,-3.16818,0.0208687,-0.999695,0.0131427,1.52081,0,0,0,1],"included":true,"visible":[true,true,false,false,true,false,false,false,true,false,false,true],"unobstructed":[true,true,false,false,true,false,false,false,true,false,false,true],"height":1.5211418713547455},{"image_id":"5fd70cff4992429a99a84fd3c117ccb5","pose":[-0.0539877,-0.000800861,-0.998541,0.0108044,0.998337,0.0201438,-0.0539926,0.00604319,0.020158,-0.999796,-0.000286778,1.51223,0,0,0,1],"included":true,"visible":[true,false,true,true,false,false,false,true,false,true,true,false],"unobstructed":[false,false,true,false,false,false,false,true,false,false,true,false],"height":1.5113248528175798},{"image_id":"86d342c576ff46a9828d2ba377cc8cd5","pose":[0.998173,0.0151118,-0.0584746,-1.78347,0.0584707,0.000718574,0.998288,-1.89835,0.0151283,-0.999885,-0.000165129,1.52238,0,0,0,1],"included":true,"visible":[true,false,true,false,false,false,true,false,false,false,true,false],"unobstructed":[true,false,true,false,false,false,true,false,false,false,true,false],"height":1.5103397372923053},{"image_id":"8dba9ff900b14f9b84ead660f5f7f701","pose":[-0.999855,-0.0144511,0.00887107,-4.11579,-0.00895392,0.00564829,-0.999943,-2.90606,0.0144005,-0.999879,-0.00577567,1.51617,0,0,0,1],"included":true,"visible":[true,true,false,false,true,true,false,false,false,false,false,true],"unobstructed":[true,true,false,false,true,true,false,false,false,false,false,true],"height":1.5112098807574073},{"image_id":"0d8c5
fbfd73f44e28d6da370520611e4","pose":[0.0769887,0.00664334,0.997009,-6.15424,-0.997016,-0.00490415,0.0770216,-0.0398163,0.00540151,-0.999965,0.00624716,1.50965,0,0,0,1],"included":true,"visible":[false,false,true,true,false,false,true,false,false,false,true,false],"unobstructed":[false,false,true,true,false,false,false,false,false,false,false,false],"height":1.5058928427471967},{"image_id":"aebb1de49d21485e8bef7633dfb58761","pose":[-0.0229751,-0.0058052,-0.999718,-1.94579,0.999719,0.00553997,-0.0230069,-0.026534,0.00567231,-0.999967,0.0056775,1.50582,0,0,0,1],"included":true,"visible":[true,false,true,true,false,false,true,true,false,true,false,false],"unobstructed":[true,false,true,true,false,false,true,true,false,false,false,false],"height":1.5101720791580233},{"image_id":"e34e51f3d6584ad09c510de5db84752f","pose":[-0.0418368,-0.0124855,0.999046,-3.99281,-0.993607,-0.104406,-0.0429142,-2.13265,0.104842,-0.994456,-0.00803644,0.980264,0,0,0,1],"included":true,"visible":[false,true,false,false,true,true,false,false,true,false,false,false],"unobstructed":[false,true,false,false,true,true,false,false,true,false,false,false],"height":0.969584316081611}]
--------------------------------------------------------------------------------
/simulator/connectivity/Pm6F8kyY3z2_connectivity.json:
--------------------------------------------------------------------------------
1 | [{"image_id":"dfed00b301f246989ff408657e39e88b","pose":[0.894952,-0.0207046,0.445684,-2.46151,-0.445515,0.0124171,0.895189,7.62724,-0.0240687,-0.999709,0.00188811,1.25952,0,0,0,1],"included":true,"visible":[false,false,false,false,false,false,false,true,false,false,true,true,true,false,false,true,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false],"unobstructed":[false,false,false,false,false,false,false,true,false,false,true,true,true,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.2574198157492824},{"image_id":"e37d923ec8284523b5c9ebdb35800a3a","pose":[-0.589713,-0.0119696,0.807525,-1.97599,-0.80754,0.0222594,-0.589393,-2.37926,-0.0109204,-0.999681,-0.022793,0.690385,0,0,0,1],"included":true,"visible":[false,false,true,true,true,true,true,false,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,true,false,true,true,false,false,false,true,true,false],"unobstructed":[false,false,false,true,false,true,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,true,false,false,false,false,true,false,false],"height":1.240973054769517},{"image_id":"560d4bedd6714e57b4acfe27bbc4a01b","pose":[-0.977243,-0.00165475,-0.212119,-0.0711643,0.212078,-0.028916,-0.976825,2.94531,-0.00451739,-0.999581,0.0286085,1.26221,0,0,0,1],"included":true,"visible":[false,true,false,true,true,true,false,false,true,true,false,false,true,true,false,false,false,false,true,false,false,false,false,false,true,true,true,true,false,false,true,true,false,false],"unobstructed":[false,false,false,false,false,true,false,false,true,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,true,false,false,false,false,false,false],"height":1.2613789210030284},{"image_id":"312bca1ac80742faa1617cf158beaeb1","pose":[0.312174,-0.0208526,0.949797,-2.52197,-0.949897,0.00960923,0.312418,-4.66412,-0.0156417,-0.999737,-0.0168085,0.685205,0,0,0,1],"included":true,"visible":[false,true,true,false,false,true,true,false,false,true,false,false,false,false,true,false,false,false,false,true,false,false,false,true,true,true,false,true,false,true,false,true,true,false],"unobstructed":[false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,true,true,false,false],"height":1.2426174829057284},{"image_id":"c8e422cf697348cabeac7d9bc4a84062","pose":[0.332666,-0.0162534,-0.942905,1.23423,0.942934,-0.0096225,0.332842,-1.94066,-0.0144831,-0.999822,0.0121245,0.695783,0,0,0,1],"included":true,"visible":[false,true,true,false,false,true,false,false,true,true,false,false,true,true,true,true,false,false,true,false,true,false,true,true,true,true,true,true,false,false,true,true,true,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true,false,true,false,false,true,false,false,true,false,false,true,false,false,true,false],"height":1.2459243009091168},{"image_id":"d81df9c0b8fb4b3e8a2cd12c2007461e","pose":[-0.943048,-0.011951,-0.332443,-0.105636,0.332628,-0.0203502,-0.942839,2.11049,0.00450241,-0.999722,0.0231661,1.25942,0,0,0,1],"included":true,"visible":[false,true,true,false,true,false,true,false,true,true,false,false,true,true,false,true,false,false,true,false,false,false,tru
e,true,true,true,true,true,false,false,true,true,false,false],"unobstructed":[false,true,true,false,false,false,true,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,false,false,true,false,false,false,false,false,false],"height":1.259680186683917},{"image_id":"b961fab0e5cb48979dba7f115725baba","pose":[-0.72184,-0.0146334,-0.691906,1.93977,0.692056,-0.0116192,-0.721751,4.69574,0.00252213,-0.999826,0.018514,1.26792,0,0,0,1],"included":true,"visible":[false,true,false,false,false,true,false,false,false,false,false,false,false,true,false,false,false,false,true,false,false,false,false,false,false,true,true,true,false,false,true,true,false,false],"unobstructed":[false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false],"height":1.2502711440667638},{"image_id":"fdf0fc24f7c549ccbb78e972c8024fa9","pose":[0.909305,0.00741201,-0.416064,0.00890844,0.416112,-0.00686709,0.909288,7.75754,0.00388236,-0.999949,-0.00932887,1.25305,0,0,0,1],"included":true,"visible":[true,false,false,false,false,false,false,false,false,false,true,true,true,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,true,false,false,false,false,true],"unobstructed":[true,false,false,false,false,false,false,false,false,false,true,true,true,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.2487049450992638},{"image_id":"b8f49ffb1486488bbb72693578c17865","pose":[-0.998964,-0.00962844,0.0444879,-1.8508,-0.0443137,-0.0175915,-0.998863,3.61648,0.0104,-0.999799,0.0171463,1.26079,0,0,0,1],"included":true,"visible":[false,false,true,false,true,true,false,false,false,true,false,true,true,true,false,true,false,false,true,false,false,false,true,false,true,false,true,true,false,false,false,false,false,false],"unobstructed":[false,false,true,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false],"height":1.2633617680188245},{"image_id":"cb070d66db084a79b553310df69ed31d","pose":[-0.977864,0.00832091,0.20908,-2.27665,-0.209234,-0.0284428,-0.977452,4.68783,-0.00218661,-0.999561,0.0295538,1.25989,0,0,0,1],"included":true,"visible":[false,false,true,false,false,true,false,false,true,false,false,true,true,true,true,true,false,false,true,false,false,false,true,false,true,false,true,false,false,false,false,false,false,false],"unobstructed":[false,false,true,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false],"height":1.2580748740290566},{"image_id":"f6872e6001054c67a12371a542c7defe","pose":[0.0536408,-0.0235465,0.998283,-2.43097,-0.99847,-0.0147796,0.0533023,8.50381,0.013499,-0.999614,-0.0243036,1.25061,0,0,0,1],"included":true,"visible":[true,false,false,false,false,false,false,true,false,false,false,true,true,false,false,true,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false],"unobstructed":[true,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.2465964780421934},{"image
_id":"0c5658ec8f51460fbf29d6aedcfb4bca","pose":[0.963476,-0.0168826,0.267262,-2.96338,-0.266981,0.0172023,0.963548,7.0529,-0.0208649,-0.99971,0.0120663,1.26308,0,0,0,1],"included":true,"visible":[true,false,false,false,false,false,false,true,true,true,true,false,true,false,false,true,true,false,false,false,false,true,true,false,false,false,false,false,false,false,false,false,false,false],"unobstructed":[true,false,false,false,false,false,false,true,false,false,true,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.2617934647021496},{"image_id":"138252ad9fab4ae1a86997fc363e6ac7","pose":[0.996651,-0.0156732,-0.0802583,-3.11236,0.0804086,0.00921488,0.99672,6.45377,-0.0148824,-0.999835,0.0104439,1.26,0,0,0,1],"included":true,"visible":[true,false,false,false,false,true,false,true,true,true,true,true,false,true,false,true,true,false,true,false,false,true,true,false,true,false,false,false,false,false,false,false,false,false],"unobstructed":[true,false,false,false,false,false,false,true,false,false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.2614477951937262},{"image_id":"094836afff5e4fbfbb6659a96ec665b8","pose":[0.964654,-0.00795989,-0.2634,-0.000922046,0.263512,0.0216581,0.964413,0.00337562,-0.00197203,-0.999734,0.0229897,1.24819,0,0,0,1],"included":true,"visible":[false,true,true,false,true,true,true,false,true,true,false,false,true,false,false,true,false,false,true,false,false,false,true,true,true,true,true,true,false,false,true,true,false,false],"unobstructed":[false,true,false,false,true,true,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true,false,false,true,false,false,false,false,false,false],"height":1.2526172310761738},{"image_id":"4739d48c61a04deab15db1eb2c906a96","pose":[0.193991,-0.0171755,-0.980853,-0.816552,0.980783,0.0246068,0.193546,-4.9622,0.0208112,-0.99955,0.0216185,0.679588,0,0,0,1],"included":true,"visible":[false,false,true,true,true,true,true,false,true,true,false,true,true,false,false,true,false,false,true,true,false,false,true,false,true,true,false,true,false,true,true,true,false,false],"unobstructed":[false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,true,false,false,false,true,true,true,false,false],"height":1.2596737635557855},{"image_id":"87e7b6f2006541a9abe57fba18294a0c","pose":[-0.783324,-0.016461,-0.621396,-3.0819,0.621615,-0.0208826,-0.783045,6.16326,-0.0000868454,-0.999647,0.0265899,1.25902,0,0,0,1],"included":true,"visible":[true,false,false,false,false,true,false,false,true,true,true,true,true,true,false,false,true,false,true,false,false,false,true,false,true,false,false,false,false,false,false,false,false,false],"unobstructed":[true,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false],"height":1.2599343835554653},{"image_id":"c32282e053ad450e9187d9d95361b124","pose":[0.0594054,-0.0138426,0.998138,-2.53141,-0.998234,-0.00190817,0.0593847,10.2372,0.00108244,-0.999903,-0.0139319,1.24275,0,0,0,1],"included":true,"visible":[true,false,false,false,false,false,false,false,false,false,true,true,true,false,false,true,false,false,false,false,false,false,true,false,fal
se,false,false,false,false,false,false,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.2459471379951932},{"image_id":"a8cb8984630b456a96923ef0d7c3aeb3","pose":[0.351449,0.0243773,0.93589,-0.0153321,-0.935855,-0.0182983,0.351913,8.89998,0.0257037,-0.999536,0.0163823,1.27666,0,0,0,1],"included":true,"visible":[false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,true,false,false,false,false,true],"unobstructed":[false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,true],"height":1.2550452676594444},{"image_id":"ff2dd63ad83245c3b4831b4f8a6911ac","pose":[0.998846,-0.00853272,0.047285,0.322808,-0.047168,0.01344,0.998797,-3.39025,-0.0091581,-0.999874,0.0130217,0.694594,0,0,0,1],"included":true,"visible":[false,false,true,true,true,true,true,false,true,true,false,true,true,true,true,true,false,false,false,false,true,false,true,true,true,true,true,true,false,false,true,false,false,false],"unobstructed":[false,false,false,true,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,true,false],"height":1.2549827051622604},{"image_id":"e775ea81e83c45719de5a1577f8f7e39","pose":[0.967623,-0.00934794,-0.252228,2.82818,0.252377,0.0221475,0.967376,-5.00859,-0.00345689,-0.999711,0.0237893,0.677925,0,0,0,1],"included":true,"visible":[false,false,true,true,true,true,false,false,true,true,false,false,true,true,true,true,false,false,false,false,true,false,true,false,true,true,false,false,false,true,false,false,true,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,true,false],"height":1.2544772562107187},{"image_id":"50e617dd6c4c4def9ea571ab3578f308","pose":[-0.300165,0.0205111,-0.953667,3.22939,0.953853,-0.00206313,-0.300268,-0.541289,-0.0081265,-0.999788,-0.0189456,0.672177,0,0,0,1],"included":true,"visible":[false,false,false,false,true,false,false,false,false,false,false,false,false,true,true,false,false,false,true,true,false,false,false,false,false,false,true,false,false,false,false,false,true,false],"unobstructed":[false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,true,false],"height":1.2328507397565847},{"image_id":"63eb69df65c641e19af921ff458b6046","pose":[0.991257,-0.000218759,-0.131947,0.493009,0.131935,-0.0120189,0.991186,7.68006,-0.00180282,-0.999928,-0.0118853,1.24896,0,0,0,1],"included":true,"visible":[true,false,false,false,false,false,false,true,false,false,true,true,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true],"unobstructed":[false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true],"height":1.2475721046793986},{"ima
ge_id":"8078fc4f547a4a9cb1a6bb036cc18dc9","pose":[-0.114576,0.00615355,-0.993396,-2.77681,0.993265,-0.0166685,-0.114664,5.56396,-0.0172641,-0.999842,-0.00420267,1.24986,0,0,0,1],"included":true,"visible":[false,false,false,false,false,true,false,false,true,true,false,true,true,true,false,true,false,false,true,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.2512865181863293},{"image_id":"981311e64aec4d3f8e403f349363e065","pose":[0.847247,-0.0183189,-0.530884,0.0723733,0.53092,-0.0032238,0.847416,-1.30075,-0.0172354,-0.999827,0.00699417,0.704176,0,0,0,1],"included":true,"visible":[false,true,true,false,true,true,true,false,true,true,false,false,true,true,false,true,false,false,true,false,false,false,true,false,true,true,true,true,false,false,true,true,true,false],"unobstructed":[false,true,false,false,true,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,true,false,false,false,false,true,false],"height":1.2568391809600372},{"image_id":"4ac8e1065b0b46db9e459d79a1078e04","pose":[-0.640839,0.0218408,0.767365,-0.962216,-0.767675,-0.020208,-0.640522,2.8754,0.00151725,-0.999558,0.0297163,1.26183,0,0,0,1],"included":true,"visible":[false,true,true,true,true,true,false,false,true,true,false,false,true,true,false,true,false,false,true,false,false,false,true,false,false,false,true,true,false,false,true,false,false,false],"unobstructed":[false,false,true,false,false,true,false,false,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.2639418549975774},{"image_id":"9b2b12257e9a4c5d9f99f141769e9301","pose":[0.98729,-0.0123897,0.158448,0.0965094,-0.158303,0.0119552,0.987319,-4.80181,-0.014127,-0.999852,0.00984156,0.677793,0,0,0,1],"included":true,"visible":[false,false,true,true,true,true,true,false,true,true,false,true,true,true,true,true,false,false,true,true,false,false,true,true,true,false,true,true,false,true,true,false,true,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,false,false,false],"height":1.251745413841814},{"image_id":"2596868500734b1496d479e6936cceb3","pose":[0.888937,-0.0176477,-0.45769,0.291121,0.457814,0.00353992,0.889041,-2.06553,-0.0140695,-0.999838,0.0112258,0.706623,0,0,0,1],"included":true,"visible":[false,true,true,false,true,true,true,false,true,true,false,false,true,true,false,true,false,false,true,false,true,false,true,true,true,true,false,true,false,false,true,true,true,false],"unobstructed":[false,true,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,true,false,false,true,false,false,false,false,false,false,false,true,false,false],"height":1.2553193865748646},{"image_id":"b0a4ed482e7b4f8eb499f3a999f65933","pose":[-0.848526,-0.0232996,-0.528642,1.46159,0.52901,-0.0139455,-0.848501,3.05605,0.0123974,-0.999632,0.0241584,1.26843,0,0,0,1],"included":true,"visible":[false,true,true,false,false,true,true,false,true,false,false,false,false,true,false,false,false,false,true,false,true,false,false,false,true,true,tr
ue,false,false,false,true,true,false,false],"unobstructed":[false,false,true,false,false,true,true,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false],"height":1.2609941282189379},{"image_id":"97db9c9bd9824a3ca623db620331ea66","pose":[-0.929713,0.018777,0.367808,0.0255591,-0.367902,-0.00175584,-0.929863,10.8131,-0.0168144,-0.999822,0.00854017,1.2626,0,0,0,1],"included":true,"visible":[false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,true],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true],"height":1.2659524368560175},{"image_id":"f1f56adaf5034a25929562d6c3213a8c","pose":[0.825713,-0.0302824,-0.563279,-1.5378,0.563854,0.0153007,0.825733,-5.11416,-0.0163868,-0.999425,0.0297084,0.682356,0,0,0,1],"included":true,"visible":[false,false,true,true,true,true,true,false,true,true,false,true,false,false,true,false,false,false,false,true,true,false,true,false,true,true,false,true,false,false,true,true,false,false],"unobstructed":[false,false,false,true,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false],"height":1.260110021251859},{"image_id":"7e094a308dec4987a5185a0426f4ec3c","pose":[0.993231,-0.0084319,0.115849,0.146963,-0.115685,0.0178686,0.993125,-4.26187,-0.0104441,-0.999805,0.0167718,0.681901,0,0,0,1],"included":true,"visible":[false,true,true,true,true,true,true,false,true,true,false,true,true,true,true,true,false,false,true,false,false,false,true,true,true,true,true,true,false,true,false,false,false,false],"unobstructed":[false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false],"height":1.2503792812204662},{"image_id":"c460d0f6ea8a4458a576efb07d043b39","pose":[-0.0100665,-0.0185998,0.999777,-2.6559,-0.999919,0.0080293,-0.00991839,-2.91143,-0.00784316,-0.999795,-0.0186794,0.694102,0,0,0,1],"included":true,"visible":[false,true,true,true,true,true,true,false,true,true,false,false,false,true,true,false,false,false,false,false,false,false,false,true,false,false,true,true,false,true,false,false,true,false],"unobstructed":[false,true,false,true,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,true,false,false,true,false,false,false,false],"height":1.2467171179086285},{"image_id":"be733bc20f4c4f74992d21e2741f3a17","pose":[0.996567,-0.00580693,-0.0825867,3.06603,0.0825174,-0.0112836,0.996526,-1.87107,-0.00671876,-0.99992,-0.010766,0.687863,0,0,0,1],"included":true,"visible":[false,true,true,false,true,false,false,false,false,true,false,false,false,false,false,false,false,false,true,true,true,false,false,true,false,false,true,false,false,false,false,true,false,false],"unobstructed":[false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true,true,false,false,true,false,false,false,false,false,false,false,false,false,false],"height":1.2517487620409038},{"image_id":"3fd2f1d849de4e9f8c078
63c845db6b5","pose":[-0.876161,0.0123836,0.481861,0.00165134,-0.482015,-0.018302,-0.875972,9.94824,-0.00202879,-0.999756,0.0220042,1.26676,0,0,0,1],"included":true,"visible":[false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,true,false,false,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,true,false,false,false,false,false],"height":1.258770950284268}]
--------------------------------------------------------------------------------
/simulator/connectivity/README.md:
--------------------------------------------------------------------------------
1 | ## connectivity
2 | Connectivity graphs indicating the navigable paths between viewpoints in each scan.
3 |
4 | Each JSON file contains an array of annotations, one for each viewpoint in the scan. All annotations share the same basic structure:
5 |
6 | ```
7 | {
8 | "image_id": str,
9 | "pose": [float x 16],
10 | "included": boolean,
11 | "visible": [boolean x num_viewpoints],
12 | "unobstructed": [boolean x num_viewpoints],
13 | "height": float
14 | }
15 | ```
16 | - `image_id`: Matterport skybox prefix
17 | - `pose`: 4x4 matrix in row-major order that transforms Matterport skyboxes to global coordinates (z-up). Pose matrices assume the camera is facing skybox image 3.
18 | - `included`: whether viewpoint is included in the simulator. Some overlapping viewpoints are excluded.
19 | - `visible`: indicates other viewpoints that can be seen from this viewpoint.
20 | - `unobstructed`: indicates transitions to other viewpoints that are considered navigable for an agent.
21 | - `height`: estimated height of the viewpoint above the floor. Not required for the simulator.
22 |
23 | Units are in metres.
24 |
25 | `scans.txt` contains a list of all the scan ids in the dataset.
26 |
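The fields above are enough to rebuild a per-scan navigation graph. The snippet below is a minimal sketch, not the repository's own loader (the simulator imports `load_nav_graphs` from `envs_utils` for that); the file path, the `load_connectivity_graph` helper name, and the use of `networkx`/`numpy` are illustrative assumptions. Edges come from `unobstructed`, weighted by the Euclidean distance between viewpoint positions taken from the row-major `pose` matrix.

```
import json

import networkx as nx
import numpy as np


def load_connectivity_graph(path):
    """Build a weighted graph over the included viewpoints of one scan (sketch)."""
    with open(path) as f:
        annotations = json.load(f)

    graph = nx.Graph()
    positions = {}
    for item in annotations:
        if not item["included"]:
            continue
        pose = item["pose"]
        # Row-major 4x4 pose: elements 3, 7 and 11 hold the x, y, z translation.
        positions[item["image_id"]] = np.array([pose[3], pose[7], pose[11]])
        graph.add_node(item["image_id"])

    for item in annotations:
        if not item["included"]:
            continue
        for j, navigable in enumerate(item["unobstructed"]):
            neighbour = annotations[j]
            if not navigable or not neighbour["included"]:
                continue
            a, b = item["image_id"], neighbour["image_id"]
            # Weight navigable transitions by straight-line distance (metres).
            graph.add_edge(a, b,
                           weight=float(np.linalg.norm(positions[a] - positions[b])))
    return graph


# Example usage with one of the scans listed in scans.txt:
g = load_connectivity_graph("simulator/connectivity/GdvgFV5R1Z5_connectivity.json")
print(g.number_of_nodes(), "viewpoints,", g.number_of_edges(), "navigable edges")
```

Skipping viewpoints with `included == false` mirrors the note above that some overlapping viewpoints are excluded from the simulator.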
--------------------------------------------------------------------------------
/simulator/connectivity/YmJkqBEsHnH_connectivity.json:
--------------------------------------------------------------------------------
1 | [{"image_id":"006933a75f764c5485cf284bea0ded0b","pose":[0.210914,-0.00824746,-0.977469,-7.64722,0.977278,0.0232484,0.210677,-2.15553,0.0209873,-0.999695,0.0129646,1.56695,0,0,0,1],"included":true,"visible":[false,false,true,false,true,true,false,true,true,true,false],"unobstructed":[false,false,false,false,false,false,false,true,true,false,false],"height":1.524793092035509},{"image_id":"e4ede0695e4e4a77aae8537abb9f11d3","pose":[-0.0422212,-0.0176246,-0.998952,-0.133122,0.998904,0.0194092,-0.0425613,-0.0184591,0.0201393,-0.999656,0.016787,1.48352,0,0,0,1],"included":true,"visible":[false,false,true,true,false,false,false,false,false,false,false],"unobstructed":[false,false,true,false,false,true,false,false,false,false,false],"height":1.5227398475592409},{"image_id":"d471e89e00be49f49a7ecace814d60bf","pose":[0.426939,-0.00370058,-0.904272,-0.421886,0.904055,0.0239963,0.426739,-2.12366,0.0201203,-0.999705,0.0135916,1.49477,0,0,0,1],"included":true,"visible":[true,true,false,true,true,true,false,true,true,true,false],"unobstructed":[false,true,false,true,false,true,false,false,false,false,false],"height":1.5263900136377955},{"image_id":"b34af02ce9b642ebbd0c7e9e0ba3b553","pose":[0.960272,0.00870611,-0.278924,-0.0905727,0.278755,0.0168277,0.960214,-3.55265,0.0130537,-0.99982,0.0137334,1.49061,0,0,0,1],"included":true,"visible":[true,true,true,false,false,false,false,false,false,false,false],"unobstructed":[false,false,true,false,false,true,false,false,false,false,false],"height":1.5323637229797105},{"image_id":"01c80b5f8fbd4c969ee0bc03f1ec7a6c","pose":[0.359562,-0.0105291,-0.933061,-3.77309,0.932771,0.0313799,0.359097,-2.1838,0.0254987,-0.999452,0.0211054,1.53932,0,0,0,1],"included":true,"visible":[true,false,true,false,false,true,false,true,true,true,false],"unobstructed":[false,false,false,false,false,true,false,true,false,false,false],"height":1.5286629461398107},{"image_id":"82ea5baa30f945fe98f6cad3064af847","pose":[0.0376233,-0.0115611,-0.999224,-2.01669,0.998821,0.0310955,0.0372487,-2.16965,0.030641,-0.999449,0.0127185,1.50807,0,0,0,1],"included":true,"visible":[true,true,true,true,true,false,false,true,true,true,false],"unobstructed":[false,true,true,true,true,false,false,false,false,false,false],"height":1.5253207999550662},{"image_id":"aecbb791f30b452a9236c5a8c7030663","pose":[0.296076,-0.0242641,-0.954855,-13.5955,0.955111,0.0179483,0.2957,-2.22547,0.00996343,-0.999544,0.0284901,1.59272,0,0,0,1],"included":true,"visible":[true,false,true,false,true,true,false,true,true,true,true],"unobstructed":[false,false,false,false,false,false,false,false,false,true,true],"height":1.7557263982456066},{"image_id":"d841f7b710f9470796d55561f8f524db","pose":[0.270437,0.002913,-0.962732,-5.77716,0.962325,0.0284129,0.27041,-2.21321,0.028142,-0.999591,0.00488176,1.55947,0,0,0,1],"included":true,"visible":[true,false,true,false,true,true,false,false,true,true,false],"unobstructed":[true,false,false,false,true,false,false,false,false,false,false],"height":1.5357935019251416},{"image_id":"8e38fdd81c7949db9646968bafbbdcfc","pose":[-0.00277118,-0.0169575,-0.999852,-9.93905,0.999791,0.020127,-0.00311204,-2.17463,0.0201771,-0.999653,0.0168993,1.60592,0,0,0,1],"included":true,"visible":[true,false,true,false,true,true,false,true,false,true,true],"unobstructed":[true,false,false,false,false,false,false,false,false,true,false],"height":1.5208970888736792},{"image_id":"20fd759be0b64fc9aa96d290f0a704ec","pose":[0.227815,0.0117555,-0.973633,-12.1161,0.973367,0.0235263,0.228037,-2.15724,0.025587,-0.999654,-0.0060817
2,1.59969,0,0,0,1],"included":true,"visible":[true,false,true,false,true,true,true,true,true,false,true],"unobstructed":[false,false,false,false,false,false,true,false,true,false,false],"height":1.5261379179165138},{"image_id":"d838acff82244c2da0cf2651e54966cb","pose":[0.310234,-0.0632421,-0.948553,-15.2317,0.950604,0.0313736,0.308813,-2.28133,0.0102298,-0.997504,0.0698525,0.902626,0,0,0,1],"included":true,"visible":[true,false,true,false,true,true,true,true,true,true,false],"unobstructed":[false,false,false,false,false,false,true,false,false,false,false],"height":1.558854711359605}]
--------------------------------------------------------------------------------
/simulator/connectivity/gZ6f7yhEvPG_connectivity.json:
--------------------------------------------------------------------------------
1 | [{"image_id":"80929af5cf234ae38ac3a2a4e60e4342","pose":[0.983395,0.00450812,-0.181418,-2.79247,0.181442,-0.00570068,0.983385,-1.38801,0.00339928,-0.999973,-0.00642298,1.42676,0,0,0,1],"included":true,"visible":[false,true,true,false,false,true,false,false],"unobstructed":[false,true,false,true,false,true,false,false],"height":1.4191402375960298},{"image_id":"ba27da20782d4e1a825f0a133ad84da9","pose":[-0.7605,-0.0115739,-0.649234,-2.38988,0.648885,0.0237502,-0.760515,-0.0538717,0.0242219,-0.999651,-0.0105509,1.4341,0,0,0,1],"included":true,"visible":[true,false,true,true,false,true,false,true],"unobstructed":[true,false,false,false,false,true,false,true],"height":1.424939020658826},{"image_id":"46cecea0b30e4786b673f5e951bf82d4","pose":[0.593129,0.0137361,-0.80499,0.99933,0.804932,0.010707,0.59327,1.17558,0.0167685,-0.999848,-0.00470498,1.41684,0,0,0,1],"included":true,"visible":[false,false,false,true,true,false,true,true],"unobstructed":[false,false,false,true,true,false,true,true],"height":1.4252108727703763},{"image_id":"bda7a9e6d1d94b3aa8ff491beb158f3a","pose":[-0.378592,-0.0208239,0.925329,-0.182918,-0.925433,-0.00820128,-0.37882,-1.72967,0.0154776,-0.999749,-0.0161651,1.42205,0,0,0,1],"included":true,"visible":[true,false,true,false,true,false,true,true],"unobstructed":[true,false,true,false,true,false,false,true],"height":1.42983949725488},{"image_id":"dbb2f8000bc04b3ebcd0a55112786149","pose":[-0.595363,0.00457706,-0.803444,1.10196,0.803383,0.0168543,-0.595222,-1.10724,0.0108174,-0.999847,-0.0137106,1.41536,0,0,0,1],"included":true,"visible":[false,false,true,true,false,false,true,true],"unobstructed":[false,false,true,true,false,false,true,true],"height":1.4186255623107038},{"image_id":"29b20fa80dcd4771974303c1ccd8953f","pose":[0.292738,0.0164579,-0.956051,-2.77306,0.956096,0.0090939,0.292909,1.55377,0.0135152,-0.999823,-0.0130722,1.43367,0,0,0,1],"included":true,"visible":[true,true,true,false,true,false,false,false],"unobstructed":[true,true,false,false,false,false,false,false],"height":1.4237594118402337},{"image_id":"0ee20663dfa34b438d48750ddcd7366c","pose":[-0.75968,-0.0019971,-0.650293,-0.111567,0.650131,0.0201598,-0.759554,1.31337,0.014627,-0.999794,-0.0140156,1.42291,0,0,0,1],"included":true,"visible":[false,false,true,true,true,false,false,true],"unobstructed":[false,false,true,false,true,false,false,true],"height":1.4276556862049736},{"image_id":"47d8a8282c1c4a7fb3eeeacc45e9d959","pose":[-0.0254788,0.00643152,-0.999654,-0.0034508,0.999603,0.0120797,-0.0253995,0.0112371,0.0119124,-0.999906,-0.00673574,1.42388,0,0,0,1],"included":true,"visible":[true,true,true,true,true,false,true,false],"unobstructed":[false,true,true,true,true,false,true,false],"height":1.4268855357216241}]
--------------------------------------------------------------------------------
/simulator/connectivity/pLe4wQe7qrG_connectivity.json:
--------------------------------------------------------------------------------
1 | [{"image_id":"e4c0a4ec08104bf5ada134b123fa53e7","pose":[-0.133089,0.0111501,-0.991041,1.16811,0.991028,0.0137789,-0.132932,-2.20571,0.0121736,-0.999843,-0.0128829,1.54855,0,0,0,1],"included":true,"visible":[false,false,true,false,true,false,true,true,false,true,false,false,true,false,true,false,false,false,false,false,false,true,false,true,true,true,true,true,false,false,true],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,true,false,false,true,false,false,false,false,false,false],"height":1.5280399019555968},{"image_id":"959ea6af304a4339bbc5d97f044d11c3","pose":[0.312992,0.0130519,-0.949666,2.47951,0.948724,0.0422726,0.313263,-2.23387,0.0442338,-0.999021,0.000849325,1.58243,0,0,0,1],"included":true,"visible":[false,false,true,true,false,false,false,false,false,true,false,true,true,false,true,true,false,false,false,false,false,false,true,true,true,true,true,false,true,false,true],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,true,false,false],"height":1.5361363756730164},{"image_id":"ffe0e6835287419c9cfe343e9d613d87","pose":[-0.802259,-0.00971694,-0.596896,5.96539,0.59688,0.00470064,-0.802316,-2.03323,0.0106021,-0.999941,0.00202973,1.57957,0,0,0,1],"included":true,"visible":[false,true,false,false,false,false,true,false,false,false,false,false,true,true,false,true,false,false,false,false,false,true,true,false,false,false,true,false,false,true,true],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,true,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true],"height":1.518586128876891},{"image_id":"47a69295198f4265958b9b1d497c328d","pose":[-0.90497,-0.00981301,-0.42536,2.46799,0.425363,0.00186582,-0.90502,2.04203,0.00967489,-0.99995,0.0024866,1.55214,0,0,0,1],"included":true,"visible":[false,true,false,false,false,true,true,false,true,false,false,true,false,false,false,true,false,false,true,true,true,false,false,false,true,false,false,true,true,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false],"height":1.5121750884423606},{"image_id":"3dfe07714b2f49d88bd4c8749e8bb0b7","pose":[-0.979561,-0.00709476,0.201019,-1.64821,-0.200975,-0.00640329,-0.979575,0.566531,0.0082373,-0.999954,0.00484756,1.56065,0,0,0,1],"included":true,"visible":[true,false,false,false,false,false,true,true,false,false,false,true,false,true,false,true,true,false,false,false,false,true,true,true,true,true,false,true,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,true,false,false,false,false,true,false,false,true,true,false,true,false,false,false],"height":1.5157095354765127},{"image_id":"87407bb6ed614926b91fc3e27eab766e","pose":[0.22909,0.0301697,-0.972937,4.56488,0.973286,0.00848048,0.229435,2.04904,0.0151732,-0.999508,-0.02742,1.5442,0,0,0,1],"included":true,"visible":[false,false,false,true,false,false,true,false,false,true,false,false,false,false,true,true,false,false,true,true,true,false,true,false,false,false,false,false,true,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,f
alse,true,true,false,false,false,false,false,false,false,false,false,false],"height":1.5111934219678684},{"image_id":"530f8e4126b14082a5c4ff6c3f6ae7cd","pose":[-0.172634,-0.00379856,-0.984978,8.51758,0.984978,0.00322887,-0.172647,0.14365,0.00383645,-0.999987,0.0031851,1.4578,0,0,0,1],"included":true,"visible":[false,false,true,false,true,false,false,false,false,false,true,false,false,true,false,false,false,false,false,false,false,false,true,true,false,false,false,false,false,true,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,true,false],"height":1.5362285111230571},{"image_id":"96782d3925ec4088ab224cdc92a4fd6a","pose":[-0.216113,-0.00838211,-0.976332,1.24213,0.976316,0.00844697,-0.216182,2.38931,0.0100594,-0.999929,0.00635911,1.53856,0,0,0,1],"included":true,"visible":[true,false,false,false,true,true,true,false,true,false,false,false,false,true,false,false,true,false,true,true,false,true,true,true,false,false,false,true,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true,false,false,false,false,false,true,false,false,false],"height":1.5135335729735602},{"image_id":"2dcc9c6ca2d44d5080a0a7e7b7fb9c4d","pose":[-0.951188,-0.00996649,-0.308449,-1.21085,0.308409,0.00538007,-0.951238,2.40322,0.0111403,-0.999936,-0.00204269,1.55952,0,0,0,1],"included":true,"visible":[false,false,false,false,false,false,false,true,false,true,false,false,false,false,true,false,true,true,true,false,false,true,false,false,false,true,false,true,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,true,false,true,false,false,false],"height":1.5317176811699114},{"image_id":"0d704acada9041c48621c5d01d775da0","pose":[0.884279,0.0143861,0.466735,-1.34535,-0.466608,-0.0113974,0.88439,-2.3821,0.0180428,-0.999831,-0.00336482,1.52522,0,0,0,1],"included":true,"visible":[true,false,false,false,false,false,false,false,true,false,false,false,false,false,true,false,true,true,false,false,false,true,false,false,true,true,false,false,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,true,false,false,false,false,false],"height":1.5405532836763522},{"image_id":"2cbd295d838b4c51b5590dcf2a37fba0","pose":[0.246342,0.0412581,-0.968304,4.76599,0.96868,0.0216735,0.247362,0.169153,0.0311925,-0.998913,-0.0346258,1.42661,0,0,0,1],"included":true,"visible":[false,false,false,false,false,false,true,false,false,false,false,false,false,true,false,false,false,false,false,false,false,true,true,true,false,false,false,false,true,true,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,true,true,false,false,false,false,false,false,false,true,false,true,false,false,false,false,true,true,false],"height":1.5180090338091925},{"image_id":"6fbd170d8df746b0b10e3801e2dad706","pose":[-0.872353,-0.0000202749,0.488874,3.49156,-0.488854,-0.00892582,-0.872319,0.121306,0.00438157,-0.99996,0.00777758,1.41535,0,0,0,1],"included":true,"visible":[false,true,false,false,true,false,false,false,false,false,false,false,false,false,false,true,false,true,false,false,false,true,false,false,false,true,false,false,true,false
,false],"unobstructed":[false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false],"height":1.5371204380160495},{"image_id":"31d308fee8284a168c28e238cf814363","pose":[0.998122,0.0164352,-0.0590029,6.9369,0.0592246,-0.0133283,0.998155,-2.13031,0.0156188,-0.999776,-0.0142757,1.58199,0,0,0,1],"included":true,"visible":[false,true,true,false,false,false,false,false,false,false,true,false,false,true,false,true,false,false,true,false,false,true,true,true,false,false,true,false,false,true,true],"unobstructed":[false,false,true,false,false,false,false,false,false,false,true,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false],"height":1.5115252320863801},{"image_id":"789faffd87b949fd9ed7e6df4fadc2f1","pose":[0.998352,0.0156401,-0.0551931,6.89589,0.0551612,0.00248225,0.998474,-1.07864,0.0157535,-0.999874,0.00161644,1.58253,0,0,0,1],"included":true,"visible":[false,false,true,false,true,false,true,false,false,false,true,false,true,false,false,true,false,false,true,false,false,true,true,true,false,false,false,false,false,true,false],"unobstructed":[false,false,true,false,false,false,true,false,false,false,true,false,true,false,false,false,false,false,false,false,false,false,true,true,false,false,false,false,false,true,false],"height":1.5156362905724483},{"image_id":"a26b0e83785f45d484e5f9b83fdb4df3","pose":[0.784717,-0.00024993,0.619854,-0.356288,-0.619842,-0.00640294,0.7847,-1.3696,0.00377304,-0.999979,-0.0051784,1.5663,0,0,0,1],"included":true,"visible":[true,false,true,false,true,false,false,false,false,true,false,false,true,false,false,true,true,true,false,false,false,true,false,false,true,true,false,true,false,false,false],"unobstructed":[true,false,false,false,true,false,false,false,false,true,false,false,false,false,false,true,true,true,false,false,false,true,false,false,true,true,false,false,false,false,false],"height":1.5217725369665362},{"image_id":"df0b69b34d04453691b72a6c16923756","pose":[0.00951654,-0.00498874,-0.999942,2.41189,0.999919,0.00833186,0.00947506,0.0914117,0.00828438,-0.999952,0.00506864,1.42153,0,0,0,1],"included":true,"visible":[false,true,false,true,true,false,false,false,false,false,true,true,false,true,true,false,true,true,false,false,false,true,false,false,false,true,false,false,true,false,false],"unobstructed":[false,false,false,true,false,false,false,false,false,false,false,true,false,false,true,false,true,false,false,false,false,true,false,false,false,false,false,false,true,false,false],"height":1.5270023190896223},{"image_id":"d7d0e431bbfa40429a561060150f24cb","pose":[0.999351,0.0057182,0.0355512,-0.337565,-0.0355828,0.00559738,0.999351,1.14528,0.00551577,-0.999968,0.00579823,1.55634,0,0,0,1],"included":true,"visible":[false,false,false,false,true,false,true,true,true,false,false,false,false,true,true,true,false,true,false,false,false,true,true,false,true,false,false,true,false,false,false],"unobstructed":[false,false,false,false,true,false,false,true,true,false,false,false,false,false,true,true,false,true,false,false,false,true,false,false,false,false,false,true,false,false,false],"height":1.5126864275679581},{"image_id":"8f17854feb134826ae42e16b303e7445","pose":[-0.04737,0.0249555,-0.998565,-0.00382618,0.998875,0.00294013,-0.0473109,-0.017549,0.00175551,-0.999684,-0.0250657,1.55087,0,0,0,1],"included":true,"visible":[false,false,false,false,false,false,false,false,true,true,fal
se,true,false,true,true,true,true,false,false,false,false,true,false,false,true,false,false,true,false,false,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,true,false,false,false,false,true,false,false,false,false,false,true,false,false,false],"height":1.5136058544662168},{"image_id":"d0584db5d0ba41ee955f6c91195afcb3","pose":[-0.0387735,-0.000627238,0.999248,6.85886,-0.999187,-0.0109357,-0.0387783,2.09848,0.0109521,-0.99994,-0.000201698,1.56982,0,0,0,1],"included":true,"visible":[false,false,false,true,false,true,false,false,false,false,false,false,true,true,true,true,false,false,false,true,true,true,true,true,false,false,false,false,false,true,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false,true,false],"height":1.5123722877852799},{"image_id":"87491cd48b094270a2a1aa682b8a770c","pose":[0.995378,0.0106665,0.0954335,5.60063,-0.0953334,-0.00948957,0.9954,2.17887,0.0115233,-0.999898,-0.00842783,1.55259,0,0,0,1],"included":true,"visible":[false,false,false,true,false,true,true,false,false,false,true,true,true,true,false,true,false,false,true,false,true,true,true,true,false,false,false,false,true,false,false],"unobstructed":[false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,false],"height":1.5096271733017124},{"image_id":"8a65d3586fed4c5f9e0f28fc184b3ff2","pose":[0.999328,0.0243579,-0.0273564,3.25097,0.0277536,-0.016113,0.999485,2.12641,0.0239048,-0.999573,-0.0167772,1.55627,0,0,0,1],"included":true,"visible":[false,false,false,true,false,true,true,false,true,false,true,true,true,true,false,false,false,false,true,true,false,false,false,true,false,false,false,false,false,false,false],"unobstructed":[false,false,false,true,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false],"height":1.5216447032258948},{"image_id":"eb464984cc4847d2a61eab27e3e31e51","pose":[0.317487,0.0187868,-0.948076,1.37215,0.94826,-0.0045702,0.317459,0.120026,0.0016314,-0.999813,-0.0192648,1.55431,0,0,0,1],"included":true,"visible":[true,false,false,false,true,false,false,true,true,true,false,true,false,true,true,true,true,true,false,false,false,false,false,false,true,true,false,false,false,false,false],"unobstructed":[true,false,false,false,true,false,false,true,false,false,true,false,false,false,true,true,true,true,false,false,false,false,false,false,true,true,false,false,false,false,false],"height":1.5187432392237161},{"image_id":"ce103547e620457f935a63050cea57b3","pose":[-0.926095,-0.0151941,-0.376983,7.37065,0.376978,0.00327303,-0.926216,0.160002,0.0153072,-0.999879,0.00269771,1.43016,0,0,0,1],"included":true,"visible":[false,false,true,false,true,false,true,false,false,false,true,false,true,true,false,false,false,false,true,false,false,false,false,true,false,true,false,false,false,true,false],"unobstructed":[false,false,false,false,false,false,true,false,false,false,false,false,false,true,false,false,false,false,false,false,false,false,false,true,false,false,false,false,false,true,false],"height":1.5228214121764414},{"image_id":"fa48c6f958304aa8a8f765a72fe7e8d5","pose":[-0.994837,-0.00721806,0.101218,6.07693,-0.101252,0.00455002,-0.99485,0.0491342,0.00672061,-0.999963,-0.00525
636,1.42403,0,0,0,1],"included":true,"visible":[false,false,false,false,true,false,true,false,false,false,true,false,true,true,false,false,false,false,false,false,false,false,true,false,false,true,false,false,false,true,false],"unobstructed":[false,false,false,false,false,false,false,false,false,false,true,false,true,true,false,false,false,false,true,false,false,false,true,false,false,false,false,false,false,true,false],"height":1.520425902170783},{"image_id":"50be95bc6efb466c90867d52cf32ba3f","pose":[0.803639,0.00102907,-0.595115,-0.280264,0.595001,0.0182495,0.803517,-2.40583,0.0116877,-0.999833,0.0140547,1.54308,0,0,0,1],"included":true,"visible":[true,false,true,false,true,false,false,false,false,true,false,false,true,false,true,false,true,true,false,false,false,true,true,false,false,true,false,true,false,false,false],"unobstructed":[true,false,false,false,true,false,false,false,false,true,false,false,false,false,true,false,false,false,false,false,false,true,false,false,false,true,false,false,false,false,false],"height":1.5259856691595353},{"image_id":"91d1554c155e4185a8c69636d47fd58d","pose":[0.7634,0.00593063,0.645898,-1.49105,-0.645812,-0.0117048,0.763406,-0.563949,0.0120878,-0.999914,-0.00510434,1.56479,0,0,0,1],"included":true,"visible":[true,false,false,false,true,false,true,true,true,true,false,true,false,false,true,true,false,false,false,false,false,true,true,true,true,false,false,true,false,false,false],"unobstructed":[false,false,false,false,true,false,false,false,true,true,false,false,false,false,true,false,false,false,false,false,false,true,false,false,true,false,false,true,false,false,false],"height":1.5123581928141085},{"image_id":"5d4349e09ada47b0aa8b20a0d22c54ca","pose":[0.0797542,0.0285043,-0.996407,3.62156,0.996744,0.00951931,0.080054,-2.10242,0.0117672,-0.999548,-0.0276513,1.56537,0,0,0,1],"included":true,"visible":[false,true,true,false,false,false,false,false,false,true,true,true,true,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false,true,false,true],"unobstructed":[false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,true],"height":1.5223704869964667},{"image_id":"298e09e5e1144e7b9762747370ca68a5","pose":[0.31306,-0.00832259,-0.949696,0.0361493,0.949732,0.00181293,0.313056,2.42577,-0.000883427,-0.999963,0.0084728,1.55565,0,0,0,1],"included":true,"visible":[true,false,false,false,true,false,true,true,true,false,false,false,false,true,true,false,true,true,true,true,false,false,false,true,true,true,false,false,false,false,false],"unobstructed":[false,false,false,false,true,false,false,true,true,false,false,false,false,false,false,false,true,true,false,false,false,false,false,false,false,true,false,false,false,false,false],"height":1.5224640014863746},{"image_id":"f8e13e216dd6477ea05e694e2f1478d9","pose":[0.998766,0.0109404,-0.0484187,2.48582,0.0482994,0.0109393,0.998773,-1.19789,0.0114569,-0.99988,0.0103984,1.57265,0,0,0,1],"included":true,"visible":[false,true,false,true,true,false,true,false,true,true,true,true,false,false,false,true,true,true,false,false,false,false,true,true,true,false,true,false,false,false,true],"unobstructed":[false,true,false,false,false,false,false,false,false,false,true,true,false,false,false,true,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false],"height":1.5206684141424807},{"image_id":"e5f7cab8517b47399eda8866f0e30ab3","pose":[-0.660778,-0.
00608519,-0.750556,7.08848,0.750578,-0.00299603,-0.660773,1.44662,0.00177251,-0.999977,0.00654814,1.57334,0,0,0,1],"included":true,"visible":[false,false,false,false,false,true,true,false,false,false,true,false,true,true,false,false,false,false,true,true,false,false,true,true,false,true,false,false,true,false,false],"unobstructed":[false,false,false,false,false,false,true,false,false,false,true,false,false,true,false,false,false,false,true,true,false,false,true,true,false,false,false,false,false,false,false],"height":1.5050461478205863},{"image_id":"a924a5855b954d68b26ebe82ab61c71d","pose":[-0.120428,-0.000846936,-0.992721,4.79789,0.992705,0.00559062,-0.12043,-2.05172,0.0056522,-0.999984,0.000168504,1.57612,0,0,0,1],"included":true,"visible":[false,true,true,false,false,false,true,false,true,false,true,false,true,false,false,false,true,false,false,false,false,true,true,true,false,false,true,false,false,true,false],"unobstructed":[false,false,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,false,false,false,false],"height":1.5244946264278192}]
--------------------------------------------------------------------------------
/simulator/connectivity/scans.txt:
--------------------------------------------------------------------------------
1 | 17DRP5sb8fy
2 | 1LXtFkjw3qL
3 | 1pXnuDYAj8r
4 | 29hnd4uzFmX
5 | 2azQ1b91cZZ
6 | 2n8kARJN3HM
7 | 2t7WUuJeko7
8 | 5LpN3gDmAk7
9 | 5q7pvUzZiYa
10 | 5ZKStnWn8Zo
11 | 759xd9YjKW5
12 | 7y3sRwLe3Va
13 | 8194nk5LbLH
14 | 82sE5b5pLXE
15 | 8WUmhLawc2A
16 | aayBHfsNo7d
17 | ac26ZMwG7aT
18 | ARNzJeq3xxb
19 | B6ByNegPMKs
20 | b8cTxDM8gDG
21 | cV4RVeZvu5T
22 | D7G3Y4RVNrH
23 | D7N2EKCX4Sj
24 | dhjEzFoUFzH
25 | E9uDoFAP3SH
26 | e9zR4mvMWw7
27 | EDJbREhghzL
28 | EU6Fwq7SyZv
29 | fzynW3qQPVF
30 | GdvgFV5R1Z5
31 | gTV8FGcVJC9
32 | gxdoqLR6rwA
33 | gYvKGZ5eRqb
34 | gZ6f7yhEvPG
35 | HxpKQynjfin
36 | i5noydFURQK
37 | JeFG25nYj2p
38 | JF19kD82Mey
39 | jh4fc5c5qoQ
40 | JmbYfDe2QKZ
41 | jtcxE69GiFV
42 | kEZ7cmS4wCh
43 | mJXqzFtmKg4
44 | oLBMNvg9in8
45 | p5wJjkQkbXX
46 | pa4otMbVnkk
47 | pLe4wQe7qrG
48 | Pm6F8kyY3z2
49 | pRbA3pwrgk9
50 | PuKPg4mmafe
51 | PX4nDJXEHrG
52 | q9vSo1VnCiC
53 | qoiz87JEwZ2
54 | QUCTc6BB5sX
55 | r1Q1Z4BcV1o
56 | r47D5H71a5s
57 | rPc6DW4iMge
58 | RPmz2sHmrrY
59 | rqfALeAoiTq
60 | s8pcmisQ38h
61 | S9hNv5qa7GM
62 | sKLMLpTHeUy
63 | SN83YJsR3w2
64 | sT4fr6TAbpF
65 | TbHJrupSAjP
66 | ULsKaCPVFJR
67 | uNb9QFRL6hY
68 | ur6pFq6Qu1A
69 | UwV83HsGsw3
70 | Uxmj2M2itWa
71 | V2XKFyX4ASd
72 | VFuaQ6m2Qom
73 | VLzqgDo317F
74 | Vt2qJdWjCF2
75 | VVfe2KiqLaN
76 | Vvot9Ly1tCj
77 | vyrNrziPKCB
78 | VzqfbhrpDEA
79 | wc2JMjhGNzB
80 | WYY7iVyf5p8
81 | X7HyMhZNoso
82 | x8F5xyUWy9e
83 | XcA2TqTSSAj
84 | YFuZgdQ5vWj
85 | YmJkqBEsHnH
86 | yqstnuAEVhm
87 | YVUC4YcDtcY
88 | Z6MFQCViBuw
89 | ZMojNkEp431
90 | zsNo4HB9uLZ
91 |
--------------------------------------------------------------------------------
/simulator/envs/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | sys.path.append('simulator/envs')
4 |
--------------------------------------------------------------------------------
/simulator/envs/env.py:
--------------------------------------------------------------------------------
1 | ''' Batched Room-to-Room navigation environment '''
2 |
3 | import numpy as np
4 | import json
5 | import networkx as nx
6 | import os
7 | import torch
8 |
9 | from collections import namedtuple
10 | from envs_utils import load_nav_graphs, structured_map
11 | from model.cuda import try_cuda
12 |
13 | ANGLE_INC = np.pi / 6.
14 | WorldState = namedtuple(
15 | "WorldState",
16 | ["scan_id", "viewpoint_id", "view_index", "heading", "elevation"]
17 | )
18 |
19 |
20 | class EnvBatch():
21 | ''' A simple wrapper for a batch of MatterSim environments,
22 |         using discretized viewpoints and pretrained features;
23 |         an adjacency dictionary replaces the MatterSim simulator.
24 | '''
25 |
26 | def __init__(self, adj_dict=None):
27 | self.adj_dict = adj_dict
28 | assert adj_dict is not None, "Error! No adjacency dictionary!"
29 |
30 | def get_start_state(self, scan_ids, viewpoint_ids, headings):
31 | def f(scan_id, viewpoint_id, heading):
32 | elevation = 0
33 | view_index = (12 * round(elevation / ANGLE_INC + 1)
34 | + round(heading / ANGLE_INC) % 12)
35 | return WorldState(scan_id=scan_id,
36 | viewpoint_id=viewpoint_id,
37 | view_index=view_index,
38 | heading=heading,
39 | elevation=elevation)
40 |
41 | return structured_map(f, scan_ids, viewpoint_ids, headings)
42 |
43 | def get_adjs(self, world_states):
44 | def f(world_state):
45 | query = '_'.join([world_state.scan_id,
46 | world_state.viewpoint_id,
47 | str(world_state.view_index)])
48 | return self.adj_dict[query]
49 |
50 | return structured_map(f, world_states)
51 |
52 | def make_actions(self, world_states, actions, attrs):
53 | def f(world_state, action, loc_attrs):
54 | if action == 0:
55 | return world_state
56 | else:
57 | loc_attr = loc_attrs[action]
58 | return WorldState(scan_id=world_state.scan_id,
59 | viewpoint_id=loc_attr['nextViewpointId'],
60 | view_index=loc_attr['absViewIndex'],
61 | heading=(loc_attr['absViewIndex'] % 12) * ANGLE_INC,
62 | elevation=(loc_attr['absViewIndex'] // 12 - 1)
63 | * ANGLE_INC)
64 |
65 | return structured_map(f, world_states, actions, attrs)
66 |
67 |
68 | class RoomEnv():
69 | ''' Implements the R2R (R4R, R6R, etc.) navigation task,
70 | using discretized viewpoints and pretrained features.
71 | '''
72 |
73 | @staticmethod
74 | def load_adj_feature(adj_list_file):
75 | with open(adj_list_file, 'r') as f:
76 | adj_dict = json.load(f)
77 | return adj_dict
78 |
79 | @staticmethod
80 | def load_graphs():
81 | ''' Load connectivity graph for each scan. '''
82 | scans = []
83 | for file in os.listdir("simulator/connectivity"):
84 | if file.endswith(".json"):
85 | scans.append(file.split('_')[0])
86 | print('Loading navigation graphs for %d scans' % len(scans))
87 | graphs = load_nav_graphs(scans)
88 | paths = {}
89 | matrix = {}
90 | states_map = {}
91 | distances = {}
92 | for scan, G in graphs.items(): # compute all shortest paths
93 | paths[scan] = dict(nx.all_pairs_dijkstra_path(G))
94 | matrix[scan] = nx.to_numpy_matrix(G)
95 | states_map[scan] = list(G.nodes)
96 | distances[scan] = dict(nx.all_pairs_dijkstra_path_length(G))
97 | return paths, states_map, distances
98 |
99 | @staticmethod
100 | def make_state_embeddings(args, states_map, image_features_list):
101 | state_embedding = {}
102 | for scan, state_list in states_map.items():
103 | embedding = np.zeros((len(state_list), args.num_views,
104 | args.mean_pooled_dim))
105 | for i, state in enumerate(state_list):
106 | fake_state = {'scan_id': scan,
107 | 'viewpoint_id': state}
108 | feature = [featurizer.get_features(fake_state)
109 | for featurizer in image_features_list][0]
110 | embedding[i] = feature
111 | state_embedding[scan] = torch.from_numpy(embedding).float()
112 | return state_embedding
113 |
114 | @staticmethod
115 | def build_viewpoint_loc_embedding(args, view_index):
116 | """
117 | Position embedding: heading 64D + elevation 64D
118 | 1) heading: [sin(heading) for _ in range(1, 33)] +
119 | [cos(heading) for _ in range(1, 33)]
120 | 2) elevation: [sin(elevation) for _ in range(1, 33)] +
121 | [cos(elevation) for _ in range(1, 33)]
122 | """
123 | embedding = np.zeros((args.num_views, 128), np.float32)
124 | for abs_view_index in range(args.num_views):
125 | rel_view_index = (abs_view_index - view_index) % 12 \
126 | + (abs_view_index // 12) * 12
127 | rel_heading = (rel_view_index % 12) * ANGLE_INC
128 | rel_elevation = (rel_view_index // 12 - 1) * ANGLE_INC
129 | embedding[abs_view_index, 0:32] = np.sin(rel_heading)
130 | embedding[abs_view_index, 32:64] = np.cos(rel_heading)
131 | embedding[abs_view_index, 64:96] = np.sin(rel_elevation)
132 | embedding[abs_view_index, 96:] = np.cos(rel_elevation)
133 | return torch.from_numpy(embedding).float()
134 |
135 | def __init__(self, args, paths, states_map, distances, state_embedding,
136 | loc_embeddings, adj_dict):
137 | self.env = EnvBatch(adj_dict=adj_dict)
138 | self.margin = 3.0
139 | self.paths = paths
140 | self.states_map = states_map
141 | self.distances = distances
142 | self.state_embedding = state_embedding
143 | self.loc_embeddings = loc_embeddings
144 | self.padding_action = try_cuda(torch.zeros(args.action_embed_size))
145 | self.padding_feature = try_cuda(torch.zeros(args.num_views,
146 | args.action_embed_size))
147 |     self.shrink = 10  # scale distances down by a factor of 10 (used in get_dis)
148 |
149 | def _build_action_embedding(self, adj_loc_list, feature):
150 | feature_adj = feature[[adj_dict['absViewIndex']
151 | for adj_dict in adj_loc_list]]
152 | feature_adj[0] = 0
153 | embedding = np.zeros((len(adj_loc_list), 128), np.float32)
154 | for a, adj_dict in enumerate(adj_loc_list):
155 | if a == 0:
156 | continue
157 | else:
158 | rel_heading = adj_dict['rel_heading']
159 | rel_elevation = adj_dict['rel_elevation']
160 | embedding[a][0:32] = np.sin(rel_heading)
161 | embedding[a][32:64] = np.cos(rel_heading)
162 | embedding[a][64:96] = np.sin(rel_elevation)
163 | embedding[a][96:] = np.cos(rel_elevation)
164 | angle_embed = torch.from_numpy(embedding).float()
165 | return try_cuda(torch.cat((feature_adj, angle_embed), dim=-1))
166 |
167 | def _build_feature_embedding(self, view_index, feature):
168 | angle_embed = self.loc_embeddings[view_index]
169 | return try_cuda(torch.cat((feature, angle_embed), dim=-1))
170 |
171 | def _shortest_path_action(self, state, adj_loc_list, goal_id):
172 | ''' Determine next action on the shortest path to goal, for supervised training. '''
173 | if state.viewpoint_id == goal_id:
174 | return 0
175 | for n_a, loc_attr in enumerate(adj_loc_list):
176 | if loc_attr['nextViewpointId'] == goal_id:
177 | return n_a
178 | path = self.paths[state.scan_id][state.viewpoint_id][goal_id]
179 | next_viewpoint_id = path[1]
180 | for n_a, loc_attr in enumerate(adj_loc_list):
181 | if loc_attr['nextViewpointId'] == next_viewpoint_id:
182 | return n_a
183 |
184 | # Next viewpoint_id not found! This should not happen!
185 | print('adj_loc_list:', adj_loc_list)
186 | print('next_viewpoint_id:', next_viewpoint_id)
187 | print('longId:', '{}_{}'.format(state.scan_id, state.viewpoint_id))
188 | raise Exception('Error: next_viewpoint_id not in adj_loc_list')
189 |
190 | def _observe(self, world_states, include_feature=True,
191 | include_teacher=True, step=None):
192 | """
193 | Return the observations of a batch of states
194 |     :param world_states: a list of WorldState namedtuples, one entry
195 |       per item in the current mini-batch
196 | :param include_feature: whether or not to return the pretrained features
197 | :param include_teacher: whether or not to return a teacher viewpoint and
198 | teacher action (for supervision)
199 | :param step: step number in the gold trajectory
200 | :return: a list of observations, each is a dictionary
201 | """
202 | obs = []
203 | for i, adj_loc_list in enumerate(self.env.get_adjs(world_states)):
204 | item = self.batch[i]
205 | state = self.world_states[i]
206 | ob = {
207 | 'scan': state.scan_id,
208 | 'viewpoint': state.viewpoint_id,
209 | 'view_index': state.view_index,
210 | 'heading': state.heading,
211 | 'elevation': state.elevation,
212 | 'adj_loc_list': adj_loc_list,
213 | 'instr_id': item['instr_id']
214 | }
215 | if include_feature:
216 | idx = self.states_map[state.scan_id].index(state.viewpoint_id)
217 | feature = self.state_embedding[state.scan_id][idx]
218 | feature_with_loc = self._build_feature_embedding(state.view_index,
219 | feature)
220 | action_embedding = self._build_action_embedding(adj_loc_list, feature)
221 | ob['feature'] = [feature_with_loc]
222 | ob['action_embedding'] = action_embedding
223 | if include_teacher:
224 | ob['goal'] = item['path'][-1]
225 | if step is not None and (step + 1) < len(item['path']):
226 | ob['teacher'] = item['path'][step + 1]
227 | else:
228 | ob['teacher'] = item['path'][-1]
229 | ob['teacher_action'] = self._shortest_path_action(state, adj_loc_list,
230 | ob['teacher'])
231 | obs.append(ob)
232 | return obs
233 |
234 | def reset(self, next_batch, step=None):
235 | ''' Load a new mini-batch and return the initial observation'''
236 | self.batch = next_batch
237 | scan_ids = [item['scan'] for item in next_batch]
238 | viewpoint_ids = [item['path'][0] for item in next_batch]
239 | headings = [item['heading'] for item in next_batch]
240 | self.world_states = self.env.get_start_state(scan_ids, viewpoint_ids,
241 | headings)
242 | obs = self._observe(self.world_states, step=step)
243 | return obs
244 |
245 | def step(self, obs, actions, step=None):
246 | ''' Take one step from the current state
247 | :param obs: last observations
248 | :param actions: current actions
249 | :param step: step information for teacher action supervision
250 | :return: current observations, and "done" (finish or not)
251 | '''
252 | attrs = [ob['adj_loc_list'] for ob in obs]
253 | self.world_states = self.env.make_actions(self.world_states, actions,
254 | attrs)
255 | obs = self._observe(self.world_states, step=step)
256 | done = (np.array(actions) == 0).astype(np.uint8)
257 | return obs, done
258 |
259 | def _paths_to_goals(self, obs, max_steps):
260 | all_obs = [[ob] for ob in obs]
261 | all_actions = [[] for _ in obs]
262 | ended = np.zeros(len(obs))
263 | for t in range(max_steps):
264 | actions = [ob['teacher_action'] for ob in obs]
265 | for i, a in enumerate(actions):
266 | if not ended[i]:
267 | all_actions[i].append(a)
268 | obs, ended = self.step(obs, actions, step=t + 1)
269 | for i, ob in enumerate(obs):
270 | if not ended[i] and t < max_steps - 1:
271 | all_obs[i].append(ob)
272 | if ended.all():
273 | break
274 | return all_obs, all_actions
275 |
276 | def gold_obs_actions_and_instructions(self, batch, max_steps=100):
277 | obs = self.reset(batch, step=0)
278 | path_obs, path_actions = self._paths_to_goals(obs, max_steps)
279 | encoded_instructions = [item['instr_encoding'] for item in batch]
280 | return path_obs, path_actions, encoded_instructions
281 |
282 | def length(self, scan, nodes):
283 | return float(np.sum([self.distances[scan][edge[0]][edge[1]]
284 | for edge in zip(nodes[:-1], nodes[1:])]))
285 |
286 | def get_mix(self, scan, prediction, reference):
287 | success = self.distances[scan][prediction[-1]][reference[-1]] < self.margin
288 | pad = [0] * (len(prediction) - 1)
289 | final = self.ndtw(scan, prediction, reference) * success \
290 | + self.cls(scan, prediction, reference)
291 | return pad + [final]
292 |
293 | def get_ndtw(self, scan, prediction, reference):
294 | success = self.distances[scan][prediction[-1]][reference[-1]] < self.margin
295 | pad = [0] * (len(prediction) - 2)
296 | return pad + [self.ndtw(scan, prediction, reference) + success]
297 |
298 | def ndtw(self, scan, prediction, reference):
299 | dtw_matrix = np.inf * np.ones((len(prediction) + 1, len(reference) + 1))
300 | dtw_matrix[0][0] = 0
301 | for i in range(1, len(prediction) + 1):
302 | for j in range(1, len(reference) + 1):
303 | best_previous_cost = min(dtw_matrix[i - 1][j],
304 | dtw_matrix[i][j - 1],
305 | dtw_matrix[i - 1][j - 1])
306 | cost = self.distances[scan][prediction[i - 1]][reference[j - 1]]
307 | dtw_matrix[i][j] = cost + best_previous_cost
308 | dtw = dtw_matrix[len(prediction)][len(reference)]
309 | ndtw = np.exp(-dtw / (self.margin * len(reference)))
310 | return ndtw
311 |
312 | def get_cls(self, scan, prediction, reference):
313 | success = self.distances[scan][reference[-1]][prediction[-1]] < self.margin
314 | pad = [0] * (len(prediction) - 2)
315 | return pad + [self.cls(scan, prediction, reference) + success]
316 |
317 | def cls(self, scan, prediction, reference):
318 | coverage = np.mean([np.exp(
319 | -np.min([self.distances[scan][u][v] for v in prediction]) / self.margin
320 | ) for u in reference])
321 | expected = coverage * self.length(scan, reference)
322 | score = expected \
323 | / (expected + np.abs(expected - self.length(scan, prediction)))
324 | return coverage * score
325 |
326 | def get_dis(self, scan, prediction, reference):
327 | goal = reference[-1]
328 | success = self.distances[scan][goal][prediction[-1]] < self.margin
329 | dis = [(self.distances[scan][goal][prediction[i]]
330 | - self.distances[scan][goal][prediction[i + 1]]) / self.shrink
331 | for i in range(len(prediction) - 1)]
332 | return dis[:-1] + [success]
333 |
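A minimal standalone sketch of the nDTW recurrence implemented in RoomEnv.ndtw above. The 3-node distance table and node names below are hypothetical, not taken from any scan:

import numpy as np

distances = {'a': {'a': 0.0, 'b': 2.0, 'c': 4.0},
             'b': {'a': 2.0, 'b': 0.0, 'c': 2.0},
             'c': {'a': 4.0, 'b': 2.0, 'c': 0.0}}
margin = 3.0

def ndtw(prediction, reference):
    # dynamic-time-warping cost between the two viewpoint sequences
    dtw = np.full((len(prediction) + 1, len(reference) + 1), np.inf)
    dtw[0][0] = 0.0
    for i in range(1, len(prediction) + 1):
        for j in range(1, len(reference) + 1):
            best = min(dtw[i - 1][j], dtw[i][j - 1], dtw[i - 1][j - 1])
            dtw[i][j] = distances[prediction[i - 1]][reference[j - 1]] + best
    # normalize by the success margin and reference length, as in RoomEnv.ndtw
    return np.exp(-dtw[-1][-1] / (margin * len(reference)))

print(ndtw(['a', 'b', 'c'], ['a', 'b', 'c']))  # 1.0: perfect alignment
print(ndtw(['a', 'a', 'a'], ['a', 'b', 'c']))  # ~0.51: the path never leaves the start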
--------------------------------------------------------------------------------
/simulator/envs/envs_utils.py:
--------------------------------------------------------------------------------
1 | ''' Utils for the environments '''
2 |
3 | import sys
4 | import json
5 | import numpy as np
6 | import networkx as nx
7 | import base64
8 |
9 |
10 | def load_nav_graphs(scans):
11 | ''' Load connectivity graph for each scan '''
12 |
13 | def distance(pose1, pose2):
14 | ''' Euclidean distance between two graph poses '''
15 | return ((pose1['pose'][3] - pose2['pose'][3]) ** 2
16 | + (pose1['pose'][7] - pose2['pose'][7]) ** 2
17 | + (pose1['pose'][11] - pose2['pose'][11]) ** 2) ** 0.5
18 |
19 | graphs = {}
20 | for scan in scans:
21 | with open('simulator/connectivity/%s_connectivity.json' % scan) as f:
22 | G = nx.Graph()
23 | positions = {}
24 | data = json.load(f)
25 | for i, item in enumerate(data):
26 | if item['included']:
27 | for j, conn in enumerate(item['unobstructed']):
28 | if conn and data[j]['included']:
29 | positions[item['image_id']] = np.array([item['pose'][3],
30 | item['pose'][7],
31 | item['pose'][11]])
32 | assert data[j]['unobstructed'][i], 'Graph should be undirected'
33 | G.add_edge(item['image_id'], data[j]['image_id'],
34 | weight=distance(item, data[j]))
35 | nx.set_node_attributes(G, values=positions, name='position')
36 | graphs[scan] = G
37 | return graphs
38 |
39 |
40 | def decode_base64(string):
41 | if sys.version_info[0] == 2:
42 | return base64.decodestring(bytearray(string))
43 | elif sys.version_info[0] == 3:
44 | return base64.decodebytes(bytearray(string, 'utf-8'))
45 | else:
46 | raise ValueError("decode_base64 can't handle python version {}".format(
47 | sys.version_info[0]))
48 |
49 |
50 | def structured_map(function, *args, **kwargs):
51 | nested = kwargs.get('nested', False)
52 | acc = []
53 | for t in zip(*args):
54 | if nested:
55 | mapped = [function(*inner_t) for inner_t in zip(*t)]
56 | else:
57 | mapped = function(*t)
58 | acc.append(mapped)
59 | return acc
60 |
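Expected behavior of structured_map above, illustrated with the operator module; this assumes simulator/envs is on sys.path (as simulator/envs/__init__.py arranges):

from operator import add
from envs_utils import structured_map

print(structured_map(add, [1, 2], [10, 20]))                   # [11, 22]
print(structured_map(add, [[1, 2]], [[10, 20]], nested=True))  # [[11, 22]]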
--------------------------------------------------------------------------------
/simulator/envs/image_feature.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sys
3 | import paths
4 | import csv
5 |
6 | from collections import defaultdict
7 | from envs_utils import decode_base64
8 |
9 | csv.field_size_limit(sys.maxsize)
10 |
11 |
12 | class ImageFeatures(object):
13 | NUM_VIEWS = 36
14 | MEAN_POOLED_DIM = 2048
15 | IMAGE_W = 640
16 | IMAGE_H = 480
17 | VFOV = 60
18 |
19 | @staticmethod
20 | def from_args(args):
21 | for image_feature_type in sorted(args.image_feature_type):
22 | assert image_feature_type == "mean_pooled"
23 | return [MeanPooledImageFeatures(args.image_feature_datasets)]
24 |
25 | @staticmethod
26 | def add_args(args):
27 | args.add_argument("--num_views", type=int,
28 | default=ImageFeatures.NUM_VIEWS)
29 | args.add_argument("--mean_pooled_dim", type=int,
30 | default=ImageFeatures.MEAN_POOLED_DIM)
31 | args.add_argument("--image_feature_type", nargs="+",
32 | default=["mean_pooled"])
33 | args.add_argument("--image_feature_datasets", nargs="+",
34 | default=["imagenet"])
35 |
36 | def get_features(self, state):
37 | raise NotImplementedError("get_features")
38 |
39 |
40 | class MeanPooledImageFeatures(ImageFeatures):
41 | def __init__(self, image_feature_datasets):
42 | image_feature_datasets = sorted(image_feature_datasets)
43 | self.image_feature_datasets = image_feature_datasets
44 | self.mean_pooled_feature_stores = [
45 | paths.MEAN_POOLED_FEATURE_STORE_PATHS[dataset]
46 | for dataset in image_feature_datasets]
47 | self.feature_dim = MeanPooledImageFeatures.MEAN_POOLED_DIM \
48 | * len(image_feature_datasets)
49 | print('Loading image features from %s'
50 | % ', '.join(self.mean_pooled_feature_stores))
51 | tsv_fieldnames = ['scanId', 'viewpointId', 'image_w', 'image_h', 'vfov',
52 | 'features']
53 | self.features = defaultdict(list)
54 | for mpfs in self.mean_pooled_feature_stores:
55 | with open(mpfs, "rt") as tsv_in_file:
56 | reader = csv.DictReader(tsv_in_file, delimiter='\t',
57 | fieldnames=tsv_fieldnames)
58 | for item in reader:
59 | assert int(item['image_h']) == ImageFeatures.IMAGE_H
60 | assert int(item['image_w']) == ImageFeatures.IMAGE_W
61 | assert int(item['vfov']) == ImageFeatures.VFOV
62 | long_id = self._make_id(item['scanId'], item['viewpointId'])
63 | features = np.frombuffer(decode_base64(item['features']),
64 | dtype=np.float32)
65 | features = features.reshape((ImageFeatures.NUM_VIEWS,
66 | ImageFeatures.MEAN_POOLED_DIM))
67 | self.features[long_id].append(features)
68 | assert all(
69 | len(feats) == len(self.mean_pooled_feature_stores)
70 | for feats in self.features.values()
71 | )
72 | self.features = {
73 | long_id: np.concatenate(feats, axis=1)
74 | for long_id, feats in self.features.items()
75 | }
76 |
77 | def _make_id(self, scan_id, viewpoint_id):
78 | return scan_id + '_' + viewpoint_id
79 |
80 | def get_features(self, state):
81 | long_id = self._make_id(state['scan_id'], state['viewpoint_id'])
82 | return self.features[long_id]
83 |
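A rough sketch of the base64 round-trip that MeanPooledImageFeatures relies on when reading the feature TSV; the zero array below is a stand-in for a real ResNet feature row, not actual data:

import base64
import numpy as np

fake_row = np.zeros((36, 2048), dtype=np.float32)   # NUM_VIEWS x MEAN_POOLED_DIM
payload = base64.b64encode(fake_row.tobytes()).decode('ascii')
decoded = np.frombuffer(base64.decodebytes(payload.encode('utf-8')),
                        dtype=np.float32).reshape((36, 2048))
assert np.array_equal(decoded, fake_row)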
--------------------------------------------------------------------------------
/simulator/envs/paths.py:
--------------------------------------------------------------------------------
1 | MEAN_POOLED_FEATURE_STORE_PATHS = {
2 | 'imagenet': 'simulator/resnet_feature/ResNet-152-imagenet.tsv',
3 | }
4 | ADJ_LIST_FILE = 'simulator/total_adj_list.json'
5 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sha-Lab/babywalk/4265340d5e521b59030bc50ef71f01b0d3d3ba63/src/__init__.py
--------------------------------------------------------------------------------
/src/eval_follower.py:
--------------------------------------------------------------------------------
1 | ''' Evaluation of agent trajectories '''
2 |
3 | from collections import defaultdict
4 | import numpy as np
5 | import copy
6 | from collections import namedtuple
7 |
8 | EvalResult = namedtuple("EvalResult", "nav_error, oracle_error, "
9 | "trajectory_steps, trajectory_length, "
10 | "sr, osr, spl, cls, ndtw, sdtw")
11 |
12 |
13 | class FollowerEvaluation(object):
14 | ''' Results submission format:
15 | [{'instr_id': string,
16 | 'trajectory':[viewpoint_id]}] '''
17 |
18 | def __init__(self, env, data):
19 | self.margin = 3.0
20 | self.gt = {}
21 | self.instr_ids = []
22 | for item in data:
23 | if item['path_id'] not in self.gt:
24 | self.gt[item['path_id']] = copy.deepcopy(item)
25 | self.gt[item['path_id']]['instructions'] = [item['instructions']]
26 | else:
27 | self.gt[item['path_id']]['instructions'].append(item['instructions'])
28 | self.instr_ids.append(item['instr_id'])
29 | self.instr_ids = set(self.instr_ids)
30 | self.distances = env.distances
31 | self.env = env
32 |
33 | def _get_nearest(self, scan, goal_id, path):
34 | near_id = path[0]
35 | near_d = self.distances[scan][near_id][goal_id]
36 | for item in path:
37 | d = self.distances[scan][item][goal_id]
38 | if d < near_d:
39 | near_id = item
40 | near_d = d
41 | return near_id
42 |
43 | def _score_item(self, gt, path):
44 | ''' Calculate error based on the final position in trajectory, and also
45 | the closest position (oracle stopping rule). '''
46 | goal = gt['path'][-1]
47 | final_position = path[-1]
48 | nearest_position = self._get_nearest(gt['scan'], goal, path)
49 | dis = self.distances[gt['scan']][path[0]][goal]
50 | nav_error = self.distances[gt['scan']][final_position][goal]
51 | oracle_error = self.distances[gt['scan']][nearest_position][goal]
52 | trajectory_steps = len(path) - 1
53 | trajectory_length = self.env.length(gt['scan'], path)
54 | sr = nav_error < self.margin
55 | osr = oracle_error < self.margin
56 | spl = sr * dis / max(trajectory_length, dis) if dis > 0 else sr
57 | cls = self.env.cls(gt['scan'], path, gt['path'])
58 | ndtw = self.env.ndtw(gt['scan'], path, gt['path'])
59 | sdtw = ndtw * sr
60 | return EvalResult(nav_error=nav_error, oracle_error=oracle_error,
61 | trajectory_steps=trajectory_steps,
62 | trajectory_length=trajectory_length,
63 | sr=sr, osr=osr, spl=spl, cls=cls, ndtw=ndtw, sdtw=sdtw)
64 |
65 | def score_results(self, results, update_results=False):
66 | '''
67 |     Evaluate submitted trajectories on several metrics.
68 | :param results: results should be a dictionary mapping instr_ids to
69 | dictionaries, with each dictionary containing (at least)
70 | a 'trajectory' field
71 | :param update_results: update the result dictionary for saving result files
72 |     :return: a dictionary of averaged metric scores
73 | '''
74 | self.scores = defaultdict(list)
75 | model_scores = []
76 | instr_ids = set(self.instr_ids)
77 |
78 | instr_count = 0
79 | for instr_id, result in results.items():
80 | if instr_id in instr_ids:
81 | instr_count += 1
82 | instr_ids.remove(instr_id)
83 |
84 | gt = self.gt[int(instr_id.split('_')[0])]
85 | eval_result = self._score_item(gt, result['trajectory'])
86 | self.scores['nav_errors'].append(eval_result.nav_error)
87 | self.scores['oracle_errors'].append(eval_result.oracle_error)
88 | self.scores['trajectory_steps'].append(eval_result.trajectory_steps)
89 | self.scores['trajectory_lengths'].append(eval_result.trajectory_length)
90 | self.scores['sr'].append(eval_result.sr)
91 | self.scores['cls'].append(eval_result.cls)
92 | self.scores['osr'].append(eval_result.osr)
93 | self.scores['spl'].append(eval_result.spl)
94 | self.scores['ndtw'].append(eval_result.ndtw)
95 | self.scores['sdtw'].append(eval_result.sdtw)
96 | if 'score' in result:
97 | model_scores.append(result['score'])
98 | if update_results:
99 | result['nav_errors'] = eval_result.nav_error
100 | result['oracle_errors'] = eval_result.oracle_error
101 | result['trajectory_steps'] = eval_result.trajectory_steps
102 | result['trajectory_lengths'] = eval_result.trajectory_length
103 | result['sr'] = eval_result.sr
104 | result['osr'] = eval_result.osr
105 | result['spl'] = eval_result.spl
106 | result['cls'] = eval_result.cls
107 | result['ndtw'] = eval_result.ndtw
108 | result['sdtw'] = eval_result.sdtw
109 | result['expert_trajectory'] = gt['path']
110 | result['distance'] = gt['distance']
111 | result['scan'] = gt['scan']
112 | result['instruction'] = \
113 | gt['instructions'][int(instr_id.split('_')[1])]
114 |
115 | score_summary = {
116 | 'nav_error': np.average(self.scores['nav_errors']),
117 | 'oracle_error': np.average(self.scores['oracle_errors']),
118 | 'steps': np.average(self.scores['trajectory_steps']),
119 | 'lengths': np.average(self.scores['trajectory_lengths']),
120 | 'cls': np.average(self.scores['cls']),
121 | 'sr': float(sum(self.scores['sr']) / len(self.scores['sr'])),
122 | 'osr': float(sum(self.scores['osr']) / len(self.scores['osr'])),
123 | 'spl': float(sum(self.scores['spl']) / len(self.scores['spl'])),
124 | 'ndtw': float(sum(self.scores['ndtw']) / len(self.scores['ndtw'])),
125 | 'sdtw': float(sum(self.scores['sdtw']) / len(self.scores['sdtw'])),
126 | }
127 | if len(model_scores) > 0:
128 | score_summary['model_score'] = np.average(model_scores)
129 | if update_results:
130 | score_summary['sr_std'] = np.std(self.scores['sr'])
131 | score_summary['cls_std'] = np.std(self.scores['cls'])
132 | score_summary['spl_std'] = np.std(self.scores['spl'])
133 |
134 | return score_summary
135 |
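The input format FollowerEvaluation.score_results expects, sketched with hypothetical instruction ids and viewpoint ids (instr_id is '<path_id>_<instruction index>'); evaluator stands for a FollowerEvaluation instance constructed as in src/train_follower.py:

results = {
    '4370_0': {'trajectory': ['vp_start', 'vp_mid', 'vp_goal']},
    '4370_1': {'trajectory': ['vp_start', 'vp_other']},
}
# score_summary = evaluator.score_results(results)
# -> averaged nav_error, oracle_error, sr, osr, spl, cls, ndtw, sdtw, ...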
--------------------------------------------------------------------------------
/src/params.py:
--------------------------------------------------------------------------------
1 | RESULT_DIR = 'follower/results/'
2 | SNAPSHOT_DIR = 'follower/snapshots/'
3 | PLOT_DIR = 'follower/plots/'
4 | SUMMARY_DIR = 'follower/summary/'
5 | FOLLOWER_PATH = None
6 | SPEAKER_PATH = None
7 | ACTION_EMBEDDING_SIZE = 2048 + 128
8 | HIDDEN_SIZE = 512
9 | WEIGHT_DECAY = 0.00005
10 | FEATURE_SIZE = 2048 + 128
11 | LOG_EVERY = 500
12 | SAVE_EVERY = 5000
13 |
14 |
15 | def add_general_args(parser):
16 | # data
17 | parser.add_argument("--split_postfix", type=str, default='',
18 | help="The postfix of datasets, "
19 | "for landmark datasets it should be '_landmark', "
20 | "otherwise it should be ''")
21 | parser.add_argument("--add_augment", action='store_true')
22 | parser.add_argument("--augment_data", type=str,
23 | default='literal_speaker_data_augmentation',
24 | help="The augmentation dataset, "
25 | "only useful if --add_augment is on")
26 | parser.add_argument("--task_name", type=str, default='R2R')
27 |
28 | # learning algorithm
29 | parser.add_argument("--reward", action='store_true',
30 | help="Use RL if on")
31 | parser.add_argument("--curriculum_rl", action='store_true',
32 |                       help="Use curriculum RL if on; requires --reward")
33 | parser.add_argument("--count_curriculum", type=int, default=0,
34 | help="Set the start curriculum")
35 | parser.add_argument("--max_curriculum", type=int, default=4,
36 | help="Set the maximum curriculum")
37 | parser.add_argument("--curriculum_iters", type=int, default=10000,
38 |                       help="Number of iterations between curriculum increments")
39 | parser.add_argument("--learning_method", type=str, default="adam")
40 | parser.add_argument("--feedback_method", type=str, default="sample",
41 | help="Choose from teacher, argmax or sample")
42 | parser.add_argument("--il_mode", type=str, default=None,
43 | help="Choose from None, period_split or landmark_split")
44 |
45 | # learning params
46 | parser.add_argument("--n_iters", type=int, default=20000,
47 | help="Total training iterations")
48 | parser.add_argument("--batch_size", type=int, default=100,
49 |                       help="Choose carefully based on GPU memory; "
50 |                            "use a smaller value at larger curriculum levels")
51 | parser.add_argument("--lr", type=float, default=0.0001)
52 | parser.add_argument("--max_ins_len", type=int, default=100,
53 |                       help="Max instruction length; "
54 |                            "set it larger for long instructions (e.g. R8R)")
55 | parser.add_argument("--max_steps", type=int, default=10,
56 |                       help="Max number of navigation steps; "
57 |                            "set it larger for long trajectories (e.g. R8R)")
58 | parser.add_argument("--beam_size", type=int, default=8,
59 |                       help="Choose carefully based on GPU memory; "
60 |                            "use a smaller value at larger curriculum levels")
61 | parser.add_argument("--action_embed_size", type=int,
62 | default=ACTION_EMBEDDING_SIZE)
63 | parser.add_argument("--feature_size", type=int, default=FEATURE_SIZE)
64 | parser.add_argument("--weight_decay", type=float, default=WEIGHT_DECAY)
65 | parser.add_argument("--hidden_size", type=int, default=HIDDEN_SIZE)
66 |
67 | # network
68 | parser.add_argument("--coground", action='store_true',
69 | help="Use cogrounding decoder if on")
70 | parser.add_argument("--wemb", type=int, default=300,
71 | help="Word embedding size")
72 | parser.add_argument("--dropout", type=float, default=0.5)
73 | parser.add_argument("--reward_type", type=str, default='dis',
74 | help="Choose from dis, cls, dtw and mix")
75 | parser.add_argument("--history", action='store_true',
76 | help="Use memory buffer if on")
77 | parser.add_argument("--exp_forget", type=float, default=0.5,
78 |                       help="Exponential forgetting ratio; "
79 |                            "-1 means use an LSTM memory buffer instead")
80 |
81 | # load model
82 | parser.add_argument("--no_speaker", action='store_true',
83 |                       help="Do not use a speaker to provide internal reward; "
84 |                            "if not set, --speaker_prefix must be provided")
85 | parser.add_argument("--load_opt", action='store_true',
86 | help="When continue training, load previous optimizer")
87 | parser.add_argument("--speaker_prefix", type=str, default=SPEAKER_PATH)
88 | parser.add_argument("--follower_prefix", type=str, default=FOLLOWER_PATH)
89 |
90 | # save and log in training
91 | parser.add_argument("--no_save", action='store_true')
92 | parser.add_argument("--model_name", type=str, default="follower")
93 | parser.add_argument("--result_dir", default=RESULT_DIR)
94 | parser.add_argument("--snapshot_dir", default=SNAPSHOT_DIR)
95 | parser.add_argument("--plot_dir", default=PLOT_DIR)
96 | parser.add_argument("--summary_dir", default=SUMMARY_DIR)
97 | parser.add_argument("--log_every", type=int, default=LOG_EVERY)
98 | parser.add_argument("--save_every", type=int, default=SAVE_EVERY)
99 |
100 | # evaluation
101 | parser.add_argument("--use_test", action='store_true')
102 | parser.add_argument("--one_by_one", action='store_true',
103 | help="Evaluate one long instruction as "
104 | "a sequence of shorter instructions if on")
105 | parser.add_argument("--one_by_one_mode", type=str, default=None,
106 | help="Choose from splitting long instructions as "
107 | "period or landmark")
108 |
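A minimal sketch of how these arguments are consumed; it mirrors make_arg_parser in src/train_follower.py and parses a hypothetical flag combination (run from the repository root so src.params is importable):

import argparse
from src.params import add_general_args

parser = argparse.ArgumentParser()
add_general_args(parser)
args = parser.parse_args(['--task_name', 'R2R', '--coground', '--reward'])
print(args.task_name, args.lr, args.batch_size)  # R2R 0.0001 100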
--------------------------------------------------------------------------------
/src/process_data.py:
--------------------------------------------------------------------------------
1 | import math
2 | import copy
3 | import json
4 | from collections import defaultdict
5 | from src.vocab.vocab_path import TRAIN_VOCAB
6 | from src.vocab.tokenizer import Tokenizer, read_vocab
7 |
8 |
9 | def make_data(args):
10 | # determine splits
11 | if args.use_test:
12 | train_splits = []
13 | val_splits = ['test']
14 | elif args.task_name == 'R2T8':
15 | train_splits = []
16 | val_splits = ['R2R_val_unseen', 'R4R_val_unseen', 'R6R_val_unseen',
17 | 'R8R_val_unseen']
18 | elif args.task_name == 'R2R' or args.task_name == 'R4R':
19 | train_splits = ['train']
20 | val_splits = ['val_seen', 'val_unseen']
21 | else:
22 | train_splits = ['train']
23 | val_splits = ['val_unseen']
24 |
25 | if args.add_augment:
26 | train_splits.append(args.augment_data)
27 | vocab = read_vocab(TRAIN_VOCAB)
28 | tok = Tokenizer(vocab=vocab)
29 |
30 | # get datasets from file
31 | train_data = load_task_datasets(train_splits, args.task_name,
32 | postfix=args.split_postfix,
33 | tokenizer=tok,
34 | one_by_one_mode=args.one_by_one_mode)
35 | val_data = load_task_datasets(val_splits, args.task_name,
36 | postfix=args.split_postfix,
37 | tokenizer=tok,
38 | one_by_one_mode=args.one_by_one_mode)
39 |
40 | # split for training
41 | if len(train_data) > 0:
42 | assert len(train_data['train']) >= args.batch_size, \
43 | "data not enough for one batch, reduce the batch size"
44 | if args.curriculum_rl:
45 | if args.one_by_one_mode == 'period':
46 | train_data = period_split_curriculum(train_data, tok, args.history)
47 | elif args.one_by_one_mode == 'landmark':
48 | train_data = landmark_split_curriculum(train_data, tok, args.history)
49 | else:
50 | raise ValueError("Error! One by one mode is not implemented.")
51 | elif args.il_mode is not None:
52 | if args.il_mode == 'period_split':
53 | train_data, counter = period_split(train_data, tok, 0, args.history)
54 | val_data, _ = period_split(val_data, tok, counter, args.history) \
55 | if not args.one_by_one else (val_data, None)
56 | elif args.il_mode == 'landmark_split':
57 | train_data, counter = landmark_split(train_data, tok, 0, args.history)
58 | val_data, _ = landmark_split(val_data, tok, counter, args.history) \
59 | if not args.one_by_one else (val_data, None)
60 | else:
61 | raise ValueError("Error! Training mode not available.")
62 |
63 | # make it together for evaluator
64 | train_tag = '-'.join(train_splits)
65 | train_data = merge_data(train_data)
66 | all_val_data = merge_data(val_data) if args.one_by_one_mode != 'landmark' \
67 | else merge_data_landmark(val_data)
68 |
69 |   # split each instruction sentence by sentence for one-by-one evaluation
70 | if args.one_by_one:
71 | if args.one_by_one_mode == 'period':
72 | val_data = period_split_curriculum(val_data, tok, args.history,
73 | use_test=args.use_test)
74 | elif args.one_by_one_mode == 'landmark':
75 | val_data = landmark_split_curriculum(val_data, tok, args.history,
76 | use_test=args.use_test)
77 | else:
78 |       raise ValueError(
79 |         "Error! One by one mode is not implemented.")
80 | val_data = {tag: sorted(data, key=lambda x: len(x))
81 | for tag, data in val_data.items()}
82 | return train_data, val_data, all_val_data, vocab, tok, train_tag
83 |
84 |
85 | def merge_data(data):
86 | total_val = []
87 | for tag, d in data.items():
88 | total_val += d
89 | return total_val
90 |
91 |
92 | def merge_data_landmark(data):
93 | total_val = []
94 | for tag, data in data.items():
95 | for d in data:
96 | new_d = dict(d)
97 | new_d['path'] = [d['path'][0][0]]
98 | for i in range(len(d['path'])):
99 | new_d['path'].extend(d['path'][i][1:])
100 | new_d['instructions'] = ' '.join(new_d['instructions'])
101 | total_val.append(new_d)
102 | return total_val
103 |
104 |
105 | def load_dataset(split, task, postfix):
106 | data = []
107 | with open('tasks/%s/data/%s_%s%s.json' % (task, task, split, postfix)) as f:
108 | data += json.load(f)
109 |   print("Loaded dataset %s_%s%s" % (task, split, postfix))
110 | return data
111 |
112 |
113 | def load_task_datasets(splits, task, postfix='', tokenizer=None,
114 | one_by_one_mode=None):
115 | dataset = {}
116 | id_list = defaultdict(lambda: 0)
117 | for split in splits:
118 | data = []
119 | for item in load_dataset(split, task, postfix):
120 | if one_by_one_mode == "landmark":
121 | new_item = dict(item)
122 | new_item['instr_id'] = '%s_%d' \
123 | % (item['path_id'], id_list[item['path_id']])
124 | id_list[item['path_id']] += 1
125 | data.append(new_item)
126 | else:
127 | instructions = item['instructions']
128 | for j, instr in enumerate(instructions):
129 | new_item = dict(item)
130 | new_item['instr_id'] = '%s_%d' % (item['path_id'], j)
131 | new_item['instructions'] = instr
132 | if tokenizer:
133 | new_item['instr_encoding'], new_item[
134 | 'instr_length'] = tokenizer.encode_sentence(instr)
135 | data.append(new_item)
136 | dataset[split] = data
137 | return dataset
138 |
139 |
140 | def add_history_to_data(data, history_heading, history_path, history_instr,
141 | history_instr_encoding):
142 | data['history_heading'] = copy.deepcopy(history_heading)
143 | data['history_path'] = copy.deepcopy(history_path)
144 | data['history_instr'] = copy.deepcopy(history_instr)
145 | data['history_instr_encoding'] = copy.deepcopy(history_instr_encoding)
146 | history_heading.append(data['heading'])
147 | history_path.append(data['path'])
148 | history_instr.append(data['instructions'])
149 | history_instr_encoding.append(data['instr_encoding'])
150 |
151 |
152 | def period_split(datasets, tok, counter, history):
153 | splited = {}
154 | for tag, data in datasets.items():
155 | new_data = []
156 | for d in data:
157 | history_heading, history_path = [], []
158 | history_instr, history_instr_encoding = [], []
159 | ins = d['instructions']
160 | ins_splited = ins.split('.')
161 | if not ins_splited[0]:
162 | ins_splited = ins_splited[1:-1]
163 | else:
164 | ins_splited = ins_splited[:-1]
165 | ratio = 0
166 | last_path_split_point = 1
167 | ins_len = sum([len(ins_sp.split()) for ins_sp in ins_splited])
168 | for i, ins_sp in enumerate(ins_splited):
169 | ratio += len(ins_sp.split()) / ins_len
170 | if ratio > 1:
171 | ratio = 1
172 | path_split_point = math.ceil(len(d['path']) * ratio)
173 |
174 | new_d = copy.deepcopy(d)
175 | new_d['path_id'] = counter
176 | new_d['instr_id'] = str(counter) + '_0'
177 | new_d['path'] = d['path'][last_path_split_point - 1:path_split_point]
178 | new_d['instructions'] = ins_sp + '.'
179 | new_d['instr_encoding'], new_d['instr_length'] = tok.encode_sentence(
180 | ins_sp + '.')
181 | new_d['heading'] = d['headings'][last_path_split_point - 1]
182 | new_d['remain_split'] = len(ins_splited) - i
183 | if history:
184 | add_history_to_data(new_d, history_heading, history_path,
185 | history_instr, history_instr_encoding)
186 |
187 | last_path_split_point = path_split_point
188 | new_data.append(new_d)
189 | counter += 1
190 | assert new_d['path'][-1] == d['path'][-1]
191 | splited[tag] = new_data
192 | return splited, counter
193 |
194 |
195 | def landmark_split(datasets, tok, counter, history):
196 | splited = {}
197 | for tag, data in datasets.items():
198 | new_data = []
199 | for d in data:
200 | history_heading, history_path = [], []
201 | history_instr, history_instr_encoding = [], []
202 | for i, ins in enumerate(d['instructions']):
203 | new_d = copy.deepcopy(d)
204 | new_d['path_id'] = counter
205 | new_d['instr_id'] = str(counter) + '_0'
206 | new_d['path'] = d['path'][i]
207 | new_d['instructions'] = ins
208 | new_d['instr_encoding'], new_d['instr_length'] = tok.encode_sentence(
209 | ins)
210 | new_d['heading'] = float(d['headings'][i])
211 | new_d['remain_split'] = len(d['instructions']) - i
212 | if history:
213 | add_history_to_data(new_d, history_heading, history_path,
214 | history_instr, history_instr_encoding)
215 |
216 | new_data.append(new_d)
217 | counter += 1
218 | splited[tag] = new_data
219 | return splited, counter
220 |
221 |
222 | def period_split_curriculum(datasets, tok, history, use_test=False):
223 | splited = {}
224 | for tag, data in datasets.items():
225 | new_data = []
226 | for d in data:
227 | history_heading, history_path = [], []
228 | history_instr, history_instr_encoding = [], []
229 | new_d_list = []
230 | ins = d['instructions']
231 | ins_splited = ins.split('.')
232 | if not ins_splited[0]:
233 | ins_splited = ins_splited[1:-1]
234 | else:
235 | ins_splited = ins_splited[:-1]
236 | ratio = 0
237 | last_path_split_point = 1
238 | ins_len = sum([len(ins_sp.split()) for ins_sp in ins_splited])
239 | for i, ins_sp in enumerate(ins_splited):
240 | ratio += len(ins_sp.split()) / ins_len
241 | if ratio > 1:
242 | ratio = 1
243 | path_split_point = math.ceil(len(d['path']) * ratio)
244 |
245 | new_d = copy.deepcopy(d)
246 | new_d['instructions'] = ins_sp + '.'
247 | new_d['instr_encoding'], new_d['instr_length'] = \
248 | tok.encode_sentence(ins_sp + '.')
249 | if use_test:
250 | new_d['path'] = d['path'] * 2
251 | new_d['heading'] = float(d['heading'])
252 | else:
253 | new_d['path'] = d['path'][last_path_split_point - 1:path_split_point]
254 | new_d['heading'] = d['headings'][last_path_split_point - 1]
255 | if history:
256 | add_history_to_data(new_d, history_heading, history_path,
257 | history_instr, history_instr_encoding)
258 |
259 | last_path_split_point = path_split_point
260 | new_d_list.append(new_d)
261 | assert new_d['path'][-1] == d['path'][-1]
262 | new_data.append(new_d_list)
263 | splited[tag] = new_data
264 | return splited
265 |
266 |
267 | def landmark_split_curriculum(datasets, tok, history, use_test=False):
268 | splited = {}
269 | for tag, data in datasets.items():
270 | new_data = []
271 | for d in data:
272 | history_heading, history_path = [], []
273 | history_instr, history_instr_encoding = [], []
274 | new_d_list = []
275 | for i, ins in enumerate(d['instructions']):
276 | new_d = copy.deepcopy(d)
277 | new_d['instructions'] = ins
278 | new_d['instr_encoding'], new_d['instr_length'] = \
279 | tok.encode_sentence(ins)
280 | if use_test:
281 | new_d['path'] = d['path'] * 2
282 | new_d['heading'] = float(d['heading'])
283 | else:
284 | new_d['path'] = d['path'][i]
285 | new_d['heading'] = float(d['headings'][i])
286 | if history:
287 | add_history_to_data(new_d, history_heading, history_path,
288 | history_instr, history_instr_encoding)
289 |
290 | new_d_list.append(new_d)
291 | new_data.append(new_d_list)
292 | splited[tag] = new_data
293 | return splited
294 |
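A toy illustration of the proportional split used in period_split and period_split_curriculum above: each sentence claims a share of the path in proportion to its word count. The path length and word counts below are made up:

import math

path_len = 7
sentence_word_counts = [6, 3, 3]
total = sum(sentence_word_counts)
ratio, last_point, segments = 0.0, 1, []
for count in sentence_word_counts:
    ratio = min(1.0, ratio + count / total)
    split_point = math.ceil(path_len * ratio)
    # slice bounds into d['path'], matching path[last_point - 1:split_point]
    segments.append((last_point - 1, split_point))
    last_point = split_point
print(segments)  # [(0, 4), (3, 6), (5, 7)] -- adjacent segments share a viewpoint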
--------------------------------------------------------------------------------
/src/speaker.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 | import numpy as np
4 | import itertools
5 | import torch
6 | import copy
7 | import torch.nn.functional as F
8 | import torch.distributions as D
9 |
10 | from src.vocab.tokenizer import VOCAB_PAD_IDX, VOCAB_EOS_IDX
11 | from src.utils import batch_instructions_from_encoded, \
12 | batch_observations_and_actions
13 | from model.cuda import try_cuda
14 |
15 |
16 | class Seq2SeqSpeaker(object):
17 | def __init__(self, env, results_path, args, encoder, decoder, tok):
18 | self.env = env
19 | self.tok = tok
20 | self.results_path = results_path
21 | self.results = {}
22 | self.encoder = encoder
23 | self.decoder = decoder
24 | self.max_instruction_length = args.max_ins_len
25 |
26 | def write_results(self):
27 | with open(self.results_path, 'w') as f:
28 | json.dump(self.results, f)
29 |
30 | def _score_obs_actions_and_instructions(self, path_obs, path_actions,
31 | encoded_instructions, feedback,
32 | train):
33 | batch_size = len(path_obs)
34 | instr_seq, _, _ = \
35 | batch_instructions_from_encoded(encoded_instructions,
36 | self.max_instruction_length, cut=False)
37 | batched_image_features, batched_action_embeddings, path_mask, seq_len = \
38 | batch_observations_and_actions(path_obs, path_actions,
39 | self.env.padding_feature,
40 | self.env.padding_action)
41 |
42 | ctx = self.encoder(batched_image_features, batched_action_embeddings,
43 | seq_len)
44 | h_t = try_cuda(torch.zeros(batch_size, ctx.size(-1)))
45 | c_t = try_cuda(torch.zeros(batch_size, ctx.size(-1)))
46 | ended = np.array([False] * batch_size)
47 |
48 | outputs = [{
49 | 'instr_id': path_obs[i][0]['instr_id'],
50 | 'word_indices': [],
51 | 'scores': []
52 | } for i in range(batch_size)]
53 |
54 | # Do a sequence rollout and calculate the loss
55 | loss = 0
56 | w_t = try_cuda(torch.from_numpy(
57 | np.full((batch_size, 1), self.tok.vocab_bos_idx,
58 | dtype='int64')).long())
59 |
60 | if train:
61 | w_t = torch.cat([w_t, instr_seq], dim=1)
62 | logits, _, _ = self.decoder(w_t, ctx, path_mask, h_t, c_t)
63 | logits = logits.permute(0, 2, 1).contiguous()
64 | loss = F.cross_entropy(
65 |         input=logits[:, :, :-1],  # drop the last position so inputs align with targets
66 |         target=instr_seq,  # targets exclude the prepended BOS token
67 | ignore_index=VOCAB_PAD_IDX
68 | )
69 | else:
70 | sequence_scores = try_cuda(torch.zeros(batch_size))
71 | for t in range(self.max_instruction_length):
72 | logit, h_t, c_t = self.decoder(w_t.view(-1, 1), ctx, path_mask, h_t,
73 | c_t)
74 | logit = logit.squeeze(1)
75 |
76 | logit[:, VOCAB_PAD_IDX] = -float('inf')
77 | target = instr_seq[:, t].contiguous()
78 |
79 | if torch.isnan(logit).sum():
80 |           print("Error: network produced NaN logits!")
81 |           exit(1)
82 |
83 | # Determine next model inputs
84 | if feedback == 'teacher':
85 | w_t = target
86 | elif feedback == 'argmax':
87 | _, w_t = logit.max(1)
88 | w_t = w_t.detach()
89 | elif feedback == 'sample':
90 | probs = F.softmax(logit, dim=1)
91 | probs[:, VOCAB_PAD_IDX] = 0
92 | m = D.Categorical(probs)
93 | w_t = m.sample()
94 | else:
95 | sys.exit('Invalid feedback option')
96 |
97 | log_probs = F.log_softmax(logit, dim=1)
98 | word_scores = -F.nll_loss(log_probs, w_t, ignore_index=VOCAB_PAD_IDX,
99 | reduction='none')
100 | sequence_scores += word_scores
101 | loss += F.nll_loss(log_probs, target, ignore_index=VOCAB_PAD_IDX)
102 |
103 | for i in range(batch_size):
104 | word_idx = w_t[i].item()
105 | if not ended[i]:
106 | outputs[i]['word_indices'].append(int(word_idx))
107 | outputs[i]['scores'].append(word_scores[i].item())
108 | if word_idx == VOCAB_EOS_IDX:
109 | ended[i] = True
110 |
111 | # Early exit if all ended
112 | if ended.all():
113 | break
114 |
115 | for i, item in enumerate(outputs):
116 | item['score'] = float(sequence_scores[i].item()) / len(
117 | item['word_indices'])
118 | item['words'] = self.tok.decode_sentence(item['word_indices'],
119 | break_on_eos=True, join=False)
120 |
121 | return outputs, loss
122 |
123 | def _rollout(self, batch, train=True):
124 | path_obs, path_actions, encoded_instructions = \
125 | self.env.gold_obs_actions_and_instructions(batch)
126 | outputs, loss = \
127 | self._score_obs_actions_and_instructions(path_obs, path_actions,
128 | encoded_instructions,
129 | self.feedback, train)
130 | return outputs
131 |
132 | def query(self, batch, follower_results, feedback='argmax',
133 | curriculum=False):
134 | self.feedback = feedback
135 | if not curriculum:
136 | next_batch = [copy.deepcopy(b) for b in batch]
137 | else:
138 | next_batch = [copy.deepcopy(b[-1]) for b in batch]
139 | for i, b in enumerate(next_batch):
140 | if 'history_path' in b and len(b['history_path']) > 0:
141 | b['path'] = follower_results[i]['trajectory']
142 | b['heading'] = b['history_heading'][0]
143 | b['instr_encoding'] \
144 | = list(itertools.chain.from_iterable(b['history_instr_encoding'])) \
145 | + list(b['instr_encoding'])
146 | else:
147 | b['path'] = follower_results[i]['trajectory']
148 | with torch.no_grad():
149 | self.encoder.eval()
150 | self.decoder.eval()
151 | results = self._rollout(next_batch, train=False)
152 | return results
153 |
154 | def test(self, batch, feedback='argmax'):
155 | ''' Evaluate once on each instruction in the current environment '''
156 | with torch.no_grad():
157 | self.feedback = feedback
158 | self.encoder.eval()
159 | self.decoder.eval()
160 | self.results = {}
161 |
162 | count = 0
163 | looped = False
164 | while True:
165 | rollout_results = self._rollout(batch[count], train=False)
166 | count += 1
167 |
168 | for result in rollout_results:
169 | if result['instr_id'] in self.results:
170 | looped = True
171 | else:
172 | self.results[result['instr_id']] = result
173 | if looped or len(batch) == count:
174 | break
175 | return self.results
176 |
177 | def _encoder_and_decoder_paths(self, base_path):
178 | return base_path + "_enc", base_path + "_dec"
179 |
180 | def save(self, path):
181 | ''' Snapshot models '''
182 | encoder_path, decoder_path = self._encoder_and_decoder_paths(path)
183 | torch.save(self.encoder.state_dict(), encoder_path)
184 | torch.save(self.decoder.state_dict(), decoder_path)
185 |
186 | def load(self, path, **kwargs):
187 | ''' Loads parameters (but not training state) '''
188 | encoder_path, decoder_path = self._encoder_and_decoder_paths(path)
189 | self.encoder.load_my_state_dict(torch.load(encoder_path, **kwargs))
190 | self.decoder.load_my_state_dict(torch.load(decoder_path, **kwargs))
191 |
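A toy shape check of the teacher-forcing alignment used in the training branch of _score_obs_actions_and_instructions; the batch size, vocabulary size and sequence length here are made up:

import torch
import torch.nn.functional as F

batch, vocab, seq = 2, 100, 5
logits = torch.randn(batch, vocab, seq + 1)      # one position per BOS + word, already permuted
targets = torch.randint(0, vocab, (batch, seq))  # instr_seq, which carries no BOS
loss = F.cross_entropy(logits[:, :, :-1], targets)  # position t predicts word t
print(loss.item())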
--------------------------------------------------------------------------------
/src/train_follower.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import numpy as np
4 | import pandas as pd
5 | import argparse
6 | import math
7 | import sys
8 |
9 | sys.path.append('.')
10 |
11 | from torch import optim
12 | from collections import defaultdict
13 | from tensorboardX import SummaryWriter
14 | from src.vocab.vocab_path import GLOVE_PATH
15 | from src.vocab.tokenizer import VOCAB_PAD_IDX
16 | from src.utils import time_since, check_dir, make_batch, get_model_prefix, \
17 | make_data_and_env, run
18 | from src.params import add_general_args
19 | from simulator.envs.image_feature import ImageFeatures
20 | from model.speaker_lstm import SpeakerEncoderLSTM, SpeakerDecoderLSTM
21 | from src.speaker import Seq2SeqSpeaker
22 | from model.follower_coattend import EncoderLSTM, AttnDecoderLSTM
23 | from model.follower_coground import CogroundDecoderLSTM
24 | from model.cuda import try_cuda
25 | from src.follower import Seq2SeqFollower
26 | from src.eval_follower import FollowerEvaluation
27 |
28 |
29 | def train(args, agent, train_data, val_data, evaluator, speaker, train_tag):
30 | def make_path(dir, n_iter):
31 | return os.path.join(dir, '%s_%s_iter_%d' % (
32 | get_model_prefix(args.model_name, args.feedback_method),
33 | train_tag, n_iter))
34 |
35 | # check result directories
36 | task_prefix = os.path.join('tasks', args.task_name)
37 | result_dir = os.path.join(task_prefix, args.result_dir)
38 | snapshot_dir = os.path.join(task_prefix, args.snapshot_dir)
39 | plot_dir = os.path.join(task_prefix, args.plot_dir)
40 | summary_dir = os.path.join(task_prefix, args.summary_dir,
41 | time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()))
42 | check_dir([result_dir, snapshot_dir, plot_dir, summary_dir])
43 |
44 | # initialize
45 | data_log = defaultdict(list)
46 | n_iters = args.n_iters
47 | log_every = args.log_every
48 | best_metrics = {}
49 | last_model_saved = {}
50 | writer = SummaryWriter(log_dir=summary_dir)
51 | train_ix = 0
52 |
53 | print('Training with %s feedback' % args.feedback_method)
54 | start = time.time()
55 | for idx in range(0, n_iters, log_every):
56 | interval = min(log_every, n_iters - idx)
57 | iter = idx + interval
58 | data_log['iteration'].append(iter)
59 |
60 | # make mini-batch
61 | if args.curriculum_rl:
62 | train_batch, train_ix = make_batch(train_data, train_ix, interval,
63 | args.batch_size, sort_instr_len=False)
64 | else:
65 | train_batch, train_ix = make_batch(train_data, train_ix, interval,
66 | args.batch_size, sort_instr_len=True)
67 |
68 | # train
69 | if args.curriculum_rl:
70 |       assert args.reward, "curriculum RL requires --reward"
71 | agent.train_crl(train_batch, interval, speaker,
72 | curriculum=args.count_curriculum, history=args.history,
73 | reward_type=args.reward_type, exp_forget=args.exp_forget,
74 | beam_size=args.beam_size, feedback=args.feedback_method)
75 | elif args.reward:
76 | agent.train_reward(train_batch, interval, speaker,
77 | history=args.history, reward_type=args.reward_type,
78 | exp_forget=args.exp_forget, beam_size=args.beam_size,
79 | feedback=args.feedback_method)
80 | else:
81 | agent.train(train_batch, interval,
82 | history=args.history, exp_forget=args.exp_forget,
83 | feedback=args.feedback_method)
84 |
85 | # output loss / reward
86 | train_loss_avg = np.array(agent.losses).mean()
87 | data_log['train loss'].append(train_loss_avg)
88 | loss_str = 'train loss: %.4f' % train_loss_avg
89 | writer.add_scalar('data/train_loss', train_loss_avg, iter)
90 |
91 | if args.reward:
92 | int_reward_avg = np.array(agent.int_rewards).mean()
93 | data_log['int_reward'].append(int_reward_avg)
94 | loss_str += ', internal reward: %.4f' % int_reward_avg
95 | writer.add_scalar('data/int_reward', int_reward_avg, iter)
96 |
97 | ext_reward_avg = np.array(agent.ext_rewards).mean()
98 | data_log['ext_reward'].append(ext_reward_avg)
99 | loss_str += ', external reward: %.4f' % ext_reward_avg
100 | writer.add_scalar('data/ext_reward', ext_reward_avg, iter)
101 |
102 | # run validation
103 | save_log = []
104 | for tag, d in val_data.items():
105 | it = math.ceil(len(d) / args.batch_size)
106 | test_batch, _ = make_batch(d, 0, it, args.batch_size,
107 | shuffle=False, sort_instr_len=False)
108 | agent.test(test_batch, history=args.history, one_by_one=args.one_by_one,
109 | exp_forget=args.exp_forget)
110 | agent.results_path = make_path(result_dir, iter) + '_' + tag + '.json'
111 |
112 | # evaluate results
113 | print("evaluating on {}".format(tag))
114 | score_summary = evaluator.score_results(agent.results)
115 |
116 | loss_str += '\n%s' % (tag)
117 | for metric, val in sorted(score_summary.items()):
118 | data_log['%s %s' % (tag, metric)].append(val)
119 | writer.add_scalar('data/' + tag + '_' + metric, val, iter)
120 | if metric in ['sr', 'cls', 'sdtw']:
121 | print("%s on %s: %.3f" % (metric, tag, val))
122 |
123 | # save model
124 | key = (tag, metric)
125 | if key not in best_metrics or best_metrics[key] < val:
126 | best_metrics[key] = val
127 | if not args.no_save:
128 | model_path = make_path(snapshot_dir, iter) \
129 | + "_%s-%s=%.3f" % (tag, metric, val)
130 | save_log.append("new best, saved model to %s" % model_path)
131 | agent.save(model_path)
132 | if key in last_model_saved:
133 | for old_model_path in \
134 | agent.encoder_and_decoder_paths(last_model_saved[key]):
135 | os.remove(old_model_path)
136 | last_model_saved[key] = model_path
137 | loss_str += ', %s: %.3f' % (metric, val)
138 |
139 | # report training process
140 | print(('%s (%d %d%%) %s'
141 | % (time_since(start, float(iter) / n_iters), iter,
142 | float(iter) / n_iters * 100, loss_str)))
143 | for s in save_log:
144 | print(s)
145 | if not args.no_save:
146 | if args.save_every and iter % args.save_every == 0:
147 | agent.save(make_path(snapshot_dir, iter))
148 | df = pd.DataFrame(data_log)
149 |       df.set_index('iteration', inplace=True)
150 | df_path = '%s%s_%s_log.csv' \
151 | % (plot_dir,
152 | get_model_prefix(args.model_name, args.feedback_method),
153 | train_tag)
154 | df.to_csv(df_path)
155 |
156 | # update curriculum
157 | if args.curriculum_rl \
158 | and iter % args.curriculum_iters == 0 \
159 | and args.count_curriculum < args.max_curriculum:
160 | args.count_curriculum += 1
161 | agent.reset_rav()
162 | agent.encoder_optimizer, agent.decoder_optimizer = \
163 | reset_optimizer(args, agent.encoder, agent.decoder)
164 |
165 |
166 | def reset_optimizer(args, encoder, decoder):
167 | def filter_param(param_list):
168 | return [p for p in param_list if p.requires_grad]
169 |
170 | enc_para = encoder.parameters()
171 | dec_para = decoder.parameters()
172 | if args.learning_method == "adam":
173 | encoder_optimizer = optim.Adam(filter_param(enc_para), lr=args.lr,
174 | weight_decay=args.weight_decay)
175 | decoder_optimizer = optim.Adam(filter_param(dec_para), lr=args.lr,
176 | weight_decay=args.weight_decay)
177 | elif args.learning_method == "sgd":
178 | encoder_optimizer = optim.SGD(filter_param(enc_para), lr=args.lr,
179 | momentum=0.9, nesterov=True,
180 | weight_decay=args.weight_decay)
181 | decoder_optimizer = optim.SGD(filter_param(dec_para), lr=args.lr,
182 | momentum=0.9, nesterov=True,
183 | weight_decay=args.weight_decay)
184 | elif args.learning_method == "rms":
185 | encoder_optimizer = optim.RMSprop(filter_param(enc_para), lr=args.lr,
186 | weight_decay=args.weight_decay)
187 | decoder_optimizer = optim.RMSprop(filter_param(dec_para), lr=args.lr,
188 | weight_decay=args.weight_decay)
189 | else:
190 | raise ValueError("Error! Learning method not correct")
191 |
192 | return encoder_optimizer, decoder_optimizer
193 |
194 |
195 | def make_speaker_models(args, vocab_size, env, tok):
196 | glove = np.load(GLOVE_PATH)
197 | encoder = SpeakerEncoderLSTM(args.feature_size, args.hidden_size,
198 | args.dropout)
199 | decoder = SpeakerDecoderLSTM(vocab_size, args.wemb, VOCAB_PAD_IDX,
200 | args.hidden_size, args.dropout, glove=glove)
201 | encoder = try_cuda(encoder)
202 | decoder = try_cuda(decoder)
203 |
204 | agent = Seq2SeqSpeaker(env, "", args, encoder, decoder, tok)
205 | return agent
206 |
207 |
208 | def make_follower_models(args, vocab_size, all_val_data, env):
209 | glove = np.load(GLOVE_PATH)
210 | encoder = EncoderLSTM(vocab_size, args.wemb, args.hidden_size, VOCAB_PAD_IDX,
211 | args.dropout, glove=glove)
212 | if args.coground:
213 | decoder = CogroundDecoderLSTM(args.action_embed_size, args.hidden_size,
214 | args.dropout, args.feature_size,
215 | args.max_ins_len, history=args.history)
216 | else:
217 | decoder = AttnDecoderLSTM(args.action_embed_size, args.hidden_size,
218 | args.dropout, args.feature_size,
219 | history=args.history,
220 | lstm_mem=args.exp_forget < 0)
221 |
222 | encoder = try_cuda(encoder)
223 | decoder = try_cuda(decoder)
224 | encoder_optimizer, decoder_optimizer = \
225 | reset_optimizer(args, encoder, decoder)
226 |
227 | agent = Seq2SeqFollower(env, "", args, encoder, decoder, encoder_optimizer,
228 | decoder_optimizer)
229 | evaluator = FollowerEvaluation(env, all_val_data)
230 | return agent, evaluator
231 |
232 |
233 | def train_setup(args):
234 | ''' Load data, setup environment and setup agent '''
235 | train_data, val_data, all_val_data, env, vocab, tok, train_tag = \
236 | make_data_and_env(args)
237 | agent, evaluator = make_follower_models(args, len(vocab), all_val_data, env)
238 | if args.reward and not args.no_speaker:
239 | speaker = make_speaker_models(args, len(vocab), env, tok)
240 | speaker.load(args.speaker_prefix, **{})
241 | print("Loaded speaker model %s" % args.speaker_prefix)
242 | else:
243 | speaker = None
244 | if args.follower_prefix is not None:
245 | agent.load(args.follower_prefix, args.load_opt, **{})
246 | print("Loaded follower model %s" % args.follower_prefix)
247 | return agent, train_data, val_data, evaluator, speaker, train_tag
248 |
249 |
250 | def train_val(args):
251 | ''' Train on the training set, and validate on validation (seen/unseen) set. '''
252 | agent, train_data, val_data, evaluator, speaker, train_tag = \
253 | train_setup(args)
254 | train(args, agent, train_data, val_data, evaluator, speaker, train_tag)
255 |
256 |
257 | def make_arg_parser():
258 | parser = argparse.ArgumentParser()
259 | ImageFeatures.add_args(parser)
260 | add_general_args(parser)
261 | return parser
262 |
263 |
264 | if __name__ == "__main__":
265 | run(make_arg_parser(), train_val)
266 |
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | ''' Utils for training and evaluation '''
2 |
3 | import os
4 | import json
5 | import time
6 | import math
7 | import random
8 | import torch
9 | import numpy as np
10 |
11 | from model.cuda import try_cuda
12 | from src.process_data import make_data
13 | from src.vocab.tokenizer import VOCAB_PAD_IDX, VOCAB_EOS_IDX
14 | from simulator.envs.image_feature import ImageFeatures
15 | from simulator.envs.env import RoomEnv
16 | from simulator.envs.paths import ADJ_LIST_FILE
17 |
18 |
19 | def random_seed():
20 | torch.manual_seed(1)
21 | torch.cuda.manual_seed(1)
22 |
23 |
24 | def batch_observations_and_actions(path_obs, path_actions, padding_feature,
25 | padding_action):
26 | batch_size = len(path_obs)
27 | seq_lengths = np.array([len(a) for a in path_actions])
28 | max_path_length = seq_lengths.max()
29 | mask = np.ones((batch_size, max_path_length), np.uint8)
30 | image_features = [[] for _ in range(batch_size)]
31 | action_embeddings = [[] for _ in range(batch_size)]
32 | for i in range(batch_size):
33 | assert len(path_obs[i]) == len(path_actions[i])
34 | mask[i, :len(path_actions[i])] = 0
35 | image_features[i] = [ob['feature'][0] for ob in path_obs[i]]
36 | action_embeddings[i] = [ob['action_embedding'][path_actions[i][j]]
37 | for j, ob in enumerate(path_obs[i])]
38 | image_features[i].extend([padding_feature]
39 | * (max_path_length - len(path_actions[i])))
40 | action_embeddings[i].extend([padding_action]
41 | * (max_path_length - len(path_actions[i])))
42 | image_features[i] = torch.stack(image_features[i], dim=0)
43 | action_embeddings[i] = torch.stack(action_embeddings[i], dim=0)
44 | batched_image_features = torch.stack(image_features, dim=0)
45 | batched_action_embeddings = torch.stack(action_embeddings, dim=0)
46 | mask = try_cuda(torch.from_numpy(mask).byte())
47 | return batched_image_features, batched_action_embeddings, mask, seq_lengths
48 |
49 |
50 | def batch_instructions_from_encoded(encoded_instructions, max_length,
51 | reverse=False, cut=True):
52 | num_instructions = len(encoded_instructions)
53 | seq_tensor = np.full((num_instructions, max_length), VOCAB_PAD_IDX)
54 | seq_lengths = []
55 | for i, inst in enumerate(encoded_instructions):
56 | if len(inst) > 0 and inst[-1] == VOCAB_EOS_IDX:
57 | inst = inst[:-1]
58 | if reverse:
59 | inst = inst[::-1]
60 | inst = np.concatenate((inst, [VOCAB_EOS_IDX]))
61 | inst = inst[:max_length]
62 | seq_tensor[i, :len(inst)] = inst
63 | seq_lengths.append(len(inst))
64 |
65 | if cut:
66 | seq_tensor = torch.from_numpy(seq_tensor)[:, :max(seq_lengths)]
67 | mask = (seq_tensor == VOCAB_PAD_IDX)[:, :max(seq_lengths)]
68 | else:
69 | seq_tensor = torch.from_numpy(seq_tensor)
70 | mask = (seq_tensor == VOCAB_PAD_IDX)
71 |
72 | return try_cuda(seq_tensor.long()), try_cuda(mask.byte()), seq_lengths
73 |
74 |
75 | def make_data_and_env(args):
76 | # make data
77 | train_data, val_data, all_val_data, vocab, tok, train_tag = make_data(args)
78 |
79 | # make env
80 | random_seed()
81 | image_features_list = ImageFeatures.from_args(args)
82 | paths, states_map, distances = RoomEnv.load_graphs()
83 | state_embedding = RoomEnv.make_state_embeddings(args, states_map,
84 | image_features_list)
85 | loc_embeddings = [RoomEnv.build_viewpoint_loc_embedding(args, viewIndex)
86 | for viewIndex in range(args.num_views)]
87 | adj_dict = RoomEnv.load_adj_feature(ADJ_LIST_FILE)
88 | env = RoomEnv(args, paths, states_map, distances, state_embedding,
89 | loc_embeddings, adj_dict)
90 | return train_data, val_data, all_val_data, env, vocab, tok, train_tag
91 |
92 |
93 | def make_batch(data, ix, n_iter, batch_size, shuffle=True,
94 | sort_instr_len=True):
95 | batches = []
96 | new_ix = ix
97 | for i in range(n_iter):
98 | batch = data[new_ix:new_ix + batch_size]
99 | if len(batch) < batch_size:
100 | if shuffle: random.shuffle(data)
101 | new_ix = batch_size - len(batch)
102 | batch += data[:new_ix]
103 | else:
104 | new_ix += batch_size
105 | if sort_instr_len:
106 | batch = sorted(batch, key=lambda item: item['instr_length'],
107 | reverse=True)
108 | batches.append(batch)
109 | return batches, new_ix
110 |
111 |
112 | def get_model_prefix(model_name, feedback_method):
113 | model_prefix = '{}_{}'.format(model_name, feedback_method)
114 | return model_prefix
115 |
116 |
117 | def pretty_json_dump(obj, fp):
118 | json.dump(obj, fp, sort_keys=True, indent=4, separators=(',', ':'))
119 |
120 |
121 | def as_minutes(s):
122 | m = math.floor(s / 60)
123 | s -= m * 60
124 | return '%dm %ds' % (m, s)
125 |
126 |
127 | def time_since(since, percent):
128 | now = time.time()
129 | s = now - since
130 | es = s / (percent)
131 | rs = es - s
132 | return '%s (- %s)' % (as_minutes(s), as_minutes(rs))
133 |
134 |
135 | def run(arg_parser, entry_function):
136 | arg_parser.add_argument("--pdb", action='store_true')
137 | arg_parser.add_argument("--ipdb", action='store_true')
138 | arg_parser.add_argument("--no_cuda", action='store_true')
139 |
140 | args = arg_parser.parse_args()
141 |
142 | import torch.cuda
143 | torch.cuda.disabled = args.no_cuda
144 |
145 | if args.ipdb:
146 | import ipdb
147 | ipdb.runcall(entry_function, args)
148 | elif args.pdb:
149 | import pdb
150 | pdb.runcall(entry_function, args)
151 | else:
152 | entry_function(args)
153 |
154 |
155 | def check_dir(dir_list):
156 | for dir in dir_list:
157 | if not os.path.exists(dir):
158 | os.makedirs(dir)
159 |
--------------------------------------------------------------------------------
/src/val_follower.py:
--------------------------------------------------------------------------------
1 | import os
2 | import os.path
3 | import numpy as np
4 | import argparse
5 | import math
6 | import sys
7 |
8 | sys.path.append('.')
9 |
10 | from src.params import add_general_args
11 | from src.utils import check_dir, make_batch, get_model_prefix, run
12 | from src.train_follower import train_setup
13 | from simulator.envs.image_feature import ImageFeatures
14 |
15 |
16 | def val(args, agent, val_data, evaluator):
17 | task_prefix = os.path.join('tasks', args.task_name)
18 | result_dir = os.path.join(task_prefix, args.result_dir)
19 | check_dir([result_dir])
20 |
21 | def make_path(dir):
22 | return os.path.join(dir, '%s_%s'
23 | % (get_model_prefix(args.model_name, args.task_name),
24 | 'validation'))
25 |
26 | # run validation
27 | loss_str = ''
28 | ratio_number = []
29 | metric_dict = {
30 | 'lengths': [], 'nav_error': [], 'sr': [], 'spl': [], 'cls': [], 'ndtw': [],
31 | 'sdtw': [], 'sr_std': [], 'spl_std': [], 'cls_std': []
32 | }
33 | for tag, d in val_data.items():
34 | ratio_number.append(len(d))
35 | it = math.ceil(len(d) / args.batch_size)
36 | test_batch, _ = make_batch(d, 0, it, args.batch_size,
37 | shuffle=False, sort_instr_len=False)
38 | agent.test(test_batch, one_by_one=args.one_by_one, history=args.history,
39 | exp_forget=args.exp_forget)
40 | agent.results_path = make_path(result_dir) + '_' + tag + '.json'
41 |
42 | print("evaluating on {}".format(tag))
43 | score_summary = evaluator.score_results(agent.results, update_results=True)
44 | loss_str += '\n%s' % (tag)
45 | for metric, val in sorted(score_summary.items()):
46 | if metric in metric_dict:
47 | metric_dict[metric].append(val)
48 | loss_str += ', %s: %.3f' % (metric, val)
49 | agent.write_results()
50 | print("PL: %.2f, NE: %.2f, SR: %.1f, SPL: %.1f, "
51 | "CLS: %.1f, NDTW: %.1f, SDTW %.1f"
52 | % (metric_dict['lengths'][-1], metric_dict['nav_error'][-1],
53 | metric_dict['sr'][-1] * 100, metric_dict['spl'][-1] * 100,
54 | metric_dict['cls'][-1] * 100, metric_dict['ndtw'][-1] * 100,
55 | metric_dict['sdtw'][-1] * 100))
56 |
57 | print("Average\nPL: %.2f, NE: %.2f, SR: %.1f, SPL: %.1f, "
58 | "CLS: %.1f, NDTW: %.1f, SDTW %.1f"
59 | % (np.array(metric_dict['lengths']).mean(),
60 | np.array(metric_dict['nav_error']).mean(),
61 | np.array(metric_dict['sr']).mean() * 100,
62 | np.array(metric_dict['spl']).mean() * 100,
63 | np.array(metric_dict['cls']).mean() * 100,
64 | np.array(metric_dict['ndtw']).mean() * 100,
65 | np.array(metric_dict['sdtw']).mean() * 100))
66 | print('%s' % (loss_str))
67 |
68 |
69 | def test(args, agent, val_data):
70 | task_prefix = os.path.join('tasks', args.task_name)
71 | result_dir = os.path.join(task_prefix, args.result_dir)
72 | check_dir([result_dir])
73 |
74 | def make_path(dir):
75 | return os.path.join(dir, '%s_%s'
76 | % (get_model_prefix(args.model_name, args.task_name),
77 | 'test'))
78 |
79 | # test
80 | for _, d in val_data.items():
81 | it = math.ceil(len(d) / args.batch_size)
82 | test_batch, _ = make_batch(d, 0, it, args.batch_size,
83 | shuffle=False, sort_instr_len=False)
84 | agent.test(test_batch, one_by_one=args.one_by_one, history=args.history,
85 | exp_forget=args.exp_forget)
86 | agent.results_path = make_path(result_dir) + '.json'
87 |
88 | # reformat
89 | reformat_results = []
90 | for id, r in agent.results.items():
91 | reformat_results.append({
92 | "instr_id": id,
93 | "trajectory": [[r["trajectory"][i]] + list(r["trajectory_radians"][i])
94 | for i in range(len(r["trajectory"]))]
95 | })
96 | agent.results = reformat_results
97 | agent.write_results()
98 |
99 |
100 | def train_val(args):
101 | ''' Validate on seen and unseen splits. '''
102 | follower, _, val_data, evaluator, _, _ = train_setup(args)
103 | if args.use_test:
104 | test(args, follower, val_data)
105 | else:
106 | val(args, follower, val_data, evaluator)
107 |
108 |
109 | def make_arg_parser():
110 | parser = argparse.ArgumentParser()
111 | ImageFeatures.add_args(parser)
112 | add_general_args(parser)
113 | return parser
114 |
115 |
116 | if __name__ == "__main__":
117 | run(make_arg_parser(), train_val)
118 |
--------------------------------------------------------------------------------
/src/vocab/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sha-Lab/babywalk/4265340d5e521b59030bc50ef71f01b0d3d3ba63/src/vocab/__init__.py
--------------------------------------------------------------------------------
/src/vocab/tokenizer.py:
--------------------------------------------------------------------------------
1 | import re
2 | import string
3 | import sys
4 | import numpy as np
5 |
6 | # padding, unknown word, end of sentence, beginning of sentence
7 | BASE_VOCAB = ['<PAD>', '<UNK>', '<EOS>', '<BOS>']
8 | VOCAB_PAD_IDX = BASE_VOCAB.index('<PAD>')
9 | VOCAB_UNK_IDX = BASE_VOCAB.index('<UNK>')
10 | VOCAB_EOS_IDX = BASE_VOCAB.index('<EOS>')
11 |
12 |
13 | def read_vocab(path):
14 | with open(path) as f:
15 | vocab = [word.strip() for word in f.readlines()]
16 | return vocab
17 |
18 |
19 | class Tokenizer(object):
20 | ''' Class to tokenize and encode a sentence. '''
21 | # Split on any non-alphanumeric character
22 | SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')
23 |
24 | def __init__(self, vocab=None, no_glove=False):
25 | self.vocab = vocab
26 | self.word_to_index = {}
27 |
28 | if no_glove:
29 | self.vocab_bos_idx = len(vocab)
30 | self.add_word('<BOS>', len(vocab))
31 | else:
32 | self.vocab_bos_idx = BASE_VOCAB.index('<BOS>')
33 | if vocab:
34 | for i, word in enumerate(vocab):
35 | self.word_to_index[word] = i
36 |
37 | def add_word(self, word, place):
38 | assert word not in self.word_to_index
39 | self.vocab.insert(place, word)
40 |
41 | @staticmethod
42 | def split_sentence(sentence):
43 | ''' Break sentence into a list of words and punctuation '''
44 | toks = []
45 | for word in [s.strip().lower() for s in
46 | Tokenizer.SENTENCE_SPLIT_REGEX.split(sentence.strip())
47 | if len(s.strip()) > 0]:
48 | # Break up any words containing punctuation only, e.g. '!?', unless it is multiple full stops e.g. '..'
49 | if all(c in string.punctuation for c in word) \
50 | and not all(c in '.' for c in word):
51 | toks += list(word)
52 | else:
53 | toks.append(word)
54 | return toks
55 |
56 | def encode_sentence(self, sentence):
57 | if len(self.word_to_index) == 0:
58 | sys.exit('Tokenizer has no vocab')
59 | encoding = []
60 | for word in Tokenizer.split_sentence(sentence):
61 | if word in self.word_to_index:
62 | encoding.append(self.word_to_index[word])
63 | else:
64 | encoding.append(VOCAB_UNK_IDX)
65 | arr = np.array(encoding)
66 | return arr, len(encoding)
67 |
68 | def decode_sentence(self, encoding, break_on_eos=False, join=True):
69 | sentence = []
70 | for ix in encoding:
71 | if ix == (VOCAB_EOS_IDX if break_on_eos else VOCAB_PAD_IDX):
72 | break
73 | else:
74 | sentence.append(self.vocab[ix])
75 | if join:
76 | return " ".join(sentence)
77 | return sentence
78 |
--------------------------------------------------------------------------------
/src/vocab/vocab_path.py:
--------------------------------------------------------------------------------
1 | SUBTRAIN_VOCAB = 'src/vocab/vocab_data/sub_train_vocab.txt'
2 | TRAIN_VOCAB = 'src/vocab/vocab_data/train_vocab.txt'
3 | TRAINVAL_VOCAB = 'src/vocab/vocab_data/trainval_vocab.txt'
4 | NOUN_TRAIN = 'src/vocab/vocab_data/train_noun.txt'
5 | NOUN_PHRASE_TRAIN = 'src/vocab/vocab_data/train_spacy_noun_phrase.txt'
6 | GLOVE_PATH = 'src/vocab/vocab_data/train_glove.npy'
7 |
--------------------------------------------------------------------------------
/tasks/R2R/README.md:
--------------------------------------------------------------------------------
1 | # Room-to-Room (R2R) Navigation Task
2 |
3 |
4 | ## Download Data
5 |
6 | Data consists of train/val-seen/val-unseen/test splits. There are two validation sets to better understand generalization performance between buildings that are in the training set (val-seen) and unseen buildings. The test set consists entirely of unseen buildings.
7 |
8 | To download, from the top level directory, run:
9 | ```
10 | ./tasks/R2R/data/download.sh
11 | ```
12 |
13 | Data is formatted as follows:
14 | ```
15 | {
16 | "distance": float,
17 | "scan": str,
18 | "path_id": int,
19 | "path": [str x num_steps],
20 | "heading": float,
21 | "instructions": [str x 3],
22 | }
23 | ```
24 | - `distance`: length of the path in meters.
25 | - `scan`: Matterport scan id.
26 | - `path_id`: Unique id for this path.
27 | - `path`: List of viewpoint ids (the first is the start location, the last is the goal location).
28 | - `heading`: Agent's initial heading in radians (elevation is always assumed to be zero).
29 | - `instructions`: Three unique natural language strings describing how to find the goal given the start pose.
30 |
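After downloading, the splits can be loaded with the standard `json` module. The snippet below is a minimal sketch for inspecting one training episode; it assumes the default download location used by `./tasks/R2R/data/download.sh` (adjust the path if your layout differs):

```
import json

# Load the R2R training split (path assumes the default download location).
with open('tasks/R2R/data/R2R_train.json') as f:
    episodes = json.load(f)

ep = episodes[0]
print(ep['scan'], ep['path_id'], ep['heading'])
print('start viewpoint:', ep['path'][0])
print('goal viewpoint:', ep['path'][-1])
print('path length (m):', ep['distance'])
for instruction in ep['instructions']:  # three instructions per path
    print(instruction)
```
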
31 | For the test set, only the first path_id (starting location) is included. We will provide a test server for scoring uploaded trajectories according to the metrics in the [paper](https://arxiv.org/abs/1711.07280).
32 |
33 | ## Directory Structure
34 |
35 | - `env.py`: Wraps the simulator and adds language instructions, with several simplifications -- namely discretized heading / elevation and pre-cached image features. This is not intended to be a standard component, or to preclude the use of continuous camera actions, end-to-end training etc. Use the simulator and the data as you see fit, but this can provide a starting point.
36 | - `utils.py`: Text pre-processing, navigation graph loading etc.
37 | - `eval.py`: Evaluation script.
38 | - `model.py`: PyTorch seq2seq model with attention.
39 | - `agent.py`: Various implementations of an agent.
40 | - `train.py`: Training entrypoint, parameter settings etc.
41 | - `plot.py`: Figures from the arXiv paper.
42 |
43 | ## Prerequisites
44 |
45 | Python 2, [PyTorch](http://pytorch.org/), [NetworkX](https://networkx.github.io/). Install python dependencies by running:
46 | ```
47 | pip install -r tasks/R2R/requirements.txt
48 | ```
49 |
50 | ## Training and Evaluation
51 |
52 | To train the seq2seq model with student-forcing:
53 | ```
54 | python tasks/R2R/train.py
55 | ```
56 |
57 | To run some simple baselines:
58 | ```
59 | python tasks/R2R/eval.py
60 | ```
61 |
62 | Generate figures from the paper:
63 | ```
64 | python tasks/R2R/plot.py
65 | ```
66 |
67 | The simple baselines include:
68 | - `ShortestAgent`: Agent that always follows the shortest path to goal (foundation for supervised training).
69 | - `RandomAgent`: Agent that randomly picks a direction, then tries to go straight for 5 viewpoints.
70 | - `StopAgent`: Agent that remains at the starting position.
71 |
72 | 
73 |
--------------------------------------------------------------------------------
/tasks/R2R/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sha-Lab/babywalk/4265340d5e521b59030bc50ef71f01b0d3d3ba63/tasks/R2R/__init__.py
--------------------------------------------------------------------------------
/tasks/R2R/requirements.txt:
--------------------------------------------------------------------------------
1 | backports.functools-lru-cache==1.4
2 | cycler==0.10.0
3 | decorator==4.1.2
4 | matplotlib==2.1.0
5 | networkx==2.0
6 | numpy==1.13.3
7 | olefile==0.44
8 | pandas==0.21.0
9 | pillow>=6.2.2
10 | pyparsing==2.2.0
11 | python-dateutil==2.6.1
12 | pytz==2017.3
13 | pytorch==1.1.0
14 | PyYAML==5.1
15 | six==1.11.0
16 | subprocess32==3.2.7
17 | torch==0.2.0.post3
18 | torchvision==0.3.0
19 |
--------------------------------------------------------------------------------
/tasks/R4R/README.md:
--------------------------------------------------------------------------------
1 | # R4R: Instruction and Path Composition for VLN
2 |
3 | [Room-to-Room](https://bringmeaspoon.org/) (R2R) is a pioneering dataset in
4 | visually-grounded natural language navigation with photo-realistic environments
5 | ([Anderson et al., 2018](https://arxiv.org/abs/1711.07280)). R2R consists of an
6 | environment and language instructions paired to reference paths. Due to the
7 | process by which the data are generated, all R2R reference paths are
8 | shortest-to-goal paths by construction. As such, they capture only a small
9 | subset of the richness of navigation.
10 |
11 | To address the lack of variety in path configurations, we propose a simple yet
12 | effective data augmentation strategy that increases the number of training
13 | examples and introduces paths that twist and turn, without additional human or
14 | low-fidelity machine annotations. Quite simply, the existing paths in the
15 | dataset can be extended by joining them with other paths that start within some
16 | distance threshold $d_{th}$ of where they end. We name this the Room-for-Room (R4R) dataset.
17 |
18 | For further details, see the accompanying paper:
19 | [Stay on the Path: Instruction Fidelity in Vision-and-Language Navigation](https://arxiv.org/abs/1905.12255)
20 |
21 | ## Documentation
22 |
23 | The R4R dataset is created by joining together paths in the R2R dataset, for
24 | which the first path ends within a thresholded distance from the start of the
25 | second. We do not distribute the original R2R data here, and instead provide
26 | code that constructs R4R from it. The original R2R data can be downloaded
27 | [here](https://niessner.github.io/Matterport/).
28 |
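In code, the join test is just a shortest-path distance check on the house graph. The helper below is a minimal sketch under those assumptions (the `can_join` name is ours, not part of the released scripts); it mirrors the distance check that `r4r_generate_data.py` performs, using the `weight3d` edge weights set by `graph_utils.load`:

```
import networkx as nx

def can_join(graph, first, second, distance_threshold=3.0):
    # Two R2R entries from the same scan can be joined when the shortest-path
    # distance from the end of the first path to the start of the second is
    # within the distance threshold (3.0 meters by convention).
    distance = nx.dijkstra_path_length(
        graph, first['path'][-1], second['path'][0], weight='weight3d')
    return distance <= distance_threshold
```
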
29 | Example usage:
30 |
31 | ```
32 | python r4r_generate_data.py \
33 | --input_file_path="path/to/R2R_train.json" \
34 | --output_file_path="path/to/R4R_train.json" \
35 | --connections_dir="path/to/connections" \
36 | --distance_threshold="3.0"
37 | ```
38 |
39 | Command line arguments for `r4r_generate_data.py`:
40 |
41 | * `--output_file_path`: Path to the R4R data JSON file you are generating.
42 | * `--input_file_path`: Path to the original R2R data JSON file, which can be
43 | downloaded
44 | [here](https://github.com/peteanderson80/Matterport3DSimulator/blob/master/tasks/R2R/data/download.sh).
45 | * `--connections_dir`: Path to a directory containing graph connectivity
46 | files, which can be downloaded
47 | [here](https://github.com/peteanderson80/Matterport3DSimulator/tree/master/connectivity).
48 | * `--distance_threshold`: The maximum shortest-path distance between the final
49 | node of first path and the first node of the second path for the two paths
50 | to be joined. Conventionally this is 3.0 meters
51 | ([Anderson et al., 2018](https://arxiv.org/abs/1711.07280)).
52 | * `--heading_threshold`: The maximum absolute heading angle difference in
53 | radians between the final connection of first path and the initial heading
54 | of the second path for the two paths to be joined. Conventionally this check
55 | is disabled.
56 |
57 | Running this script on the standard R2R training and validation data with a
58 | distance threshold of 3.0 meters and no heading threshold:
59 |
60 | ```
61 | ### R2R_train.json
62 |
63 | ******Final Results********
64 | Total instructions generated: 233613
65 | Average path distance (meters): 20.5901583255
66 | Average shortest path distance: 10.5022469844
67 | Average path length (steps): 12.0681064404
68 | Average shortest path length: 6.4874662553
69 | Total paths generated: 25930
70 | Total distance filtered paths: 381581
71 | Total heading filtered paths: 0
72 |
73 | ### R2R_val_seen.json
74 |
75 | ******Final Results********
76 | Total instructions generated: 1035
77 | Average path distance (meters): 20.3605171182
78 | Average shortest path distance: 11.1137253455
79 | Average path length (steps): 12.2173913043
80 | Average shortest path length: 7.0
81 | Total paths generated: 115
82 | Total distance filtered paths: 2269
83 | Total heading filtered paths: 0
84 |
85 | ### R2R_val_unseen.json
86 |
87 | ******Final Results********
88 | Total instructions generated: 45162
89 | Average path distance (meters): 20.222094624
90 | Average shortest path distance: 10.057187751
91 | Average path length (steps): 12.147070546
92 | Average shortest path length: 6.40294938222
93 | Total paths generated: 5018
94 | Total distance filtered paths: 63401
95 | Total heading filtered paths: 0
96 | ```
97 |
98 | Note: this script requires NetworkX and was tested on version 2.3.
99 |
100 | ## Reference
101 |
102 | If you use or discuss this dataset in your work, please cite our paper:
103 |
104 | ```
105 | @InProceedings{sotp2019acl,
106 | title = {{Stay on the Path: Instruction Fidelity in Vision-and-Language Navigation}},
107 | author = {Jain, Vihan and Magalhaes, Gabriel and Ku, Alexander and Vaswani, Ashish and Ie, Eugene and Baldridge, Jason},
108 | booktitle = {Proc. of ACL},
109 | year = {2019}
110 | }
111 | ```
112 |
113 | ## Contact
114 |
115 | If you have a technical question regarding the dataset or publication, please
116 | create an issue in this repository.
117 |
--------------------------------------------------------------------------------
/tasks/R4R/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Google Research Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
--------------------------------------------------------------------------------
/tasks/R4R/cls.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Google Research Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Coverage weighted by length score (CLS).
17 |
18 | Link to the original paper:
19 | https://arxiv.org/abs/1905.12255
20 | """
21 |
22 | from __future__ import print_function
23 |
24 | import networkx as nx
25 | import numpy as np
26 |
27 |
28 | class CLS(object):
29 | """Coverage weighted by length score (CLS).
30 |
31 | Python doctest:
32 |
33 | >>> cls = CLS(nx.grid_graph([3, 4]))
34 | >>> reference = [(0, 0), (1, 0), (1, 1), (2, 1), (2, 2), (3, 2)]
35 | >>> assert np.isclose(cls(reference, reference), 1.0)
36 | >>> prediction = [(0, 0), (0, 1), (1, 1), (2, 1), (3, 1), (3, 2)]
37 | >>> assert np.isclose(cls(reference, prediction), 0.81994915125863865)
38 | >>> prediction = [(0, 1), (1, 1), (2, 1), (3, 1)]
39 | >>> assert np.isclose(cls(reference, prediction), 0.44197196102702557)
40 |
41 | Link to the original paper:
42 | https://arxiv.org/abs/1905.12255
43 | """
44 |
45 | def __init__(self, graph, weight='weight', threshold=3.0):
46 | """Initializes a CLS object.
47 |
48 | Args:
49 | graph: networkx graph for the environment.
50 | weight: networkx edge weight key (str).
51 | threshold: distance threshold $d_{th}$ (float).
52 | """
53 | self.graph = graph
54 | self.weight = weight
55 | self.threshold = threshold
56 | self.distance = dict(
57 | nx.all_pairs_dijkstra_path_length(
58 | self.graph, weight=self.weight))
59 |
60 | def __call__(self, prediction, reference):
61 | """Computes the CLS metric.
62 |
63 | Args:
64 | prediction: list of nodes (str), path predicted by agent.
65 | reference: list of nodes (str), the ground truth path.
66 |
67 | Returns:
68 | the CLS between the prediction and reference path (float).
69 | """
70 |
71 | def length(nodes):
72 | return np.sum([
73 | self.graph.edges[edge].get(self.weight, 1.0)
74 | for edge in zip(nodes[:-1], nodes[1:])
75 | ])
76 |
77 | coverage = np.mean([
78 | np.exp(-np.min([ # pylint: disable=g-complex-comprehension
79 | self.distance[u][v] for v in prediction
80 | ]) / self.threshold) for u in reference
81 | ])
82 | expected = coverage * length(reference)
83 | score = expected / (expected + np.abs(expected - length(prediction)))
84 | return coverage * score
--------------------------------------------------------------------------------
/tasks/R4R/dtw.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Google Research Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Dynamic Time Warping based evaluation metrics for VLN."""
17 |
18 | from __future__ import print_function
19 |
20 | import networkx as nx
21 | import numpy as np
22 |
23 |
24 | class DTW(object):
25 | """Dynamic Time Warping (DTW) evaluation metrics.
26 |
27 | Python doctest:
28 |
29 | >>> graph = nx.grid_graph([3, 4])
30 | >>> prediction = [(0, 0), (1, 0), (2, 0), (3, 0)]
31 | >>> reference = [(0, 0), (1, 0), (2, 1), (3, 2)]
32 | >>> dtw = DTW(graph)
33 | >>> assert np.isclose(dtw(prediction, reference, 'dtw'), 3.0)
34 | >>> assert np.isclose(dtw(prediction, reference, 'ndtw'), 0.77880078307140488)
35 | >>> assert np.isclose(dtw(prediction, reference, 'sdtw'), 0.77880078307140488)
36 | >>> assert np.isclose(dtw(prediction[:2], reference, 'sdtw'), 0.0)
37 | """
38 |
39 | def __init__(self, graph, weight='weight', threshold=3.0):
40 | """Initializes a DTW object.
41 |
42 | Args:
43 | graph: networkx graph for the environment.
44 | weight: networkx edge weight key (str).
45 | threshold: distance threshold $d_{th}$ (float).
46 | """
47 | self.graph = graph
48 | self.weight = weight
49 | self.threshold = threshold
50 | self.distance = dict(
51 | nx.all_pairs_dijkstra_path_length(self.graph, weight=self.weight))
52 |
53 | def __call__(self, prediction, reference, metric='sdtw'):
54 | """Computes DTW metrics.
55 |
56 | Args:
57 | prediction: list of nodes (str), path predicted by agent.
58 | reference: list of nodes (str), the ground truth path.
59 | metric: one of ['ndtw', 'sdtw', 'dtw'].
60 |
61 | Returns:
62 | the DTW between the prediction and reference path (float).
63 | """
64 | assert metric in ['ndtw', 'sdtw', 'dtw']
65 |
66 | dtw_matrix = np.inf * np.ones((len(prediction) + 1, len(reference) + 1))
67 | dtw_matrix[0][0] = 0
68 | for i in range(1, len(prediction)+1):
69 | for j in range(1, len(reference)+1):
70 | best_previous_cost = min(
71 | dtw_matrix[i-1][j], dtw_matrix[i][j-1], dtw_matrix[i-1][j-1])
72 | cost = self.distance[prediction[i-1]][reference[j-1]]
73 | dtw_matrix[i][j] = cost + best_previous_cost
74 | dtw = dtw_matrix[len(prediction)][len(reference)]
75 |
76 | if metric == 'dtw':
77 | return dtw
78 |
79 | ndtw = np.exp(-dtw/(self.threshold * len(reference)))
80 | if metric == 'ndtw':
81 | return ndtw
82 |
83 | success = self.distance[prediction[-1]][reference[-1]] <= self.threshold
84 | return success * ndtw
--------------------------------------------------------------------------------
/tasks/R4R/graph_utils.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Google Research Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Utils for loading and drawing graphs of the houses."""
17 |
18 | from __future__ import print_function
19 |
20 | import json
21 | import matplotlib.pyplot as plt
22 |
23 | import networkx as nx
24 | import numpy as np
25 | from numpy.linalg import norm
26 |
27 |
28 | def load(connections_file):
29 | """Loads a networkx graph for a given scan.
30 |
31 | Args:
32 | connections_file: A string with the path to the .json file with the
33 | connectivity information.
34 | Returns:
35 | A networkx graph.
36 | """
37 | with open(connections_file) as f:
38 | lines = json.load(f)
39 | nodes = np.array([x['image_id'] for x in lines])
40 | matrix = np.array([x['unobstructed'] for x in lines])
41 | mask = [x['included'] for x in lines]
42 | matrix = matrix[mask][:, mask]
43 | nodes = nodes[mask]
44 | pos2d = {x['image_id']: np.array(x['pose'])[[3, 7]] for x in lines}
45 | pos3d = {x['image_id']: np.array(x['pose'])[[3, 7, 11]] for x in lines}
46 |
47 | graph = nx.from_numpy_matrix(matrix)
48 | graph = nx.relabel.relabel_nodes(graph, dict(enumerate(nodes)))
49 | nx.set_node_attributes(graph, pos2d, 'pos2d')
50 | nx.set_node_attributes(graph, pos3d, 'pos3d')
51 |
52 | weight2d = {(u, v): norm(pos2d[u] - pos2d[v]) for u, v in graph.edges}
53 | weight3d = {(u, v): norm(pos3d[u] - pos3d[v]) for u, v in graph.edges}
54 | nx.set_edge_attributes(graph, weight2d, 'weight2d')
55 | nx.set_edge_attributes(graph, weight3d, 'weight3d')
56 |
57 | return graph
58 |
59 |
60 | def draw(graph, predicted_path, reference_path, output_filename, **kwargs):
61 | """Generates a plot showing the graph, predicted and reference paths.
62 |
63 | Args:
64 | graph: A networkx graph.
65 | predicted_path: A list with the ids of the nodes in the predicted path.
66 | reference_path: A list with the ids of the nodes in the reference path.
67 | output_filename: A string with the path where to store the generated image.
68 | **kwargs: Key-word arguments for aesthetic control.
69 | """
70 | plt.figure(figsize=(10, 10))
71 | ax = plt.gca()
72 | pos = nx.get_node_attributes(graph, 'pos2d')
73 |
74 | # Zoom in.
75 | xs = [pos[node][0] for node in predicted_path + reference_path]
76 | ys = [pos[node][1] for node in predicted_path + reference_path]
77 | min_x, max_x, min_y, max_y = min(xs), max(xs), min(ys), max(ys)
78 | center_x, center_y = (min_x + max_x)/2, (min_y + max_y)/2
79 | zoom_margin = kwargs.get('zoom_margin', 1.3)
80 | max_range = zoom_margin * max(max_x - min_x, max_y - min_y)
81 | half_range = max_range / 2
82 | ax.set_xlim(center_x - half_range, center_x + half_range)
83 | ax.set_ylim(center_y - half_range, center_y + half_range)
84 |
85 | # Background graph.
86 | nx.draw(graph,
87 | pos,
88 | edge_color=kwargs.get('background_edge_color', 'lightgrey'),
89 | node_color=kwargs.get('background_node_color', 'lightgrey'),
90 | node_size=kwargs.get('background_node_size', 60),
91 | width=kwargs.get('background_edge_width', 0.5))
92 |
93 | # Prediction graph.
94 | predicted_path_graph = nx.DiGraph()
95 | predicted_path_graph.add_nodes_from(predicted_path)
96 | predicted_path_graph.add_edges_from(
97 | zip(predicted_path[:-1], predicted_path[1:]))
98 | nx.draw(predicted_path_graph,
99 | pos,
100 | arrowsize=kwargs.get('prediction_arrowsize', 15),
101 | edge_color=kwargs.get('prediction_edge_color', 'red'),
102 | node_color=kwargs.get('prediction_node_color', 'red'),
103 | node_size=kwargs.get('prediction_node_size', 130),
104 | width=kwargs.get('prediction_edge_width', 2.0))
105 |
106 | # Reference graph.
107 | reference_path_graph = nx.DiGraph()
108 | reference_path_graph.add_nodes_from(reference_path)
109 | reference_path_graph.add_edges_from(
110 | zip(reference_path[:-1], reference_path[1:]))
111 | nx.draw(reference_path_graph,
112 | pos,
113 | arrowsize=kwargs.get('reference_arrowsize', 15),
114 | edge_color=kwargs.get('reference_edge_color', 'dodgerblue'),
115 | node_color=kwargs.get('reference_node_color', 'dodgerblue'),
116 | node_size=kwargs.get('reference_node_size', 130),
117 | width=kwargs.get('reference_edge_width', 2.0))
118 |
119 | # Intersection graph.
120 | intersection_path_graph = nx.DiGraph()
121 | common_nodes = set(predicted_path_graph.nodes.keys()).intersection(
122 | set(reference_path_graph.nodes.keys()))
123 | intersection_path_graph.add_nodes_from(common_nodes)
124 | common_edges = set(predicted_path_graph.edges.keys()).intersection(
125 | set(reference_path_graph.edges.keys()))
126 | intersection_path_graph.add_edges_from(common_edges)
127 | nx.draw(intersection_path_graph,
128 | pos,
129 | arrowsize=kwargs.get('intersection_arrowsize', 15),
130 | edge_color=kwargs.get('intersection_edge_color', 'limegreen'),
131 | node_color=kwargs.get('intersection_node_color', 'limegreen'),
132 | node_size=kwargs.get('intersection_node_size', 130),
133 | width=kwargs.get('intersection_edge_width', 2.0))
134 |
135 | plt.savefig(output_filename)
136 | plt.close()
137 |
--------------------------------------------------------------------------------
/tasks/R4R/r4r_generate_data.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Google Research Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Script to build R4R data from the original R2R data.
17 |
18 | Link to the original R2R:
19 | https://niessner.github.io/Matterport/
20 | """
21 |
22 | from __future__ import print_function
23 |
24 | import argparse
25 | import collections
26 | import json
27 | import os
28 |
29 | import graph_utils
30 |
31 | import networkx as nx
32 | import numpy as np
33 |
34 |
35 | def main(args):
36 | """Generate R4R data from the original R2R data.
37 |
38 | Args:
39 | args: argparse containing paths to input and output files.
40 | """
41 | print('******Generating R4R Data********')
42 | print(' Distance threshold: {} meters'.format(args.distance_threshold))
43 | print(' Heading threshold: {} radians'.format(args.heading_threshold))
44 |
45 | def _connections_file_path(scan):
46 | return os.path.join(
47 | args.connections_dir, '{}_connectivity.json'.format(scan))
48 |
49 | inputs = json.load(open(args.input_file_path))
50 | outputs = list()
51 | filtered = collections.Counter()
52 |
53 | # Group by scan to save memory.
54 | scans = dict()
55 | for value in inputs:
56 | scan = value['scan']
57 | if scan not in scans:
58 | scans[scan] = []
59 | scans[scan].append(value)
60 |
61 | for scan, values in scans.items():
62 | print('Loading graph for scan {}.'.format(scan))
63 | graph = graph_utils.load(_connections_file_path(scan))
64 | pos2d = nx.get_node_attributes(graph, 'pos2d')
65 |
66 | # Cache format: (node, (distance, path)) ((node obj, (dict, dict)))
67 | cache = dict(nx.all_pairs_dijkstra(graph, weight='weight3d'))
68 | shortest_distance = {k: v[0] for k, v in cache.items()}
69 | shortest_path = {k: v[1] for k, v in cache.items()}
70 |
71 | for first in values:
72 | for second in values:
73 | first_target = first['path'][-1]
74 | second_source = second['path'][0]
75 |
76 | # Compute the end-start distance (meters).
77 | distance = shortest_distance[first_target][second_source]
78 |
79 | # Compute the absolute end-start heading difference (radians).
80 | x, y = pos2d[first['path'][-1]] - pos2d[first['path'][-2]]
81 | heading = abs(second['heading'] - np.arctan2(y, x) % (2 * np.pi))
82 |
83 | if (args.distance_threshold is not None
84 | and distance > args.distance_threshold):
85 | filtered['distance'] += 1
86 | elif (args.heading_threshold is not None
87 | and heading > args.heading_threshold):
88 | filtered['heading'] += 1
89 | else:
90 | value = dict()
91 | value['path'] = (
92 | first['path'][:-1]
93 | + shortest_path[first_target][second_source]
94 | + second['path'][1:])
95 | value['distance'] = (
96 | first['distance']
97 | + shortest_distance[first_target][second_source]
98 | + second['distance'])
99 | value['instructions'] = [
100 | x + y # pylint: disable=g-complex-comprehension
101 | for x in first['instructions']
102 | for y in second['instructions']]
103 | value['heading'] = first['heading']
104 | value['path_id'] = len(outputs)
105 | value['scan'] = scan
106 |
107 | # Additional data.
108 | path_source = first['path'][0]
109 | path_target = second['path'][-1]
110 | value['shortest_path_distance'] = cache[path_source][0][path_target]
111 | value['shortest_path'] = cache[path_source][1][path_target]
112 | value['first_path_id'] = first['path_id']
113 | value['second_path_id'] = second['path_id']
114 |
115 | outputs.append(value)
116 |
117 | with open(args.output_file_path, 'w') as f:
118 | json.dump(outputs, f, indent=2, sort_keys=True, separators=(',', ': '))
119 |
120 | # Dataset summary metrics.
121 | tot_instructions = np.sum([len(x['instructions']) for x in outputs])
122 | avg_distance = np.mean([x['distance'] for x in outputs])
123 | avg_path_len = np.mean([len(x['path']) for x in outputs])
124 | avg_sp_distance = np.mean([x['shortest_path_distance'] for x in outputs])
125 | avg_sp_path_len = np.mean([len(x['shortest_path']) for x in outputs])
126 |
127 | print('******Final Results********')
128 | print(' Total instructions generated: {}'.format(tot_instructions))
129 | print(' Average path distance (meters): {}'.format(avg_distance))
130 | print(' Average shortest path distance: {}'.format(avg_sp_distance))
131 | print(' Average path length (steps): {}'.format(avg_path_len))
132 | print(' Average shortest path length: {}'.format(avg_sp_path_len))
133 | print(' Total paths generated: {}'.format(len(outputs)))
134 | print(' Total distance filtered paths: {}'.format(filtered['distance']))
135 | print(' Total heading filtered paths: {}'.format(filtered['heading']))
136 |
137 |
138 | if __name__ == '__main__':
139 | parser = argparse.ArgumentParser()
140 | parser.add_argument(
141 | '--connections_dir',
142 | dest='connections_dir',
143 | required=True,
144 | help='Path to the Matterport simulator connection data.')
145 | parser.add_argument(
146 | '--input_file_path',
147 | dest='input_file_path',
148 | required=True,
149 | help='Path to read the R2R input data.')
150 | parser.add_argument(
151 | '--output_file_path',
152 | dest='output_file_path',
153 | required=True,
154 | help='Path to write the R4R output data.')
155 | parser.add_argument(
156 | '--distance_threshold',
157 | dest='distance_threshold',
158 | required=False,
159 | nargs='?',
160 | const=3.0,
161 | type=float,
162 | help='Maximum end-start distance (meters) to join R2R paths.')
163 | parser.add_argument(
164 | '--heading_threshold',
165 | dest='heading_threshold',
166 | required=False,
167 | nargs='?',
168 | const=None,
169 | type=float,
170 | help='Maximum end-start heading difference (radians) to join R2R paths.')
171 | main(parser.parse_args())
--------------------------------------------------------------------------------
/tasks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sha-Lab/babywalk/4265340d5e521b59030bc50ef71f01b0d3d3ba63/tasks/__init__.py
--------------------------------------------------------------------------------
/teaser/babywalk_curriculum.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sha-Lab/babywalk/4265340d5e521b59030bc50ef71f01b0d3d3ba63/teaser/babywalk_curriculum.jpg
--------------------------------------------------------------------------------
/teaser/pytorch-logo-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sha-Lab/babywalk/4265340d5e521b59030bc50ef71f01b0d3d3ba63/teaser/pytorch-logo-dark.png
--------------------------------------------------------------------------------