├── .flake8
├── .gitignore
├── .gitmodules
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── checkpoints
└── download_model_weights.sh
├── conf
├── __init__.py
├── annotations
│ ├── new_playtable.yaml
│ └── new_playtable_validation.yaml
├── callbacks
│ ├── checkpoint
│ │ ├── all.yaml
│ │ ├── clip_loss.yaml
│ │ ├── kl.yaml
│ │ ├── lh_sr.yaml
│ │ ├── state_recon.yaml
│ │ ├── task_sr.yaml
│ │ └── val_action.yaml
│ ├── default.yaml
│ ├── kl_schedule
│ │ ├── constant.yaml
│ │ ├── linear.yaml
│ │ └── sigmoid.yaml
│ ├── rollout
│ │ ├── default.yaml
│ │ └── tasks
│ │ │ └── new_playtable_tasks.yaml
│ ├── rollout_lh
│ │ └── default.yaml
│ ├── shm_signal
│ │ └── default.yaml
│ └── tsne_plot
│ │ └── default.yaml
├── config.yaml
├── datamodule
│ ├── datasets
│ │ ├── lang_dataset
│ │ │ ├── lang.yaml
│ │ │ └── lang_shm.yaml
│ │ ├── lang_only.yaml
│ │ ├── vision_dataset
│ │ │ ├── vision.yaml
│ │ │ └── vision_shm.yaml
│ │ ├── vision_lang.yaml
│ │ ├── vision_lang_shm.yaml
│ │ └── vision_only.yaml
│ ├── default.yaml
│ ├── mcil.yaml
│ ├── observation_space
│ │ ├── all_mods_abs_act.yaml
│ │ ├── lang_rgb_static_abs_act.yaml
│ │ ├── lang_rgb_static_gripper_abs_act.yaml
│ │ ├── lang_rgb_static_gripper_rel_act.yaml
│ │ ├── lang_rgb_static_rel_act.yaml
│ │ ├── lang_rgb_static_robot_scene_abs_act.yaml
│ │ ├── lang_rgb_static_tactile_abs_act.yaml
│ │ ├── lang_rgbd_both_abs_act.yaml
│ │ ├── lang_rgbd_both_rel_act.yaml
│ │ ├── lang_rgbd_static_gripper_rel_act.yaml
│ │ ├── lang_rgbd_static_robot_abs_act.yaml
│ │ ├── rgb_static_abs_act.yaml
│ │ ├── rgb_static_robot_scene_abs_act.yaml
│ │ └── state_only.yaml
│ ├── proprioception_dims
│ │ ├── none.yaml
│ │ ├── robot_full.yaml
│ │ ├── robot_no_joints.yaml
│ │ ├── robot_no_joints_no_gripper_width.yaml
│ │ └── robot_scene.yaml
│ └── transforms
│ │ ├── clip.yaml
│ │ ├── play_basic.yaml
│ │ └── rand_shift.yaml
├── inference
│ └── config_inference.yaml
├── lang_ann.yaml
├── logger
│ ├── tb_logger.yaml
│ └── wandb.yaml
├── loss
│ └── default.yaml
├── model
│ ├── action_decoder
│ │ ├── deterministic.yaml
│ │ ├── hulc_default.yaml
│ │ └── mcil_default.yaml
│ ├── bc_z_lang_decoder
│ │ ├── default.yaml
│ │ └── none.yaml
│ ├── clip_lang.yaml
│ ├── distribution
│ │ ├── continuous.yaml
│ │ └── discrete.yaml
│ ├── gcbc.yaml
│ ├── hulc.yaml
│ ├── language_encoder
│ │ ├── default.yaml
│ │ └── none.yaml
│ ├── language_goal
│ │ ├── default.yaml
│ │ └── none.yaml
│ ├── lr_scheduler
│ │ ├── constant.yaml
│ │ ├── cosine_schedule_with_warmup.yaml
│ │ └── linear_schedule_with_warmup.yaml
│ ├── mcil.yaml
│ ├── mia_lang_discriminator
│ │ ├── default.yaml
│ │ └── none.yaml
│ ├── optimizer
│ │ ├── adam.yaml
│ │ ├── adamw.yaml
│ │ └── sgd.yaml
│ ├── perceptual_encoder
│ │ ├── default.yaml
│ │ ├── depth_gripper
│ │ │ ├── default.yaml
│ │ │ └── none.yaml
│ │ ├── depth_static
│ │ │ ├── default.yaml
│ │ │ └── none.yaml
│ │ ├── gripper_cam.yaml
│ │ ├── proprio
│ │ │ ├── identity.yaml
│ │ │ └── none.yaml
│ │ ├── rgb_gripper
│ │ │ ├── default.yaml
│ │ │ └── none.yaml
│ │ ├── rgb_static
│ │ │ ├── clip.yaml
│ │ │ └── default.yaml
│ │ ├── state_decoder
│ │ │ ├── default.yaml
│ │ │ └── none.yaml
│ │ └── tactile
│ │ │ ├── default.yaml
│ │ │ └── none.yaml
│ ├── plan_proposal
│ │ └── default.yaml
│ ├── plan_recognition
│ │ ├── birnn.yaml
│ │ └── transformers.yaml
│ ├── proj_vis_lang
│ │ ├── default.yaml
│ │ └── none.yaml
│ ├── sbert.yaml
│ └── visual_goal
│ │ └── default.yaml
├── trainer
│ └── play_trainer.yaml
└── training
│ └── default_training.yaml
├── dataset
├── README.md
├── download_data.sh
└── download_lang_embeddings.sh
├── hulc
├── __init__.py
├── evaluation
│ ├── __init__.py
│ ├── create_plots.py
│ ├── evaluate_policy.py
│ ├── rollouts_interactive.py
│ └── run_multiple.py
├── models
│ ├── __init__.py
│ ├── auxiliary_loss_networks
│ │ ├── __init__.py
│ │ ├── bc_z_lang_decoder.py
│ │ ├── mia_lang_discriminator.py
│ │ ├── proj_vis_lang.py
│ │ └── state_decoder.py
│ ├── decoders
│ │ ├── __init__.py
│ │ ├── action_decoder.py
│ │ ├── deterministic_decoder.py
│ │ ├── logistic_decoder_rnn.py
│ │ └── utils
│ │ │ ├── __init__.py
│ │ │ ├── gripper_control.py
│ │ │ ├── pytorch3d_transforms.py
│ │ │ └── rnn.py
│ ├── encoders
│ │ ├── __init__.py
│ │ ├── clip_lang_encoder.py
│ │ ├── goal_encoders.py
│ │ ├── lang_encoder.py
│ │ └── language_network.py
│ ├── gcbc.py
│ ├── hulc.py
│ ├── perceptual_encoders
│ │ ├── __init__.py
│ │ ├── clip.py
│ │ ├── concat_encoders.py
│ │ ├── proprio_encoder.py
│ │ ├── tactile_encoder.py
│ │ ├── vision_clip.py
│ │ ├── vision_network.py
│ │ └── vision_network_gripper.py
│ └── plan_encoders
│ │ ├── __init__.py
│ │ ├── plan_proposal_net.py
│ │ └── plan_recognition_net.py
├── training.py
└── utils
│ ├── __init__.py
│ ├── bpe_simple_vocab_16e6.txt.gz
│ ├── clip_tokenizer.py
│ ├── distributions.py
│ ├── kl_callbacks.py
│ ├── transforms.py
│ └── utils.py
├── install.sh
├── media
└── hulc_rollout.gif
├── pyproject.toml
├── requirements-dev.txt
├── requirements.txt
├── setup.py
├── setup_local.py
└── slurm_scripts
├── README.md
├── sbatch_eval.sh
├── sbatch_lfp.sh
├── slurm_eval.py
└── slurm_training.py
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | exclude = .git
3 | # Default is 79 in PEP 8
4 | max-line-length = 120
5 | select = E,F,W,C
6 | ignore=W503, # line break before binary operator, need for black
7 | E203, # whitespace before ':'. Opposite convention enforced by black
8 | E731, # do not assign a lambda expression, use a def
9 | E722,
10 | F401,
11 | F841,
12 | E402, # module level import not at top of file
13 | E741, # ambiguous variable name
14 | E501, # line too long. Handled by black
15 | C406, # Unnecessary list literal - rewrite as a dict literal
16 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # Pycharm
132 | .idea
133 |
134 | # log files
135 | runs
136 |
137 | checkpoints/HULC*
138 |
139 | dataset/calvin_debug_dataset/
140 | dataset/task*
141 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "calvin_env"]
2 | path = calvin_env
3 | url = https://github.com/mees/calvin_env.git
4 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | default_language_version:
2 | python: python3.8
3 | repos:
4 | - repo: https://github.com/psf/black
5 | rev: 21.5b2
6 | hooks:
7 | - id: black
8 | language_version: python3.8
9 |
10 | - repo: https://gitlab.com/pycqa/flake8
11 | rev: 3.8.4
12 | hooks:
13 | - id: flake8
14 | additional_dependencies: [-e, "git+git://github.com/pycqa/pyflakes.git@c72d6cf#egg=pyflakes"]
15 | exclude: telegram_bot
16 |
17 | - repo: https://github.com/pycqa/isort
18 | rev: 5.7.0
19 | hooks:
20 | - id: isort
21 |
22 | - repo: https://github.com/pre-commit/mirrors-mypy
23 | rev: v0.812
24 | hooks:
25 | - id: mypy
26 | args: [--ignore-missing-imports, --warn-no-return, --warn-redundant-casts, --disallow-incomplete-defs]
27 | additional_dependencies: [pytorch-lightning==1.5.5, torch==1.10.0, numpy]
28 | exclude: telegram_bot
29 |
30 | - repo: https://github.com/pre-commit/pre-commit-hooks
31 | rev: v4.0.1
32 | hooks:
33 | - id: check-yaml
34 | - id: trailing-whitespace
35 | - id: end-of-file-fixer
36 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Oier Mees
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HULC
2 | [Code style: black](https://github.com/psf/black)
3 | [LGTM language grade: Python](https://lgtm.com/projects/g/mees/hulc/context:python)
4 | [LGTM alerts](https://lgtm.com/projects/g/mees/hulc/alerts/)
5 | [License: MIT](https://opensource.org/licenses/MIT)
6 |
7 | [What Matters in Language Conditioned Robotic Imitation Learning over Unstructured Data](https://arxiv.org/pdf/2204.06252.pdf)
8 |
9 | [Oier Mees](https://www.oiermees.com/), [Lukas Hermann](https://lukashermann.github.io/), [Wolfram Burgard](http://www2.informatik.uni-freiburg.de/~burgard)
10 |
11 | We present **HULC** (**H**ierarchical **U**niversal **L**anguage **C**onditioned Policies), an end-to-end model that can
12 | learn a wide variety of language conditioned robot skills from offline free-form imitation datasets. HULC sets a new state of the art on the challenging CALVIN benchmark,
13 | on learning a single 7-DoF policy that can perform long-horizon manipulation tasks in a 3D environment, directly from images, and only specified with natural language.
14 | This code accompanies the paper What Matters in Language Conditioned Robotic Imitation Learning over Unstructured Data, which can be found [here](https://arxiv.org/pdf/2204.06252.pdf).
15 | We hope the code will be useful as a starting point for further research on language conditioned policy learning and will bring us closer towards general-purpose robots that can relate human language to their perception and actions.
16 |
17 | ![HULC rollout](media/hulc_rollout.gif)
18 | ## Installation
19 | As a prerequisite, you need to have [calvin](https://github.com/mees/calvin) installed.
20 | This is needed because HULC builds upon calvin_agent and calvin_env.
21 |
22 | Next, clone this repository locally
23 | ```bash
24 | git clone https://github.com/mees/hulc.git
25 | export HULC_ROOT=$(pwd)/hulc
26 |
27 | ```
28 |
29 | Install requirements:
30 | ```bash
31 | cd $HULC_ROOT
32 | conda create -n hulc_venv python=3.10 # or use virtualenv
33 | conda activate hulc_venv
34 | sh install.sh
35 | ```
36 | We originally used Python 3.8, but Python 3.10 should also work.
37 |
38 | If you encounter problems installing pyhash, you might have to downgrade setuptools to a version below 58.
39 |
40 | ## Download
41 | ### CALVIN Dataset
42 | If you want to train on the [CALVIN](https://github.com/mees/calvin) dataset, choose a split with:
43 | ```bash
44 | cd $HULC_ROOT/dataset
45 | sh download_data.sh D | ABC | ABCD | debug
46 | ```
47 | If you have previously downloaded the dataset in the calvin repo, you can just set the paths to that folder via the command line when starting a training.
48 | If you want to get started without downloading the whole dataset, use the argument `debug` to download a small debug dataset (1.3 GB).
49 | ### Language Embeddings
50 | We provide the precomputed embeddings of the different Language Models we evaluate in the paper.
51 | The script assumes the corresponding split has been already downloaded.
52 | ```bash
53 | cd $HULC_ROOT/dataset
54 | sh download_lang_embeddings.sh D | ABC | ABCD
55 | ```
56 |
57 | ### Pre-trained Models
58 | We provide our final models for all three CALVIN splits.
59 | ```bash
60 | cd $HULC_ROOT/checkpoints
61 | sh download_model_weights.sh D | ABC | ABCD
62 | ```
63 | For instructions on how to use the pretrained models, see the training and evaluation sections.
64 |
65 | ## Hardware Requirements
66 |
67 | We leverage [Pytorch Lightning's](https://www.pytorchlightning.ai/) DDP implementation to scale our training to 8x NVIDIA GPUs with **12GB** memory each.
68 | Evaluating the models requires a single NVIDIA GPU with **8GB**. As each GPU receives a batch of 64 sequences (32 language + 32 vision), the effective batch size is 512 for all our experiments.
69 |
70 | Trained with:
71 | - **GPU** - 8x NVIDIA RTX 2080Ti
72 | - **CPU** - AMD EPYC 7502
73 | - **RAM** - 512GB
74 | - **OS** - Ubuntu 20.04
75 |
76 | With this setup, one epoch takes around 1.5 hours and the whole training with 30 epochs can be completed in 45 hours (without the evaluation callbacks).
77 |
78 | ## Training
79 | To train our HULC model on all available GPUs, run:
80 | ```
81 | python hulc/training.py trainer.devices=-1 datamodule.root_data_dir=path/to/dataset datamodule/datasets=vision_lang_shm
82 | ```
83 | The `vision_lang_shm` option loads the CALVIN dataset into shared memory at the beginning of the training,
84 | speeding up the data loading during training.
85 | The preparation of the shared memory cache will take some time
86 | (approx. 20 min on our SLURM cluster). \
87 | If you want to use the original data loader (e.g., for debugging), just override the command with `datamodule/datasets=vision_lang`. \
88 | For an additional speed-up, you can disable the evaluation callbacks during training by adding `~callbacks/rollout` and `~callbacks/rollout_lh`.
89 |
90 | If you have access to a SLURM cluster, follow this [guide](https://github.com/mees/hulc/blob/main/slurm_scripts/README.md).
91 |
92 | You can use our [pre-trained models](#pre-trained-models) to initialize a training run:
93 | ```
94 | python hulc/training.py trainer.devices=-1 datamodule.root_data_dir=path/to/dataset hydra.run.dir=$HULC_ROOT/checkpoints/HULC_D_D
95 | ```
96 | Note that this will log the training into the checkpoint folder.
97 |
98 | ### Ablations
99 | Multi-context imitation learning (MCIL; Lynch et al., 2019):
100 | ```
101 | python hulc/training.py trainer.devices=-1 datamodule.root_data_dir=path/to/dataset datamodule/datasets=vision_lang_shm model=mcil \
102 | datamodule=mcil
103 | ```
104 |
105 | Goal-conditioned behavior cloning (GCBC; Lynch et al., 2019):
106 | ```
107 | python hulc/training.py trainer.devices=-1 datamodule.root_data_dir=path/to/dataset datamodule/datasets=vision_lang_shm model=gcbc \
108 | ~callbacks/tsne_plot
109 | ```
110 |
111 |
112 | ## Evaluation
113 | See detailed inference instructions on the [CALVIN repo](https://github.com/mees/calvin#muscle-evaluation-the-calvin-challenge).
114 | ```
115 | python hulc/evaluation/evaluate_policy.py --dataset_path <path/to/dataset> --train_folder <path/to/training/folder>
116 | ```
117 | Set `--train_folder $HULC_ROOT/checkpoints/HULC_D_D` to evaluate our [pre-trained models](#pre-trained-models).
118 |
119 | Optional arguments:
120 |
121 | - `--checkpoint <path/to/checkpoint>`: by default, the evaluation loads the last checkpoint in the training log directory.
122 | You can instead specify the path to another checkpoint by adding this to the evaluation command.
123 | - `--debug`: print debug information and visualize the environment.
124 |
125 | ## Changelog
126 |
127 | ### 16 Sep 2022
128 | - **MAJOR BUG IN ABC and ABCD dataset:** If you downloaded these datasets before this date, you have to apply the following fixes:
129 | - Wrong language annotations in ABC and ABCD dataset. You can download the corrected language embeddings [here](https://github.com/mees/calvin/blob/main/dataset/README.md#language-embeddings).
130 | - Bug in `calvin_env` that only affects the generation of language embeddings.
131 | - Wrong `scene_info.npy` in ABC and ABCD dataset. Please replace as follows:
132 | ```
133 | cd task_ABCD_D
134 | wget http://calvin.cs.uni-freiburg.de/scene_info_fix/task_ABCD_D_scene_info.zip
135 | unzip task_ABCD_D_scene_info.zip && rm task_ABCD_D_scene_info.zip
136 | ```
137 | ```
138 | cd task_ABC_D
139 | wget http://calvin.cs.uni-freiburg.de/scene_info_fix/task_ABC_D_scene_info.zip
140 | unzip task_ABC_D_scene_info.zip && rm task_ABC_D_scene_info.zip
141 | ```
142 |
143 | ### 1 Sep 2022
144 | - Updated the language embeddings for the splits ABC and ABCD due to a bug in switching scenes during the automatic language labeling. Additionally, added various precomputed language embeddings.
145 |
146 | ## Acknowledgements
147 |
148 | This work uses code from the following open-source projects and datasets:
149 |
150 | #### CALVIN
151 | Original: [https://github.com/mees/calvin](https://github.com/mees/calvin)
152 | License: [MIT](https://github.com/mees/calvin/blob/main/LICENSE)
153 |
154 | #### Sentence-Transformers
155 | Original: [https://github.com/UKPLab/sentence-transformers](https://github.com/UKPLab/sentence-transformers)
156 | License: [Apache 2.0](https://github.com/UKPLab/sentence-transformers/blob/master/LICENSE)
157 |
158 | #### OpenAI CLIP
159 | Original: [https://github.com/openai/CLIP](https://github.com/openai/CLIP)
160 | License: [MIT](https://github.com/openai/CLIP/blob/main/LICENSE)
161 | ## Citations
162 |
163 | If you find the code useful, please cite:
164 |
165 | **HULC**
166 | ```bibtex
167 | @article{mees2022hulc,
168 | author={Oier Mees and Lukas Hermann and Wolfram Burgard},
169 | title={What Matters in Language Conditioned Robotic Imitation Learning Over Unstructured Data},
170 | journal={IEEE Robotics and Automation Letters (RA-L)},
171 | volume={7},
172 | number={4},
173 | pages={11205-11212},
174 | year={2022}
175 | }
176 | ```
177 | **CALVIN**
178 | ```bibtex
179 | @article{mees2022calvin,
180 | author = {Oier Mees and Lukas Hermann and Erick Rosete-Beas and Wolfram Burgard},
181 | title = {CALVIN: A Benchmark for Language-Conditioned Policy Learning for Long-Horizon Robot Manipulation Tasks},
182 | journal={IEEE Robotics and Automation Letters (RA-L)},
183 | volume={7},
184 | number={3},
185 | pages={7327-7334},
186 | year={2022}
187 | }
188 | ```
189 |
190 | ## License
191 |
192 | MIT License
193 |
--------------------------------------------------------------------------------
/checkpoints/download_model_weights.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Download, Unzip, and Remove zip
3 | if [ "$1" = "D" ]
4 | then
5 |
6 | echo "Downloading HULC Checkpoint for task_D_D ..."
7 | wget http://hulc.cs.uni-freiburg.de/model_weights/HULC_D_D.zip
8 | unzip HULC_D_D.zip && rm HULC_D_D.zip
9 | echo "finished!"
10 | elif [ "$1" = "ABC" ]
11 | then
12 |
13 | echo "Downloading HULC Checkpoint for task_ABC_D ..."
14 | wget http://hulc.cs.uni-freiburg.de/model_weights/HULC_ABC_D.zip
15 | unzip HULC_ABC_D.zip && rm HULC_ABC_D.zip
16 | echo "finished!"
17 |
18 | elif [ "$1" = "ABCD" ]
19 | then
20 |
21 | echo "Downloading HULC Checkpoint for task_ABCD_D ..."
22 | wget http://hulc.cs.uni-freiburg.de/model_weights/HULC_ABCD_D.zip
23 | unzip HULC_ABCD_D.zip && rm HULC_ABCD_D.zip
24 | echo "finished!"
25 |
26 | else
27 | echo "Failed: Usage download_model_weights.sh D | ABC | ABCD"
28 | exit 1
29 | fi
30 |
--------------------------------------------------------------------------------
/conf/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/__init__.py
--------------------------------------------------------------------------------
/conf/annotations/new_playtable_validation.yaml:
--------------------------------------------------------------------------------
1 | # rotation
2 | rotate_red_block_right: ["take the red block and rotate it to the right"]
3 | rotate_red_block_left: ["take the red block and rotate it to the left"]
4 | rotate_blue_block_right: ["take the blue block and rotate it to the right"]
5 | rotate_blue_block_left: ["take the blue block and rotate it to the left"]
6 | rotate_pink_block_right: ["take the pink block and rotate it to the right"]
7 | rotate_pink_block_left: ["take the pink block and rotate it to the left"]
8 |
9 | # sliding
10 | push_red_block_right: ["go push the red block right"]
11 | push_red_block_left: ["go push the red block left"]
12 | push_blue_block_right: ["go push the blue block right"]
13 | push_blue_block_left: ["go push the blue block left"]
14 | push_pink_block_right: ["go push the pink block right"]
15 | push_pink_block_left: ["go push the pink block left"]
16 |
17 | # open/close
18 | move_slider_left: [ "push the sliding door to the left side"]
19 | move_slider_right: [ "push the sliding door to the right side"]
20 | open_drawer: ["pull the handle to open the drawer"]
21 | close_drawer: ["push the handle to close the drawer"]
22 |
23 | # lifting
24 | lift_red_block_table: ["grasp and lift the red block"]
25 | lift_blue_block_table: ["grasp and lift the blue block"]
26 | lift_pink_block_table: ["grasp and lift the pink block"]
27 |
28 | lift_red_block_slider: [ "lift the red block from the sliding cabinet"]
29 | lift_blue_block_slider: [ "lift the blue block from the sliding cabinet"]
30 | lift_pink_block_slider: [ "lift the pink block from the sliding cabinet"]
31 |
32 | lift_red_block_drawer: ["Take the red block from the drawer"]
33 | lift_blue_block_drawer: ["Take the blue block from the drawer"]
34 | lift_pink_block_drawer: ["Take the pink block from the drawer"]
35 |
36 | place_in_slider: [ "store the grasped block in the sliding cabinet"]
37 | place_in_drawer: [ "store the grasped block in the drawer"]
38 |
39 | push_into_drawer: ["slide the block that it falls into the drawer"]
40 |
41 | stack_block: ["stack the grasped block"]
42 | unstack_block: ["remove the stacked block"]
43 |
44 | turn_on_lightbulb: ["use the switch to turn on the light bulb"]
45 | turn_off_lightbulb: ["use the switch to turn off the light bulb"]
46 | turn_on_led: ["press the button to turn on the led light"]
47 | turn_off_led: ["press the button to turn off the led light"]
48 |
--------------------------------------------------------------------------------
/conf/callbacks/checkpoint/all.yaml:
--------------------------------------------------------------------------------
1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint
2 | save_top_k: -1
3 | verbose: True
4 | dirpath: saved_models
5 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}'
6 |
--------------------------------------------------------------------------------
/conf/callbacks/checkpoint/clip_loss.yaml:
--------------------------------------------------------------------------------
1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint
2 | save_top_k: 3
3 | verbose: True
4 | monitor: val/val_pred_clip_loss
5 | mode: min
6 | dirpath: saved_models
7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}'
8 |
--------------------------------------------------------------------------------
/conf/callbacks/checkpoint/kl.yaml:
--------------------------------------------------------------------------------
1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint
2 | save_top_k: 3
3 | verbose: True
4 | monitor: train/kl_loss
5 | mode: max
6 | dirpath: saved_models
7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}'
8 |
--------------------------------------------------------------------------------
/conf/callbacks/checkpoint/lh_sr.yaml:
--------------------------------------------------------------------------------
1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint
2 | save_top_k: 3
3 | verbose: True
4 | monitor: eval_lh/avg_seq_len
5 | mode: max
6 | dirpath: saved_models
7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}'
8 | every_n_epochs: ${callbacks.rollout_lh.rollout_freq}
9 |
--------------------------------------------------------------------------------
/conf/callbacks/checkpoint/state_recon.yaml:
--------------------------------------------------------------------------------
1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint
2 | save_top_k: 3
3 | verbose: True
4 | monitor: val/state_recon_loss
5 | mode: min
6 | dirpath: saved_models
7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}'
8 |
--------------------------------------------------------------------------------
/conf/callbacks/checkpoint/task_sr.yaml:
--------------------------------------------------------------------------------
1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint
2 | save_top_k: 3
3 | verbose: True
4 | monitor: tasks/average_sr
5 | mode: max
6 | dirpath: saved_models
7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}'
8 | every_n_epochs: ${callbacks.rollout.rollout_freq}
9 |
--------------------------------------------------------------------------------
/conf/callbacks/checkpoint/val_action.yaml:
--------------------------------------------------------------------------------
1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint
2 | save_top_k: -1
3 | verbose: True
4 | monitor: val_act/action_loss_pp
5 | mode: min
6 | dirpath: saved_models
7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}'
8 |
--------------------------------------------------------------------------------
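
The checkpoint configs above are thin wrappers around PyTorch Lightning's `ModelCheckpoint`; each one only picks a metric to `monitor` and a `mode`. As a minimal sketch (assuming `pytorch-lightning`, `hydra-core`, and `omegaconf` are installed and the working directory is the repository root), such a config can be turned into a callback object by hand:

```python
# Illustrative sketch: instantiate one of the checkpoint configs directly.
# val_action.yaml is chosen because it has no ${...} interpolations, so it can
# be loaded standalone outside the full Hydra composition.
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.load("conf/callbacks/checkpoint/val_action.yaml")
checkpoint_cb = instantiate(cfg)  # -> pytorch_lightning.callbacks.ModelCheckpoint
print(checkpoint_cb.monitor, checkpoint_cb.mode)  # val_act/action_loss_pp min
```
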
/conf/callbacks/default.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | # - rollout: default
3 | - rollout_lh: default
4 | - checkpoint: all
5 | - tsne_plot: default
6 | - kl_schedule: constant
7 | - shm_signal: default
8 |
--------------------------------------------------------------------------------
/conf/callbacks/kl_schedule/constant.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.utils.kl_callbacks.KLConstantSchedule
2 |
--------------------------------------------------------------------------------
/conf/callbacks/kl_schedule/linear.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.utils.kl_callbacks.KLLinearSchedule
2 | start_epoch: 10
3 | end_epoch: 50
4 | max_kl_beta: ${loss.kl_beta}
5 |
--------------------------------------------------------------------------------
/conf/callbacks/kl_schedule/sigmoid.yaml:
--------------------------------------------------------------------------------
1 | # @package _group_
2 | _target_: hulc.utils.kl_callbacks.KLSigmoidSchedule
3 | start_epoch: 10
4 | end_epoch: 50
5 | max_kl_beta: ${loss.kl_beta}
6 |
--------------------------------------------------------------------------------
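
The `kl_schedule` configs select a callback from `hulc.utils.kl_callbacks` that anneals the KL weight (beta) of the latent plan objective over training. The snippet below is only an illustrative sketch of the linear variant, not the actual implementation; in particular, the `kl_beta` attribute on the LightningModule and the default `max_kl_beta` value are assumptions here.

```python
# Sketch of a linear KL-beta annealing callback in the spirit of linear.yaml.
# NOT the hulc.utils.kl_callbacks code; the `kl_beta` attribute is assumed.
import pytorch_lightning as pl


class LinearKLSchedule(pl.Callback):
    def __init__(self, start_epoch: int = 10, end_epoch: int = 50, max_kl_beta: float = 0.01):
        self.start_epoch = start_epoch
        self.end_epoch = end_epoch
        self.max_kl_beta = max_kl_beta

    def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
        # Ramp beta from 0 to max_kl_beta between start_epoch and end_epoch, then hold it.
        progress = (trainer.current_epoch - self.start_epoch) / (self.end_epoch - self.start_epoch)
        pl_module.kl_beta = self.max_kl_beta * min(max(progress, 0.0), 1.0)
```
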
/conf/callbacks/rollout/default.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - /callbacks/rollout/tasks@tasks: new_playtable_tasks
3 | - /annotations@val_annotations: new_playtable_validation
4 | _target_: calvin_agent.rollout.rollout.Rollout
5 | _recursive_: false
6 | env_cfg:
7 | _target_: calvin_agent.wrappers.calvin_env_wrapper.CalvinEnvWrapper
8 | skip_epochs: 1
9 | rollout_freq: 5
10 | video: true
11 | num_rollouts_per_task: 10
12 | check_percentage_of_batch: 1 # which percentage of sequences do we want to check for possible tasks
13 | ep_len: 120
14 | empty_cache: false
15 | log_video_to_file: false
16 | save_dir: ./videos
17 | add_goal_thumbnail: true
18 | min_window_size: ${datamodule.datasets.vision_dataset.min_window_size}
19 | max_window_size: ${datamodule.datasets.vision_dataset.max_window_size}
20 | id_selection_strategy: "select_longest"
21 | lang_folder: ${datamodule.datasets.lang_dataset.lang_folder}
22 |
--------------------------------------------------------------------------------
/conf/callbacks/rollout/tasks/new_playtable_tasks.yaml:
--------------------------------------------------------------------------------
1 | _target_: calvin_env.envs.tasks.Tasks
2 | tasks:
3 | # rotation
4 | rotate_red_block_right: [rotate_object, 'block_red', -60]
5 | rotate_red_block_left: [rotate_object, 'block_red', 60]
6 | rotate_blue_block_right: [ rotate_object, 'block_blue', -60 ]
7 | rotate_blue_block_left: [ rotate_object, 'block_blue', 60 ]
8 | rotate_pink_block_right: [ rotate_object, 'block_pink', -60 ]
9 | rotate_pink_block_left: [ rotate_object, 'block_pink', 60 ]
10 |
11 | # pushing
12 | push_red_block_right: [ push_object, 'block_red', 0.1, 0]
13 | push_red_block_left: [ push_object, 'block_red', -0.1, 0]
14 | push_blue_block_right: [ push_object, 'block_blue', 0.1, 0]
15 | push_blue_block_left: [ push_object, 'block_blue', -0.1, 0]
16 | push_pink_block_right: [ push_object, 'block_pink', 0.1, 0]
17 | push_pink_block_left: [ push_object, 'block_pink', -0.1, 0]
18 |
19 | # open/close
20 | move_slider_left: [move_door_rel, 'base__slide', 0.15] # 0 - 0.56
21 | move_slider_right: [move_door_rel, 'base__slide', -0.15]
22 | open_drawer: [move_door_rel, 'base__drawer', 0.12] # 0 - 0.24
23 | close_drawer: [move_door_rel, 'base__drawer', -0.12]
24 |
25 | # lifting
26 | lift_red_block_table: [lift_object, 'block_red', 0.05, 'table', 'base_link']
27 | lift_red_block_slider: [lift_object, 'block_red', 0.03, 'table', 'plank_link']
28 | lift_red_block_drawer: [lift_object, 'block_red', 0.05, 'table', 'drawer_link']
29 | lift_blue_block_table: [ lift_object, 'block_blue', 0.05, 'table', 'base_link' ]
30 | lift_blue_block_slider: [ lift_object, 'block_blue', 0.03, 'table', 'plank_link' ]
31 | lift_blue_block_drawer: [ lift_object, 'block_blue', 0.05, 'table', 'drawer_link' ]
32 | lift_pink_block_table: [ lift_object, 'block_pink', 0.05, 'table', 'base_link' ]
33 | lift_pink_block_slider: [ lift_object, 'block_pink', 0.03, 'table', 'plank_link' ]
34 | lift_pink_block_drawer: [ lift_object, 'block_pink', 0.05, 'table', 'drawer_link' ]
35 |
36 | # placing
37 | place_in_slider: [place_object, 'table', 'plank_link']
38 | place_in_drawer: [place_object, 'table', 'drawer_link']
39 |
40 | # stacking
41 | stack_block: [stack_objects]
42 | unstack_block: [unstack_objects]
43 |
44 | # lights
45 | turn_on_lightbulb: [toggle_light, 'lightbulb', 0, 1]
46 | turn_off_lightbulb: [toggle_light, 'lightbulb', 1, 0]
47 | turn_on_led: [ toggle_light, 'led', 0, 1 ]
48 | turn_off_led: [ toggle_light, 'led', 1, 0 ]
49 |
50 | # pushing into drawer
51 | push_into_drawer: [push_object_into, ['block_red', 'block_blue', 'block_pink'], 'table', 'base_link', 'table', 'drawer_link']
52 |
53 | # signatures of available base tasks:
54 | # rotate_object(obj_name, degrees, x_y_threshold=30, z_treshold=180):
55 | # push_object(obj_name, x_direction, y_direction):
56 | # lift_object(obj_name, z_direction, surface_body=None, surface_link=None):
57 | # place_object(dest_body, dest_link=None):
58 | # push_object_into(obj_name, src_body, dest_body):
59 | # move_door_abs(start_info, end_info, obj_name, joint_name, start_threshold, end_threshold):
60 | # move_door_rel(obj_name, joint_name, threshold):
61 |
--------------------------------------------------------------------------------
/conf/callbacks/rollout_lh/default.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - /callbacks/rollout/tasks@tasks: new_playtable_tasks
3 | - /annotations@val_annotations: new_playtable_validation
4 | _target_: calvin_agent.rollout.rollout_long_horizon.RolloutLongHorizon
5 | _recursive_: false
6 | env_cfg:
7 | _target_: calvin_agent.wrappers.calvin_env_wrapper.CalvinEnvWrapper
8 | skip_epochs: 1
9 | rollout_freq: 1
10 | num_videos: 16
11 | num_sequences: 128
12 | replan_freq: 30
13 | ep_len: 360
14 | empty_cache: false
15 | log_video_to_file: false
16 | save_dir: ./videos
17 | lang_folder: ${datamodule.datasets.lang_dataset.lang_folder}
18 | debug: false
19 |
--------------------------------------------------------------------------------
/conf/callbacks/shm_signal/default.yaml:
--------------------------------------------------------------------------------
1 | _target_: calvin_agent.datasets.utils.shared_memory_utils.SignalCallback
2 |
--------------------------------------------------------------------------------
/conf/callbacks/tsne_plot/default.yaml:
--------------------------------------------------------------------------------
1 | _target_: calvin_agent.visualization.tsne_plot.TSNEPlot
2 | perplexity: 40
3 | n_jobs: 8
4 | plot_percentage: 0.2
5 | opacity: 0.3
6 | marker_size: 5
7 |
--------------------------------------------------------------------------------
/conf/config.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - callbacks: default
3 | - datamodule: default
4 | - model: hulc
5 | - loss: default
6 | - training: default_training
7 | - trainer: play_trainer
8 | - logger: wandb
9 | - override hydra/job_logging: colorlog
10 | - override hydra/hydra_logging: colorlog
11 | - _self_
12 |
13 | seed: 42
14 | log_dir: ../
15 | slurm: false
16 |
17 | hydra:
18 | run:
19 | dir: ${log_dir}/runs/${now:%Y-%m-%d}/${now:%H-%M-%S}
20 | sweep:
21 | dir: ${log_dir}/runs/${now:%Y-%m-%d}/${now:%H-%M-%S}
22 | subdir: ${hydra.job.override_dirname}
23 | job:
24 | config:
25 | override_dirname:
26 | exclude_keys:
27 | - log_dir
28 | - datamodule.root_data_dir
29 | - trainer.gpus
30 | - model.tsne_plot
31 | - datamodule.num_workers
32 | - trainer.limit_train_batches
33 | - trainer.limit_val_batches
34 | - model.action_decoder.load_action_bounds
35 |
--------------------------------------------------------------------------------
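
`config.yaml` is the root of the Hydra composition: the `defaults` list pulls in one option per config group, and command-line overrides such as `model=gcbc` or `datamodule.root_data_dir=...` swap or edit those choices. A minimal entry-point sketch that consumes it is shown below; the real entry point is `hulc/training.py`, and the relative `config_path` assumes the script sits one level below the repository root.

```python
# Minimal Hydra entry-point sketch; prints the fully composed config.
import hydra
from omegaconf import DictConfig, OmegaConf


@hydra.main(config_path="../conf", config_name="config")
def main(cfg: DictConfig) -> None:
    print(OmegaConf.to_yaml(cfg, resolve=False))
    # Config groups can then be instantiated, e.g.:
    # datamodule = hydra.utils.instantiate(cfg.datamodule)


if __name__ == "__main__":
    main()
```

Running it with, e.g., `datamodule.root_data_dir=/path/to/dataset model=gcbc` on the command line shows how overrides land in the composed config.
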
/conf/datamodule/datasets/lang_dataset/lang.yaml:
--------------------------------------------------------------------------------
1 | _target_: calvin_agent.datasets.disk_dataset.DiskDataset
2 | key: "lang"
3 | save_format: "npz"
4 | batch_size: 32
5 | min_window_size: 20
6 | max_window_size: 32
7 | proprio_state: ${datamodule.proprioception_dims}
8 | obs_space: ${datamodule.observation_space}
9 | skip_frames: 1
10 | pad: true
11 | lang_folder: "lang_paraphrase-MiniLM-L3-v2"
12 | aux_lang_loss_window: 8
13 | num_workers: 2
14 |
--------------------------------------------------------------------------------
/conf/datamodule/datasets/lang_dataset/lang_shm.yaml:
--------------------------------------------------------------------------------
1 | _target_: calvin_agent.datasets.shm_dataset.ShmDataset
2 | key: "lang"
3 | batch_size: 32
4 | min_window_size: 20
5 | max_window_size: 32
6 | proprio_state: ${datamodule.proprioception_dims}
7 | obs_space: ${datamodule.observation_space}
8 | pad: true
9 | lang_folder: "lang_paraphrase-MiniLM-L3-v2"
10 | aux_lang_loss_window: 8
11 | num_workers: 2
12 |
--------------------------------------------------------------------------------
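
Both language datasets sample windows of between `min_window_size` and `max_window_size` consecutive frames from the play data. Purely as a toy illustration of what those two numbers mean (the actual sampling logic lives in the calvin_agent dataset classes):

```python
# Toy illustration of min_window_size / max_window_size: each training example
# is a window of 20-32 consecutive frames drawn from a longer sequence.
import numpy as np

rng = np.random.default_rng(seed=0)
episode_frames = np.arange(64)                 # stand-in for a play sequence
min_window_size, max_window_size = 20, 32

window_size = rng.integers(min_window_size, max_window_size + 1)
start = rng.integers(0, len(episode_frames) - window_size + 1)
window = episode_frames[start : start + window_size]
assert min_window_size <= len(window) <= max_window_size
```
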
/conf/datamodule/datasets/lang_only.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - lang_dataset: lang
3 |
--------------------------------------------------------------------------------
/conf/datamodule/datasets/vision_dataset/vision.yaml:
--------------------------------------------------------------------------------
1 | _target_: calvin_agent.datasets.disk_dataset.DiskDataset
2 | key: "vis"
3 | save_format: "npz"
4 | batch_size: 32
5 | min_window_size: 20
6 | max_window_size: 32
7 | proprio_state: ${datamodule.proprioception_dims}
8 | obs_space: ${datamodule.observation_space}
9 | pad: true
10 | lang_folder: "lang_paraphrase-MiniLM-L3-v2"
11 | num_workers: 2
12 |
--------------------------------------------------------------------------------
/conf/datamodule/datasets/vision_dataset/vision_shm.yaml:
--------------------------------------------------------------------------------
1 | _target_: calvin_agent.datasets.shm_dataset.ShmDataset
2 | key: "vis"
3 | batch_size: 32
4 | min_window_size: 20
5 | max_window_size: 32
6 | proprio_state: ${datamodule.proprioception_dims}
7 | obs_space: ${datamodule.observation_space}
8 | pad: true
9 | lang_folder: "lang_paraphrase-MiniLM-L3-v2"
10 | num_workers: 2
11 |
--------------------------------------------------------------------------------
/conf/datamodule/datasets/vision_lang.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - vision_dataset: vision
3 | - lang_dataset: lang
4 |
--------------------------------------------------------------------------------
/conf/datamodule/datasets/vision_lang_shm.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - vision_dataset: vision_shm
3 | - lang_dataset: lang_shm
4 |
--------------------------------------------------------------------------------
/conf/datamodule/datasets/vision_only.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - vision_dataset: vision
3 |
--------------------------------------------------------------------------------
/conf/datamodule/default.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - datasets: vision_lang_shm
3 | - transforms: rand_shift
4 | - proprioception_dims: robot_no_joints #robot_full
5 | - observation_space: lang_rgb_static_gripper_rel_act
6 | _target_: calvin_agent.datasets.calvin_data_module.CalvinDataModule
7 | _recursive_: false
8 | root_data_dir: ???
9 | action_space: 7
10 | action_max: [1., 1., 1., 1., 1., 1., 1.,]
11 | action_min: [-1., -1., -1., -1., -1., -1., -1]
12 | shuffle_val: false
13 |
--------------------------------------------------------------------------------
/conf/datamodule/mcil.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - datasets: vision_lang_shm
3 | - transforms: play_basic
4 | - proprioception_dims: robot_no_joints #robot_full
5 | - observation_space: lang_rgb_static_gripper_abs_act
6 | _target_: calvin_agent.datasets.calvin_data_module.CalvinDataModule
7 | _recursive_: false
8 | root_data_dir: ???
9 | action_space: 7
10 | action_max: [1., 1., 1., 1., 1., 1., 1.,]
11 | action_min: [-1., -1., -1., -1., -1., -1., -1]
12 | shuffle_val: false
13 |
--------------------------------------------------------------------------------
/conf/datamodule/observation_space/all_mods_abs_act.yaml:
--------------------------------------------------------------------------------
1 | rgb_obs: ['rgb_static', 'rgb_gripper', 'rgb_tactile']
2 | depth_obs: ['depth_static', 'depth_gripper', 'depth_tactile']
3 | state_obs: ['robot_obs', 'scene_obs']
4 | actions: ['actions']
5 | language: ['language']
6 |
--------------------------------------------------------------------------------
/conf/datamodule/observation_space/lang_rgb_static_abs_act.yaml:
--------------------------------------------------------------------------------
1 | rgb_obs: ['rgb_static']
2 | depth_obs: []
3 | state_obs: ['robot_obs']
4 | actions: ['actions']
5 | language: ['language']
6 |
--------------------------------------------------------------------------------
/conf/datamodule/observation_space/lang_rgb_static_gripper_abs_act.yaml:
--------------------------------------------------------------------------------
1 | rgb_obs: ['rgb_static', 'rgb_gripper']
2 | depth_obs: []
3 | state_obs: ['robot_obs']
4 | actions: ['actions']
5 | language: ['language']
6 |
--------------------------------------------------------------------------------
/conf/datamodule/observation_space/lang_rgb_static_gripper_rel_act.yaml:
--------------------------------------------------------------------------------
1 | rgb_obs: ['rgb_static', 'rgb_gripper']
2 | depth_obs: []
3 | state_obs: ['robot_obs']
4 | actions: ['rel_actions']
5 | language: ['language']
6 |
--------------------------------------------------------------------------------
/conf/datamodule/observation_space/lang_rgb_static_rel_act.yaml:
--------------------------------------------------------------------------------
1 | rgb_obs: ['rgb_static']
2 | depth_obs: []
3 | state_obs: ['robot_obs']
4 | actions: ['rel_actions']
5 | language: ['language']
6 |
--------------------------------------------------------------------------------
/conf/datamodule/observation_space/lang_rgb_static_robot_scene_abs_act.yaml:
--------------------------------------------------------------------------------
1 | rgb_obs: ['rgb_static']
2 | depth_obs: []
3 | state_obs: ['robot_obs', 'scene_obs']
4 | actions: ['actions']
5 | language: ['language']
6 |
--------------------------------------------------------------------------------
/conf/datamodule/observation_space/lang_rgb_static_tactile_abs_act.yaml:
--------------------------------------------------------------------------------
1 | rgb_obs: ['rgb_static', 'rgb_tactile']
2 | depth_obs: []
3 | state_obs: ['robot_obs']
4 | actions: ['actions']
5 | language: ['language']
6 |
--------------------------------------------------------------------------------
/conf/datamodule/observation_space/lang_rgbd_both_abs_act.yaml:
--------------------------------------------------------------------------------
1 | rgb_obs: ['rgb_static', 'rgb_gripper']
2 | depth_obs: ['depth_static', 'depth_gripper']
3 | state_obs: ['robot_obs']
4 | actions: ['actions']
5 | language: ['language']
6 |
--------------------------------------------------------------------------------
/conf/datamodule/observation_space/lang_rgbd_both_rel_act.yaml:
--------------------------------------------------------------------------------
1 | rgb_obs: ['rgb_static', 'rgb_gripper']
2 | depth_obs: ['depth_static', 'depth_gripper']
3 | state_obs: ['robot_obs']
4 | actions: ['rel_actions']
5 | language: ['language']
6 |
--------------------------------------------------------------------------------
/conf/datamodule/observation_space/lang_rgbd_static_gripper_rel_act.yaml:
--------------------------------------------------------------------------------
1 | rgb_obs: ['rgb_static', 'rgb_gripper']
2 | depth_obs: ['depth_gripper']
3 | state_obs: ['robot_obs']
4 | actions: ['rel_actions']
5 | language: ['language']
6 |
--------------------------------------------------------------------------------
/conf/datamodule/observation_space/lang_rgbd_static_robot_abs_act.yaml:
--------------------------------------------------------------------------------
1 | rgb_obs: ['rgb_static']
2 | depth_obs: ['depth_static']
3 | state_obs: ['robot_obs']
4 | actions: ['actions']
5 | language: ['language']
6 |
--------------------------------------------------------------------------------
/conf/datamodule/observation_space/rgb_static_abs_act.yaml:
--------------------------------------------------------------------------------
1 | rgb_obs: ['rgb_static']
2 | depth_obs: []
3 | state_obs: ['robot_obs']
4 | actions: ['actions']
5 |
--------------------------------------------------------------------------------
/conf/datamodule/observation_space/rgb_static_robot_scene_abs_act.yaml:
--------------------------------------------------------------------------------
1 | rgb_obs: ['rgb_static']
2 | depth_obs: []
3 | state_obs: ['robot_obs', 'scene_obs']
4 | actions: ['actions']
5 |
--------------------------------------------------------------------------------
/conf/datamodule/observation_space/state_only.yaml:
--------------------------------------------------------------------------------
1 | rgb_obs: []
2 | depth_obs: []
3 | state_obs: ['robot_obs']
4 | actions: ['actions']
5 | language: ['language']
6 |
--------------------------------------------------------------------------------
/conf/datamodule/proprioception_dims/none.yaml:
--------------------------------------------------------------------------------
1 | n_state_obs: 0
2 | keep_indices: [[0, 0]]
3 | robot_orientation_idx: [3, 6]
4 | normalize: False
5 | normalize_robot_orientation: False
6 |
--------------------------------------------------------------------------------
/conf/datamodule/proprioception_dims/robot_full.yaml:
--------------------------------------------------------------------------------
1 | n_state_obs: 15
2 | keep_indices: [[0, 15]]
3 | robot_orientation_idx: [3, 6]
4 | normalize: True
5 | normalize_robot_orientation: True
6 |
--------------------------------------------------------------------------------
/conf/datamodule/proprioception_dims/robot_no_joints.yaml:
--------------------------------------------------------------------------------
1 | n_state_obs: 8
2 | keep_indices: [[0, 7], [14,15]]
3 | robot_orientation_idx: [3, 6]
4 | normalize: True
5 | normalize_robot_orientation: True
6 |
--------------------------------------------------------------------------------
/conf/datamodule/proprioception_dims/robot_no_joints_no_gripper_width.yaml:
--------------------------------------------------------------------------------
1 | n_state_obs: 7
2 | keep_indices: [[0, 6], [14,15]]
3 | robot_orientation_idx: [3, 6]
4 | normalize: True
5 | normalize_robot_orientation: True
6 |
--------------------------------------------------------------------------------
/conf/datamodule/proprioception_dims/robot_scene.yaml:
--------------------------------------------------------------------------------
1 | n_state_obs: 54
2 | keep_indices: [[0, 54]]
3 | robot_orientation_idx: [3, 6]
4 | normalize: True
5 | normalize_robot_orientation: True
6 |
--------------------------------------------------------------------------------
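
The `keep_indices` entries describe which slices of the proprioceptive state vector are kept; e.g. `robot_no_joints` keeps the first 7 dimensions plus dimension 14 of the 15-dimensional robot state, giving `n_state_obs: 8`. A small, purely illustrative example of that selection follows (the real slicing is done inside the calvin_agent dataset code):

```python
# Illustrative only: selecting proprioception dims according to keep_indices.
import numpy as np

robot_obs = np.arange(15, dtype=np.float32)   # placeholder 15-dim robot state
keep_indices = [[0, 7], [14, 15]]             # values from robot_no_joints.yaml
selected = np.concatenate([robot_obs[start:end] for start, end in keep_indices])
assert selected.shape[0] == 8                 # matches n_state_obs: 8
```
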
/conf/datamodule/transforms/clip.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | rgb_static:
3 | - _target_: torchvision.transforms.Resize
4 | size: 224
5 | - _target_: hulc.utils.transforms.RandomShiftsAug
6 | pad: 10
7 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor
8 | - _target_: torchvision.transforms.Normalize
9 | mean: [0.48145466, 0.4578275, 0.40821073]
10 | std: [0.26862954, 0.26130258, 0.27577711]
11 | rgb_gripper:
12 | - _target_: torchvision.transforms.Resize
13 | size: 84
14 | - _target_: hulc.utils.transforms.RandomShiftsAug
15 | pad: 4
16 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor
17 | - _target_: torchvision.transforms.Normalize
18 | mean: [0.48145466, 0.4578275, 0.40821073]
19 | std: [0.26862954, 0.26130258, 0.27577711]
20 | depth_static:
21 | - _target_: torchvision.transforms.Resize
22 | size: 200
23 | - _target_: calvin_agent.utils.transforms.AddDepthNoise
24 | shape: [1000.0]
25 | rate: [1000.0]
26 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise
27 | mean: [0.0]
28 | std: [0.01]
29 | depth_gripper:
30 | - _target_: torchvision.transforms.Resize
31 | size: 84
32 | # - _target_: calvin.utils.transforms.AddDepthNoise
33 | # shape: [ 1000.0 ]
34 | # rate: [ 1000.0 ]
35 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise
36 | mean: [ 0.0 ]
37 | std: [ 0.01 ]
38 | rgb_tactile:
39 | - _target_: torchvision.transforms.Resize
40 | size: 70
41 | - _target_: torchvision.transforms.RandomCrop
42 | size: 64
43 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor
44 | - _target_: torchvision.transforms.Normalize
45 | mean: [0.5]
46 | std: [0.5]
47 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise
48 | mean: [ 0.0 ]
49 | std: [ 0.01 ]
50 | depth_tactile:
51 | - _target_: torchvision.transforms.Resize
52 | size: 64
53 | - _target_: torchvision.transforms.Normalize
54 | mean: [0.1,]
55 | std: [0.2,]
56 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise
57 | mean: [ 0.0 ]
58 | std: [ 0.01 ]
59 | robot_obs:
60 | - _target_: calvin_agent.utils.transforms.NormalizeVector
61 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise
62 | mean: [ 0.0 ]
63 | std: [ 0.01 ]
64 | scene_obs:
65 | - _target_: calvin_agent.utils.transforms.NormalizeVector
66 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise
67 | mean: [ 0.0 ]
68 | std: [ 0.01 ]
69 | language:
70 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise
71 | mean: [ 0.0 ]
72 | std: [ 0.01 ]
73 |
74 |
75 | val:
76 | rgb_static:
77 | - _target_: torchvision.transforms.Resize
78 | size: 224
79 | - _target_: hulc.utils.transforms.ScaleImageTensor
80 | - _target_: torchvision.transforms.Normalize
81 | mean: [ 0.48145466, 0.4578275, 0.40821073 ]
82 | std: [ 0.26862954, 0.26130258, 0.27577711 ]
83 | rgb_gripper:
84 | - _target_: torchvision.transforms.Resize
85 | size: 84
86 | - _target_: hulc.utils.transforms.ScaleImageTensor
87 | - _target_: torchvision.transforms.Normalize
88 | mean: [ 0.48145466, 0.4578275, 0.40821073 ]
89 | std: [ 0.26862954, 0.26130258, 0.27577711 ]
90 | depth_static:
91 | - _target_: torchvision.transforms.Resize
92 | size: 200
93 | depth_gripper:
94 | - _target_: torchvision.transforms.Resize
95 | size: 84
96 | rgb_tactile:
97 | - _target_: torchvision.transforms.Resize
98 | size: 70
99 | - _target_: torchvision.transforms.RandomCrop
100 | size: 64
101 | - _target_: hulc.utils.transforms.ScaleImageTensor
102 | - _target_: torchvision.transforms.Normalize
103 | mean: [0.5]
104 | std: [0.5]
105 | depth_tactile:
106 | - _target_: torchvision.transforms.Resize
107 | size: 64
108 | - _target_: torchvision.transforms.Normalize
109 | mean: [0.1,]
110 | std: [0.2,]
111 | robot_obs:
112 | - _target_: hulc.utils.transforms.NormalizeVector
113 | scene_obs:
114 | - _target_: hulc.utils.transforms.NormalizeVector
115 |
--------------------------------------------------------------------------------
/conf/datamodule/transforms/play_basic.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | rgb_static:
3 | - _target_: torchvision.transforms.Resize
4 | size: 200
5 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor
6 | - _target_: torchvision.transforms.Normalize
7 | mean: [0.5,]
8 | std: [0.5,]
9 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise
10 | # mean: [0.0]
11 | # std: [0.01]
12 | rgb_gripper:
13 | - _target_: torchvision.transforms.Resize
14 | size: 84
15 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor
16 | - _target_: torchvision.transforms.Normalize
17 | mean: [0.5,]
18 | std: [0.5,]
19 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise
20 | # mean: [0.0]
21 | # std: [0.01]
22 | depth_static:
23 | - _target_: torchvision.transforms.Resize
24 | size: 200
25 | - _target_: calvin_agent.utils.transforms.AddDepthNoise
26 | shape: [1000.0]
27 | rate: [1000.0]
28 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise
29 | # mean: [0.0]
30 | # std: [0.01]
31 | depth_gripper:
32 | - _target_: torchvision.transforms.Resize
33 | size: 84
34 | # - _target_: calvin.utils.transforms.AddDepthNoise
35 | # shape: [ 1000.0 ]
36 | # rate: [ 1000.0 ]
37 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise
38 | mean: [ 0.0 ]
39 | std: [ 0.01 ]
40 | rgb_tactile:
41 | - _target_: torchvision.transforms.Resize
42 | size: 70
43 | - _target_: torchvision.transforms.RandomCrop
44 | size: 64
45 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor
46 | - _target_: torchvision.transforms.Normalize
47 | mean: [0.5]
48 | std: [0.5]
49 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise
50 | # mean: [ 0.0 ]
51 | # std: [ 0.01 ]
52 | depth_tactile:
53 | - _target_: torchvision.transforms.Resize
54 | size: 64
55 | - _target_: torchvision.transforms.Normalize
56 | mean: [0.1,]
57 | std: [0.2,]
58 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise
59 | # mean: [ 0.0 ]
60 | # std: [ 0.01 ]
61 | robot_obs:
62 | - _target_: calvin_agent.utils.transforms.NormalizeVector
63 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise
64 | # mean: [ 0.0 ]
65 | # std: [ 0.01 ]
66 | scene_obs:
67 | - _target_: calvin_agent.utils.transforms.NormalizeVector
68 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise
69 | # mean: [ 0.0 ]
70 | # std: [ 0.01 ]
71 | # language:
72 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise
73 | # mean: [ 0.0 ]
74 | # std: [ 0.01 ]
75 |
76 |
77 | val:
78 | rgb_static:
79 | - _target_: torchvision.transforms.Resize
80 | size: 200
81 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor
82 | - _target_: torchvision.transforms.Normalize
83 | mean: [0.5,]
84 | std: [0.5,]
85 | rgb_gripper:
86 | - _target_: torchvision.transforms.Resize
87 | size: 84
88 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor
89 | - _target_: torchvision.transforms.Normalize
90 | mean: [0.5,]
91 | std: [0.5,]
92 | depth_static:
93 | - _target_: torchvision.transforms.Resize
94 | size: 200
95 | depth_gripper:
96 | - _target_: torchvision.transforms.Resize
97 | size: 84
98 | rgb_tactile:
99 | - _target_: torchvision.transforms.Resize
100 | size: 70
101 | - _target_: torchvision.transforms.RandomCrop
102 | size: 64
103 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor
104 | - _target_: torchvision.transforms.Normalize
105 | mean: [0.5]
106 | std: [0.5]
107 | depth_tactile:
108 | - _target_: torchvision.transforms.Resize
109 | size: 64
110 | - _target_: torchvision.transforms.Normalize
111 | mean: [0.1,]
112 | std: [0.2,]
113 | robot_obs:
114 | - _target_: calvin_agent.utils.transforms.NormalizeVector
115 | scene_obs:
116 | - _target_: calvin_agent.utils.transforms.NormalizeVector
117 |
--------------------------------------------------------------------------------
/conf/datamodule/transforms/rand_shift.yaml:
--------------------------------------------------------------------------------
1 | train:
2 | rgb_static:
3 | - _target_: torchvision.transforms.Resize
4 | size: 200
5 | - _target_: hulc.utils.transforms.RandomShiftsAug
6 | pad: 10
7 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor
8 | - _target_: torchvision.transforms.Normalize
9 | mean: [0.5,]
10 | std: [0.5,]
11 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise
12 | # mean: [0.0]
13 | # std: [0.01]
14 | rgb_gripper:
15 | - _target_: torchvision.transforms.Resize
16 | size: 84
17 | - _target_: hulc.utils.transforms.RandomShiftsAug
18 | pad: 4
19 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor
20 | - _target_: torchvision.transforms.Normalize
21 | mean: [0.5,]
22 | std: [0.5,]
23 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise
24 | # mean: [0.0]
25 | # std: [0.01]
26 | depth_static:
27 | - _target_: torchvision.transforms.Resize
28 | size: 200
29 | - _target_: calvin_agent.utils.transforms.AddDepthNoise
30 | shape: [1000.0]
31 | rate: [1000.0]
32 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise
33 | # mean: [0.0]
34 | # std: [0.01]
35 | depth_gripper:
36 | - _target_: torchvision.transforms.Resize
37 | size: 84
38 | # - _target_: calvin.utils.transforms.AddDepthNoise
39 | # shape: [ 1000.0 ]
40 | # rate: [ 1000.0 ]
41 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise
42 | mean: [ 0.0 ]
43 | std: [ 0.01 ]
44 | rgb_tactile:
45 | - _target_: torchvision.transforms.Resize
46 | size: 70
47 | - _target_: torchvision.transforms.RandomCrop
48 | size: 64
49 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor
50 | - _target_: torchvision.transforms.Normalize
51 | mean: [0.5]
52 | std: [0.5]
53 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise
54 | # mean: [ 0.0 ]
55 | # std: [ 0.01 ]
56 | depth_tactile:
57 | - _target_: torchvision.transforms.Resize
58 | size: 64
59 | - _target_: torchvision.transforms.Normalize
60 | mean: [0.1,]
61 | std: [0.2,]
62 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise
63 | # mean: [ 0.0 ]
64 | # std: [ 0.01 ]
65 | robot_obs:
66 | - _target_: calvin_agent.utils.transforms.NormalizeVector
67 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise
68 | # mean: [ 0.0 ]
69 | # std: [ 0.01 ]
70 | scene_obs:
71 | - _target_: calvin_agent.utils.transforms.NormalizeVector
72 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise
73 | # mean: [ 0.0 ]
74 | # std: [ 0.01 ]
75 | # language:
76 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise
77 | # mean: [ 0.0 ]
78 | # std: [ 0.01 ]
79 |
80 |
81 | val:
82 | rgb_static:
83 | - _target_: torchvision.transforms.Resize
84 | size: 200
85 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor
86 | - _target_: torchvision.transforms.Normalize
87 | mean: [0.5,]
88 | std: [0.5,]
89 | rgb_gripper:
90 | - _target_: torchvision.transforms.Resize
91 | size: 84
92 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor
93 | - _target_: torchvision.transforms.Normalize
94 | mean: [0.5,]
95 | std: [0.5,]
96 | depth_static:
97 | - _target_: torchvision.transforms.Resize
98 | size: 200
99 | depth_gripper:
100 | - _target_: torchvision.transforms.Resize
101 | size: 84
102 | rgb_tactile:
103 | - _target_: torchvision.transforms.Resize
104 | size: 70
105 | - _target_: torchvision.transforms.RandomCrop
106 | size: 64
107 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor
108 | - _target_: torchvision.transforms.Normalize
109 | mean: [0.5]
110 | std: [0.5]
111 | depth_tactile:
112 | - _target_: torchvision.transforms.Resize
113 | size: 64
114 | - _target_: torchvision.transforms.Normalize
115 | mean: [0.1,]
116 | std: [0.2,]
117 | robot_obs:
118 | - _target_: calvin_agent.utils.transforms.NormalizeVector
119 | scene_obs:
120 | - _target_: calvin_agent.utils.transforms.NormalizeVector
121 |
--------------------------------------------------------------------------------
/conf/inference/config_inference.yaml:
--------------------------------------------------------------------------------
1 | train_folder: ??? # path to the training run folder (the one containing .hydra/config.yaml)
2 | load_checkpoint: ???
3 | seed: 42
4 | log_dir: /tmp
5 | visualize: True
6 | ep_len: 120
7 | replan_freq: 30
8 | processes: 1
9 |
10 | hydra:
11 | run:
12 | dir: ${log_dir}/inference_runs/${now:%Y-%m-%d}/${now:%H-%M-%S}
13 |
14 | defaults:
15 | - override hydra/job_logging: colorlog
16 | - override hydra/hydra_logging: colorlog
17 |
--------------------------------------------------------------------------------
/conf/lang_ann.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - callbacks: default
3 | - datamodule: default
4 | - model: sbert
5 | - loss: default
6 | - training: default_training
7 | - trainer: play_trainer
8 | - logger: wandb
9 | - annotations@train_instructions: new_playtable
10 | - annotations@val_instructions: new_playtable_validation
11 |
12 | - override hydra/job_logging: colorlog
13 | - override hydra/hydra_logging: colorlog
14 | - override datamodule/observation_space: state_only
15 | seed: 42
16 | log_dir: ../
17 | slurm: false
18 | eps: 0.01
19 | postprocessing: true
20 | lang_folder: "lang_annotations"
21 | with_text: false
22 | reannotate: false
23 | prior_steps_window: 16
24 | validation_scene: calvin_scene_D
25 | compute_tsne: false
26 |
27 | hydra:
28 | run:
29 | dir: ${log_dir}/runs/${now:%Y-%m-%d}/${now:%H-%M-%S}_${hydra.job.override_dirname}
30 | sweep:
31 | dir: ${log_dir}/runs/${now:%Y-%m-%d}/${now:%H-%M-%S}
32 | subdir: ${hydra.job.override_dirname}
33 | job:
34 | config:
35 | override_dirname:
36 | exclude_keys:
37 | - log_dir
38 | - datamodule.root_data_dir
39 | - trainer.gpus
40 | - model.tsne_plot
41 | - datamodule.num_workers
42 | - trainer.limit_train_batches
43 | - trainer.limit_val_batches
44 | - model.decoder.load_action_bounds
45 |
--------------------------------------------------------------------------------
/conf/logger/tb_logger.yaml:
--------------------------------------------------------------------------------
1 | _target_: pytorch_lightning.loggers.TensorBoardLogger
2 | save_dir: .
3 | name: play_lmp
4 | version: ""
5 |
--------------------------------------------------------------------------------
/conf/logger/wandb.yaml:
--------------------------------------------------------------------------------
1 | _target_: pytorch_lightning.loggers.WandbLogger
2 | save_dir: .
3 | name: play_lmp
4 | group: play_lmp
5 | log_model: false
6 | project: "multi_play"
7 | entity: "multimodal_control"
8 | id: ???
9 |
--------------------------------------------------------------------------------
/conf/loss/default.yaml:
--------------------------------------------------------------------------------
1 | kl_beta: 0.01
2 | state_recon_beta: 0.5
3 | kl_balancing_mix: 0.8
4 | bc_z_auxiliary_loss_beta: 1.0
5 | mia_auxiliary_loss_beta: 1.0
6 | clip_auxiliary_loss_beta: 3.0
7 |
--------------------------------------------------------------------------------
/conf/model/action_decoder/deterministic.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.decoders.deterministic_decoder.DeterministicDecoder
2 | hidden_size: 2048
3 | out_features: ${datamodule.action_space}
4 | policy_rnn_dropout_p: 0.0
5 | perceptual_features: ??
6 | latent_goal_features: ${model.visual_goal.latent_goal_features}
7 | plan_features: ???
8 | criterion: HuberLoss # MSELoss
9 | num_layers: 2
10 | rnn_model: rnn_decoder
11 | perceptual_emb_slice: [64, 128]
12 | gripper_control: true
13 |
--------------------------------------------------------------------------------
/conf/model/action_decoder/hulc_default.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.decoders.logistic_decoder_rnn.LogisticDecoderRNN
2 | n_mixtures: 10
3 | hidden_size: 2048
4 | out_features: ${datamodule.action_space}
5 | log_scale_min: -7.0
6 | act_max_bound: ${datamodule.action_max}
7 | act_min_bound: ${datamodule.action_min}
8 | dataset_dir: ${datamodule.root_data_dir}
9 | load_action_bounds: false
10 | num_classes: 10
11 | latent_goal_features: ${model.visual_goal.latent_goal_features}
12 | plan_features: ???
13 | perceptual_features: ???
14 | gripper_alpha: 1.0
15 | perceptual_emb_slice: [64, 128]
16 | policy_rnn_dropout_p: 0.0
17 | num_layers: 2
18 | rnn_model: rnn_decoder
19 | gripper_control: true
20 | discrete_gripper: true
21 |
--------------------------------------------------------------------------------
/conf/model/action_decoder/mcil_default.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.decoders.logistic_decoder_rnn.LogisticDecoderRNN
2 | n_mixtures: 10
3 | hidden_size: 2048
4 | out_features: ${datamodule.action_space}
5 | log_scale_min: -7.0
6 | act_max_bound: ${datamodule.action_max}
7 | act_min_bound: ${datamodule.action_min}
8 | dataset_dir: ${datamodule.root_data_dir}
9 | load_action_bounds: false
10 | num_classes: 256
11 | latent_goal_features: ${model.visual_goal.latent_goal_features}
12 | plan_features: ???
13 | perceptual_features: ???
14 | gripper_alpha: 1.0
15 | policy_rnn_dropout_p: 0.0
16 | num_layers: 2
17 | rnn_model: rnn_decoder
18 | gripper_control: false
19 | discrete_gripper: false
20 |
--------------------------------------------------------------------------------
/conf/model/bc_z_lang_decoder/default.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.auxiliary_loss_networks.bc_z_lang_decoder.BCZLangDecoder
2 | in_features: ${model.plan_recognition.fc_hidden_size}
3 | lang_dim: ${model.language_goal.in_features}
4 |
--------------------------------------------------------------------------------
/conf/model/bc_z_lang_decoder/none.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/bc_z_lang_decoder/none.yaml
--------------------------------------------------------------------------------
/conf/model/clip_lang.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.encoders.clip_lang_encoder.LangClip
2 | freeze_backbone: true
3 | model_name: "RN50" # "RN101", "RN50x4", "RN50x16", "ViT-B/32", "ViT-B/16"
4 |
--------------------------------------------------------------------------------
/conf/model/distribution/continuous.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.utils.distributions.Distribution
2 | dist: "continuous"
3 | plan_features: 256
4 |
--------------------------------------------------------------------------------
/conf/model/distribution/discrete.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.utils.distributions.Distribution
2 | dist: "discrete"
3 | category_size: 32
4 | class_size: 32
5 |
--------------------------------------------------------------------------------
/conf/model/gcbc.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - perceptual_encoder: gripper_cam
3 | - plan_proposal: default
4 | - plan_recognition: transformers
5 | - distribution: discrete
6 | - visual_goal: default
7 | - language_goal: default
8 | - action_decoder: hulc_default
9 | - optimizer: adam
10 | - lr_scheduler: constant
11 | - bc_z_lang_decoder: none
12 | - mia_lang_discriminator: none
13 | - proj_vis_lang: default
14 | - /annotations@val_instructions: new_playtable_validation
15 |
16 | _target_: hulc.models.gcbc.GCBC
17 | _recursive_: false
18 |
19 | kl_beta: ${loss.kl_beta}
20 | kl_balancing_mix: ${loss.kl_balancing_mix}
21 | state_recons: false
22 | state_recon_beta: ${loss.state_recon_beta}
23 | use_bc_z_auxiliary_loss: false
24 | bc_z_auxiliary_loss_beta: ${loss.bc_z_auxiliary_loss_beta}
25 | use_mia_auxiliary_loss: false
26 | mia_auxiliary_loss_beta: ${loss.mia_auxiliary_loss_beta}
27 | replan_freq: 30
28 | use_clip_auxiliary_loss: true
29 | clip_auxiliary_loss_beta: ${loss.clip_auxiliary_loss_beta}
30 |
--------------------------------------------------------------------------------
/conf/model/hulc.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - perceptual_encoder: gripper_cam
3 | - plan_proposal: default
4 | - plan_recognition: transformers
5 | - distribution: discrete
6 | - visual_goal: default
7 | - language_goal: default
8 | - action_decoder: hulc_default
9 | - optimizer: adam
10 | - lr_scheduler: constant
11 | - bc_z_lang_decoder: none
12 | - mia_lang_discriminator: none
13 | - proj_vis_lang: default
14 | - /annotations@val_instructions: new_playtable_validation
15 |
16 | _target_: hulc.models.hulc.Hulc
17 | _recursive_: false
18 |
19 | kl_beta: ${loss.kl_beta}
20 | kl_balancing_mix: ${loss.kl_balancing_mix}
21 | state_recons: false
22 | state_recon_beta: ${loss.state_recon_beta}
23 | use_bc_z_auxiliary_loss: false
24 | bc_z_auxiliary_loss_beta: ${loss.bc_z_auxiliary_loss_beta}
25 | use_mia_auxiliary_loss: false
26 | mia_auxiliary_loss_beta: ${loss.mia_auxiliary_loss_beta}
27 | replan_freq: 30
28 | use_clip_auxiliary_loss: true
29 | clip_auxiliary_loss_beta: ${loss.clip_auxiliary_loss_beta}
30 |
--------------------------------------------------------------------------------
/conf/model/language_encoder/default.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.encoders.lang_encoder.LanguageEncoder
2 | language_features: 384
3 | hidden_size: 2048
4 | out_features: 256
5 | word_dropout_p: 0.0
6 | activation_function: ReLU #ELU
7 |
--------------------------------------------------------------------------------
/conf/model/language_encoder/none.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/language_encoder/none.yaml
--------------------------------------------------------------------------------
/conf/model/language_goal/default.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.encoders.goal_encoders.LanguageGoalEncoder
2 | in_features: 384
3 | hidden_size: 2048
4 | latent_goal_features: 32
5 | l2_normalize_goal_embeddings: False
6 | activation_function: ReLU #ELU
7 | word_dropout_p: 0.0
8 |
--------------------------------------------------------------------------------
/conf/model/language_goal/none.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/language_goal/none.yaml
--------------------------------------------------------------------------------
/conf/model/lr_scheduler/constant.yaml:
--------------------------------------------------------------------------------
1 | _target_: transformers.get_constant_schedule
2 |
--------------------------------------------------------------------------------
/conf/model/lr_scheduler/cosine_schedule_with_warmup.yaml:
--------------------------------------------------------------------------------
1 | _target_: transformers.get_cosine_schedule_with_warmup
2 | num_training_steps: -1 # -1 specifies to infer the number of training steps
3 | num_warmup_steps: 0.1 # a float value determines the percentage of training steps to use as warmup
4 | num_cycles: 0.5
5 |
--------------------------------------------------------------------------------
/conf/model/lr_scheduler/linear_schedule_with_warmup.yaml:
--------------------------------------------------------------------------------
1 | _target_: transformers.get_linear_schedule_with_warmup
2 | num_training_steps: -1 # -1 specifies to infer the number of training steps
3 | num_warmup_steps: 0.1 # a float value determines the percentage of training steps to use as warmup
4 |
--------------------------------------------------------------------------------
/conf/model/mcil.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 | - perceptual_encoder: gripper_cam
3 | - plan_proposal: default
4 | - plan_recognition: birnn
5 | - distribution: continuous
6 | - visual_goal: default
7 | - language_goal: default
8 | - action_decoder: mcil_default
9 | - optimizer: adam
10 | - lr_scheduler: constant
11 | - bc_z_lang_decoder: none
12 | - mia_lang_discriminator: none
13 | - proj_vis_lang: none
14 | - /annotations@val_instructions: new_playtable_validation
15 |
16 | _target_: hulc.models.hulc.Hulc
17 | _recursive_: false
18 |
19 | kl_beta: ${loss.kl_beta}
20 | kl_balancing_mix: ${loss.kl_balancing_mix}
21 | state_recons: false
22 | state_recon_beta: ${loss.state_recon_beta}
23 | use_bc_z_auxiliary_loss: false
24 | bc_z_auxiliary_loss_beta: ${loss.bc_z_auxiliary_loss_beta}
25 | use_mia_auxiliary_loss: false
26 | mia_auxiliary_loss_beta: ${loss.mia_auxiliary_loss_beta}
27 | replan_freq: 30
28 | use_clip_auxiliary_loss: false
29 | clip_auxiliary_loss_beta: ${loss.clip_auxiliary_loss_beta}
30 |
--------------------------------------------------------------------------------
/conf/model/mia_lang_discriminator/default.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.auxiliary_loss_networks.mia_lang_discriminator.MIALangDiscriminator
2 | in_features: ${model.proj_vis_lang.output_dim}
3 | lang_dim: ${model.proj_vis_lang.output_dim}
4 | dropout_p: 0.0
5 |
--------------------------------------------------------------------------------
/conf/model/mia_lang_discriminator/none.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/mia_lang_discriminator/none.yaml
--------------------------------------------------------------------------------
/conf/model/optimizer/adam.yaml:
--------------------------------------------------------------------------------
1 | _target_: torch.optim.Adam
2 | lr: ${training.lr}
3 | #weight_decay: 1e-6
4 |
--------------------------------------------------------------------------------
/conf/model/optimizer/adamw.yaml:
--------------------------------------------------------------------------------
1 | _target_: torch.optim.AdamW
2 | lr: ${training.lr}
3 | weight_decay: 1e-6
4 | #amsgrad: False
5 |
--------------------------------------------------------------------------------
/conf/model/optimizer/sgd.yaml:
--------------------------------------------------------------------------------
1 | _target_: torch.optim.SGD
2 | lr: ${training.lr}
3 | momentum: 0.9
4 | #weight_decay: 0.0005
5 |
--------------------------------------------------------------------------------
/conf/model/perceptual_encoder/default.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.perceptual_encoders.concat_encoders.ConcatEncoders
2 | _recursive_: false
3 |
4 | defaults:
5 | - rgb_static: default
6 | - rgb_gripper: none
7 | - depth_static: none
8 | - depth_gripper: none
9 | - proprio: none
10 | - tactile: none
11 | - state_decoder: none
12 |
--------------------------------------------------------------------------------
/conf/model/perceptual_encoder/depth_gripper/default.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.perceptual_encoders.vision_network_gripper.VisionNetwork
2 | input_width: 84
3 | input_height: 84
4 | activation_function: ReLU #ELU
5 | dropout_vis_fc: 0.0
6 | l2_normalize_output: false
7 | visual_features: 64
8 | conv_encoder: nature_cnn
9 | num_c: 1
10 |
--------------------------------------------------------------------------------
/conf/model/perceptual_encoder/depth_gripper/none.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/depth_gripper/none.yaml
--------------------------------------------------------------------------------
/conf/model/perceptual_encoder/depth_static/default.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.perceptual_encoders.vision_network.VisionNetwork
2 | input_width: 200
3 | input_height: 200
4 | activation_function: ReLU #ELU
5 | dropout_vis_fc: 0.0
6 | l2_normalize_output: false
7 | visual_features: 64
8 | num_c: 1
9 |
--------------------------------------------------------------------------------
/conf/model/perceptual_encoder/depth_static/none.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/depth_static/none.yaml
--------------------------------------------------------------------------------
/conf/model/perceptual_encoder/gripper_cam.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.perceptual_encoders.concat_encoders.ConcatEncoders
2 | _recursive_: false
3 |
4 | defaults:
5 | - rgb_static: default
6 | - rgb_gripper: default
7 | - depth_static: none
8 | - depth_gripper: none
9 | - proprio: none
10 | - tactile: none
11 |
--------------------------------------------------------------------------------
/conf/model/perceptual_encoder/proprio/identity.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.perceptual_encoders.proprio_encoder.IdentityEncoder
2 | proprioception_dims: ${datamodule.proprioception_dims}
3 |
--------------------------------------------------------------------------------
/conf/model/perceptual_encoder/proprio/none.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/proprio/none.yaml
--------------------------------------------------------------------------------
/conf/model/perceptual_encoder/rgb_gripper/default.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.perceptual_encoders.vision_network_gripper.VisionNetwork
2 | input_width: 84
3 | input_height: 84
4 | activation_function: ReLU #ELU
5 | dropout_vis_fc: 0.0
6 | l2_normalize_output: false
7 | visual_features: 64
8 | conv_encoder: nature_cnn
9 | num_c: 3
10 |
--------------------------------------------------------------------------------
/conf/model/perceptual_encoder/rgb_gripper/none.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/rgb_gripper/none.yaml
--------------------------------------------------------------------------------
/conf/model/perceptual_encoder/rgb_static/clip.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.perceptual_encoders.vision_clip.VisionClip
2 | visual_features: 64
3 | freeze_backbone: true
4 | model_name: "RN50" # "RN101", "RN50x4", "RN50x16", "ViT-B/32", "ViT-B/16"
5 |
--------------------------------------------------------------------------------
/conf/model/perceptual_encoder/rgb_static/default.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.perceptual_encoders.vision_network.VisionNetwork
2 | input_width: 200
3 | input_height: 200
4 | activation_function: ReLU #ELU
5 | dropout_vis_fc: 0.0
6 | l2_normalize_output: false
7 | visual_features: 64
8 | num_c: 3
9 | use_sinusoid: false
10 | spatial_softmax_temp: 1.0
11 |
--------------------------------------------------------------------------------
/conf/model/perceptual_encoder/state_decoder/default.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.auxiliary_loss_networks.state_decoder.StateDecoder
2 | visual_features: 64
3 | n_state_obs: 8
4 |
--------------------------------------------------------------------------------
/conf/model/perceptual_encoder/state_decoder/none.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/state_decoder/none.yaml
--------------------------------------------------------------------------------
/conf/model/perceptual_encoder/tactile/default.yaml:
--------------------------------------------------------------------------------
1 | _target_: calvin.models.perceptual_encoders.tactile_encoder.TactileEncoder
2 | visual_features: 64
3 |
--------------------------------------------------------------------------------
/conf/model/perceptual_encoder/tactile/none.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/tactile/none.yaml
--------------------------------------------------------------------------------
/conf/model/plan_proposal/default.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.plan_encoders.plan_proposal_net.PlanProposalNetwork
2 | perceptual_features: ???
3 | latent_goal_features: ${model.visual_goal.latent_goal_features}
4 | plan_features: ???
5 | activation_function: ReLU #ELU
6 | hidden_size: 2048
7 |
--------------------------------------------------------------------------------
/conf/model/plan_recognition/birnn.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.plan_encoders.plan_recognition_net.PlanRecognitionBiRNNNetwork
2 | in_features: ???
3 | plan_features: 256
4 | action_space: ${datamodule.action_space}
5 | birnn_dropout_p: 0.0
6 | rnn_type: nn.RNN # nn.GRU
7 |
--------------------------------------------------------------------------------
/conf/model/plan_recognition/transformers.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.plan_encoders.plan_recognition_net.PlanRecognitionTransformersNetwork
2 | num_heads: 8
3 | num_layers: 2
4 | encoder_hidden_size: 2048
5 | fc_hidden_size: 4096
6 | in_features: ??
7 | plan_features: ???
8 | action_space: ${datamodule.action_space}
9 | dropout_p: 0.1
10 | encoder_normalize: false
11 | positional_normalize: false
12 | position_embedding: true
13 | max_position_embeddings: ${datamodule.datasets.lang_dataset.max_window_size}
14 |
--------------------------------------------------------------------------------
/conf/model/proj_vis_lang/default.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.auxiliary_loss_networks.proj_vis_lang.ProjVisLang
2 | im_dim: ${model.plan_recognition.fc_hidden_size}
3 | lang_dim: ${model.language_goal.latent_goal_features}
4 | output_dim: ${model.language_goal.latent_goal_features}
5 | proj_lang: true
6 |
--------------------------------------------------------------------------------
/conf/model/proj_vis_lang/none.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/proj_vis_lang/none.yaml
--------------------------------------------------------------------------------
/conf/model/sbert.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.encoders.language_network.SBert
2 | nlp_model: "all-MiniLM-L6-v2"
3 |
--------------------------------------------------------------------------------
/conf/model/visual_goal/default.yaml:
--------------------------------------------------------------------------------
1 | _target_: hulc.models.encoders.goal_encoders.VisualGoalEncoder
2 | in_features: ???
3 | hidden_size: 2048
4 | latent_goal_features: 32
5 | l2_normalize_goal_embeddings: False
6 | activation_function: ReLU #ELU
7 |
--------------------------------------------------------------------------------
/conf/trainer/play_trainer.yaml:
--------------------------------------------------------------------------------
1 | devices: 1
2 | accelerator: gpu
3 | precision: 16
4 | val_check_interval: 1.0
5 | max_epochs: 100
6 | sync_batchnorm: false
7 |
--------------------------------------------------------------------------------
/conf/training/default_training.yaml:
--------------------------------------------------------------------------------
1 | lr: 0.0002
2 |
--------------------------------------------------------------------------------
/dataset/README.md:
--------------------------------------------------------------------------------
1 | # Dataset
2 | The CALVIN dataset comes with 6 hours of teleoperated play data in each of the 4 environments.
3 | You can use [this script](scripts/visualize_dataset.py) to visualize the dataset.
4 |
5 | ## Download
6 |
7 | We provide a download script to download the three different splits or a small debug dataset:
8 |
9 | **1.** [Split D->D](http://calvin.cs.uni-freiburg.de/dataset/task_D_D.zip) (166 GB):
10 | ```bash
11 | $ cd $CALVIN_ROOT/dataset
12 | $ sh download_data.sh D
13 | ```
14 | **2.** [Split ABC->D](http://calvin.cs.uni-freiburg.de/dataset/task_ABC_D.zip) (517 GB)
15 | ```bash
16 | $ cd $CALVIN_ROOT/dataset
17 | $ sh download_data.sh ABC
18 | ```
19 | **3.** [Split ABCD->D](http://calvin.cs.uni-freiburg.de/dataset/task_ABCD_D.zip) (656 GB)
20 | ```bash
21 | $ cd $CALVIN_ROOT/dataset
22 | $ sh download_data.sh ABCD
23 | ```
24 |
25 | **4.** [Small debug dataset](http://calvin.cs.uni-freiburg.de/dataset/calvin_debug_dataset.zip) (1.3 GB)
26 | ```bash
27 | $ cd $CALVIN_ROOT/dataset
28 | $ sh download_data.sh debug
29 | ```
30 |
31 | ## Language Embeddings
32 | Since Sep 16, 2022, additional language embeddings are part of the dataset on the server. If you downloaded the dataset before that date,
33 | you can manually download the embeddings by running
34 | ```
35 | cd $CALVIN_ROOT/dataset
36 | sh download_lang_embeddings.sh D | ABC | ABCD
37 | ```
38 | Currently, the available embeddings are:
39 | - lang_all-distilroberta-v1
40 | - lang_all-MiniLM-L6-v2
41 | - lang_all-mpnet-base-v2
42 | - lang_BERT
43 | - lang_clip_resnet50
44 | - lang_clip_ViTB32
45 | - lang_huggingface_distilroberta
46 | - lang_huggingface_mpnet
47 | - lang_msmarco-bert-base-dot-v5
48 | - lang_paraphrase-MiniLM-L3-v2
49 |
50 | ## Data Structure
51 | Each interaction timestep is stored as a dictionary inside a numpy file and contains all corresponding sensory observations, the different action spaces, state information and language annotations.
52 | ### Camera Observations
53 | The keys to access the different camera observations are:
54 | ```
55 | ['rgb_static'] (dtype=np.uint8, shape=(200, 200, 3)),
56 | ['rgb_gripper'] (dtype=np.uint8, shape=(84, 84, 3)),
57 | ['rgb_tactile'] (dtype=np.uint8, shape=(160, 120, 6)),
58 | ['depth_static'] (dtype=np.float32, shape=(200, 200)),
59 | ['depth_gripper'] (dtype=np.float32, shape=(84, 84)),
60 | ['depth_tactile'] (dtype=np.float32, shape=(160, 120, 2))
61 | ```
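As a minimal sketch of how to inspect a single timestep (the path below is a placeholder; episodes are stored as `episode_XXXXXX.npz` files, as used elsewhere in this repo):

```python
import numpy as np

# Placeholder path: adjust split, folder and frame index to your local copy.
frame = np.load("task_D_D/training/episode_0000000.npz")

print(sorted(frame.files))               # all stored keys of this timestep
rgb_static = frame["rgb_static"]         # uint8, (200, 200, 3)
depth_gripper = frame["depth_gripper"]   # float32, (84, 84)
```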
62 | ### Actions
63 | Actions are in Cartesian space and define the desired tcp pose with respect to the world frame and the binary gripper action.
64 | The keys to access the 7-DOF absolute and relative actions are:
65 | (tcp = tool center point, i.e. a virtual frame between the gripper finger tips of the robot)
66 | ```
67 | ['actions']
68 | (dtype=np.float32, shape=(7,))
69 | tcp position (3): x,y,z in absolute world coordinates
70 | tcp orientation (3): euler angles x,y,z in absolute world coordinates
71 | gripper_action (1): binary (close = -1, open = 1)
72 |
73 | ['rel_actions']
74 | (dtype=np.float32, shape=(7,))
75 | tcp position (3): x,y,z in relative world coordinates normalized and clipped to (-1, 1) with scaling factor 50
76 | tcp orientation (3): euler angles x,y,z in relative world coordinates normalized and clipped to (-1, 1) with scaling factor 20
77 | gripper_action (1): binary (close = -1, open = 1)
78 | ```
79 | For inference, the CALVIN environment accepts both absolute and relative actions. To use absolute actions, the action is specified as a 3-tuple
80 | `action = ((x,y,z), (euler_x, euler_y, euler_z), (gripper))`. To use relative actions, the action is specified as a
81 | 7-tuple `action = (x,y,z, euler_x, euler_y, euler_z, gripper)`. IMPORTANT: the environment expects the relative actions
82 | to be scaled like the `rel_actions` in the dataset.
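The snippet below is only an illustrative sketch of the two action formats; constructing the environment itself is omitted (see `hulc/evaluation/rollouts_interactive.py`), so the final step call is left as a comment:

```python
import numpy as np

# Absolute action: ((x, y, z), (euler_x, euler_y, euler_z), gripper)
abs_action = ((0.1, -0.2, 0.5), (3.14, 0.0, 1.57), 1)

# Relative action: flat 7-tuple, already scaled like `rel_actions` in the dataset
rel_action = np.array([0.2, 0.0, -0.1, 0.0, 0.0, 0.1, -1.0], dtype=np.float32)

# With an instantiated CALVIN env, either format can be passed to the step function:
# obs, _, _, info = env.step(rel_action)
```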
83 |
84 | ### State Observation
85 | The key to access the scene state information, which contains the position and orientation of all objects in the scene
86 | (we do not use this information, in order to better capture the challenges present in real-world settings):
87 | ```
88 | ['scene_obs']
89 | (dtype=np.float32, shape=(24,))
90 | sliding door (1): joint state
91 | drawer (1): joint state
92 | button (1): joint state
93 | switch (1): joint state
94 | lightbulb (1): on=1, off=0
95 | green light (1): on=1, off=0
96 | red block (6): (x, y, z, euler_x, euler_y, euler_z)
97 | blue block (6): (x, y, z, euler_x, euler_y, euler_z)
98 | pink block (6): (x, y, z, euler_x, euler_y, euler_z)
99 | ```
100 | The robot proprioceptive information, which also includes the joint positions, can be accessed with:
101 | ```
102 | ['robot_obs']
103 | (dtype=np.float32, shape=(15,))
104 | tcp position (3): x,y,z in world coordinates
105 | tcp orientation (3): euler angles x,y,z in world coordinates
106 | gripper opening width (1): in meter
107 | arm_joint_states (7): in rad
108 | gripper_action (1): binary (close = -1, open = 1)
109 | ```
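As a sketch of how the two layouts above translate into indices (the file path is again a placeholder), the state vectors can be unpacked as follows:

```python
import numpy as np

frame = np.load("task_D_D/training/episode_0000000.npz")  # placeholder path

robot_obs = frame["robot_obs"]               # float32, (15,)
tcp_pos, tcp_orn = robot_obs[:3], robot_obs[3:6]
gripper_width = robot_obs[6]
arm_joint_states = robot_obs[7:14]
gripper_action = robot_obs[14]

scene_obs = frame["scene_obs"]               # float32, (24,)
sliding_door, drawer, button, switch = scene_obs[:4]
lightbulb, green_light = scene_obs[4:6]
red_block_pose = scene_obs[6:12]             # (x, y, z, euler_x, euler_y, euler_z)
```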
110 | ### Language Annotations
111 | The language annotations are in a subdirectory of the train and validation folders called `lang_annotations`.
112 | The file `auto_lang_ann.npy` contains the language annotations and their embeddings, along with additional metadata such as the task id and the sequence indices.
113 | ```
114 | ['language']['ann']: list of raw language
115 | ['language']['task']: list of task_id
116 | ['language']['emb']: precomputed miniLM language embedding
117 | ['info']['indx']: list of start and end indices corresponding to the precomputed language embeddings
118 | ```
119 | The `embeddings.npy` file is only present in the validation folder; it contains the embeddings that are used during the rollouts (test inference) to condition the policy.
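As a sketch (assuming the file stores a pickled Python dictionary, which is how numpy serializes `dict` objects; the path is a placeholder), the annotations can be loaded like this:

```python
import numpy as np

path = "task_D_D/training/lang_annotations/auto_lang_ann.npy"  # placeholder path
ann = np.load(path, allow_pickle=True).item()

print(ann["language"]["ann"][0])     # raw language instruction
print(ann["language"]["task"][0])    # corresponding task id
print(ann["language"]["emb"].shape)  # precomputed MiniLM embeddings
print(ann["info"]["indx"][0])        # (start, end) frame indices of that sequence
```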
120 |
121 | ## Visualize Language Annotations
122 | We provide a script to generate a video that visualizes the language annotations of the recorded play data.
123 | By default we visualize the first 100 sequences, but feel free to visualize more (just change this [line](https://github.com/mees/calvin/blob/main/calvin_models/calvin_agent/utils/visualize_annotations.py#L57)).
124 | An example video can be generated with:
125 | ```
126 | cd $CALVIN_ROOT/calvin_models/calvin_agent
127 | python utils/visualize_annotations.py datamodule.root_data_dir=$CALVIN_ROOT/dataset/task_D_D/ datamodule/observation_space=lang_rgb_static
128 | ```
129 |
--------------------------------------------------------------------------------
/dataset/download_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Download, Unzip, and Remove zip
4 | if [ "$1" = "D" ]
5 | then
6 |
7 | echo "Downloading task_D_D ..."
8 | wget http://calvin.cs.uni-freiburg.de/dataset/task_D_D.zip
9 | unzip task_D_D.zip && rm task_D_D.zip
10 | echo "saved folder: task_D_D"
11 | elif [ "$1" = "ABC" ]
12 | then
13 |
14 | echo "Downloading task_ABC_D ..."
15 | wget http://calvin.cs.uni-freiburg.de/dataset/task_ABC_D.zip
16 | unzip task_ABC_D.zip && rm task_ABC_D.zip
17 | echo "saved folder: task_ABC_D"
18 |
19 | elif [ "$1" = "ABCD" ]
20 | then
21 |
22 | echo "Downloading task_ABCD_D ..."
23 | wget http://calvin.cs.uni-freiburg.de/dataset/task_ABCD_D.zip
24 | unzip task_ABCD_D.zip && rm task_ABCD_D.zip
25 | echo "saved folder: task_ABCD_D"
26 |
27 | elif [ "$1" = "debug" ]
28 | then
29 |
30 | echo "Downloading debug dataset ..."
31 | wget http://calvin.cs.uni-freiburg.de/dataset/calvin_debug_dataset.zip
32 | unzip calvin_debug_dataset.zip && rm calvin_debug_dataset.zip
33 | echo "saved folder: calvin_debug_dataset"
34 |
35 |
36 | else
37 | echo "Failed: Usage download_data.sh D | ABC | ABCD | debug"
38 | exit 1
39 | fi
40 |
--------------------------------------------------------------------------------
/dataset/download_lang_embeddings.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Download, Unzip, and Remove zip
3 | if [ "$1" = "D" ]
4 | then
5 |
6 | echo "Downloading Language Embeddings for task_D_D ..."
7 | cd task_D_D
8 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/D_D_lang_embs_train.zip
9 | unzip D_D_lang_embs_train.zip && rm D_D_lang_embs_train.zip
10 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/D_D_lang_embs_val.zip
11 | unzip D_D_lang_embs_val.zip && rm D_D_lang_embs_val.zip
12 | echo "finished!"
13 | elif [ "$1" = "ABC" ]
14 | then
15 |
16 | echo "Downloading Language Embeddings for task_ABC_D ..."
17 | cd task_ABC_D
18 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/ABC_D_lang_embs_train.zip
19 | unzip ABC_D_lang_embs_train.zip && rm ABC_D_lang_embs_train.zip
20 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/ABC_D_lang_embs_val.zip
21 | unzip ABC_D_lang_embs_val.zip && rm ABC_D_lang_embs_val.zip
22 | echo "finished!"
23 |
24 | elif [ "$1" = "ABCD" ]
25 | then
26 |
27 | echo "Downloading Language Embeddings for task_ABCD_D ..."
28 | cd task_ABCD_D
29 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/ABCD_D_lang_embs_train.zip
30 | unzip ABCD_D_lang_embs_train.zip && rm ABCD_D_lang_embs_train.zip
31 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/ABCD_D_lang_embs_val.zip
32 | unzip ABCD_D_lang_embs_val.zip && rm ABCD_D_lang_embs_val.zip
33 | echo "finished!"
34 |
35 | else
36 | echo "Failed: Usage download_lang_embeddings.sh D | ABC | ABCD"
37 | exit 1
38 | fi
39 |
--------------------------------------------------------------------------------
/hulc/__init__.py:
--------------------------------------------------------------------------------
1 | """Hierarchical Universal Language Conditioned Policies implementation in PyTorch.
2 | :copyright: 2022 by Oier Mees
3 | :license: MIT, see LICENSE for more details.
4 | """
5 |
6 | __version__ = "0.0.1"
7 | __project__ = "HULC"
8 | __author__ = "Oier Mees"
9 | __license__ = "MIT"
10 | __email__ = "meeso@informatik.uni-freiburg.de"
11 |
--------------------------------------------------------------------------------
/hulc/evaluation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/evaluation/__init__.py
--------------------------------------------------------------------------------
/hulc/evaluation/evaluate_policy.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 | import sys
5 |
6 | # This is for using the locally installed repo clone when using slurm
7 | from calvin_agent.evaluation.evaluate_policy import evaluate_policy
8 |
9 | sys.path.insert(0, Path(__file__).absolute().parents[2].as_posix())
10 | from calvin_agent.evaluation.utils import get_default_model_and_env
11 | from calvin_agent.utils.utils import get_all_checkpoints, get_checkpoints_for_epochs, get_last_checkpoint
12 | from pytorch_lightning import seed_everything
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 |
17 | def get_epoch(checkpoint):
18 | if "=" not in checkpoint.stem:
19 | return "0"
20 |     return checkpoint.stem.split("=")[1]
21 |
22 |
23 | def main():
24 | seed_everything(0, workers=True) # type:ignore
25 | parser = argparse.ArgumentParser(description="Evaluate a trained model on multistep sequences with language goals.")
26 | parser.add_argument("--dataset_path", type=str, help="Path to the dataset root directory.")
27 |
28 | # arguments for loading default model
29 | parser.add_argument(
30 | "--train_folder", type=str, help="If calvin_agent was used to train, specify path to the log dir."
31 | )
32 | parser.add_argument(
33 | "--checkpoints",
34 | type=str,
35 | default=None,
36 | help="Comma separated list of epochs for which checkpoints will be loaded",
37 | )
38 | parser.add_argument(
39 | "--checkpoint",
40 | type=str,
41 | default=None,
42 | help="Path of the checkpoint",
43 | )
44 | parser.add_argument(
45 | "--last_k_checkpoints",
46 | type=int,
47 | help="Specify the number of checkpoints you want to evaluate (starting from last). Only used for calvin_agent.",
48 | )
49 |
50 | parser.add_argument("--debug", action="store_true", help="Print debug info and visualize environment.")
51 |
52 | parser.add_argument("--eval_log_dir", default=None, type=str, help="Where to log the evaluation results.")
53 |
54 | parser.add_argument("--device", default=0, type=int, help="CUDA device")
55 | args = parser.parse_args()
56 |
57 | assert "train_folder" in args
58 |
59 | checkpoints = []
60 | if args.checkpoints is None and args.last_k_checkpoints is None and args.checkpoint is None:
61 | print("Evaluating model with last checkpoint.")
62 | checkpoints = [get_last_checkpoint(Path(args.train_folder))]
63 | elif args.checkpoints is not None:
64 | print(f"Evaluating model with checkpoints {args.checkpoints}.")
65 | checkpoints = get_checkpoints_for_epochs(Path(args.train_folder), args.checkpoints)
66 | elif args.checkpoints is None and args.last_k_checkpoints is not None:
67 | print(f"Evaluating model with last {args.last_k_checkpoints} checkpoints.")
68 | checkpoints = get_all_checkpoints(Path(args.train_folder))[-args.last_k_checkpoints :]
69 | elif args.checkpoint is not None:
70 | checkpoints = [Path(args.checkpoint)]
71 |
72 | env = None
73 | for checkpoint in checkpoints:
74 | epoch = get_epoch(checkpoint)
75 | model, env, _ = get_default_model_and_env(
76 | args.train_folder,
77 | args.dataset_path,
78 | checkpoint,
79 | env=env,
80 | device_id=args.device,
81 | )
82 | evaluate_policy(model, env, epoch, eval_log_dir=args.eval_log_dir, debug=args.debug, create_plan_tsne=True)
83 |
84 |
85 | if __name__ == "__main__":
86 | main()
87 |
--------------------------------------------------------------------------------
/hulc/evaluation/rollouts_interactive.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from pathlib import Path
3 |
4 | from calvin_agent.evaluation.utils import imshow_tensor
5 | from calvin_agent.utils.utils import get_last_checkpoint
6 | import cv2
7 | import hydra
8 | import numpy as np
9 | from omegaconf import DictConfig, OmegaConf
10 | from omegaconf.errors import MissingMandatoryValue
11 | from pytorch_lightning import seed_everything
12 | import torch
13 |
14 | from hulc.models.hulc import Hulc
15 |
16 | logger = logging.getLogger(__name__)
17 |
18 |
19 | def get_checkpoint(cfg):
20 | try:
21 | checkpoint = cfg.load_checkpoint
22 | except MissingMandatoryValue:
23 | checkpoint = get_last_checkpoint(Path(cfg.train_folder))
24 | return checkpoint
25 |
26 |
27 | def format_sftp_path(cfg):
28 | """
29 | When using network mount from nautilus, format path
30 | """
31 | if cfg.train_folder.startswith("sftp"):
32 | cfg.train_folder = "/run/user/9984/gvfs/sftp:host=" + cfg.train_folder[7:]
33 |
34 |
35 | @hydra.main(config_path="../../conf/inference", config_name="config_inference")
36 | def test_policy(input_cfg: DictConfig) -> None:
37 | """
38 | Run inference on trained policy.
39 | Arguments:
40 | train_folder (str): path of trained model.
41 | load_checkpoint (str): optional model checkpoint. If not specified, the last checkpoint is taken by default.
42 |         +datamodule.root_data_dir (str): /path/dataset when running inference on another machine than where it was trained.
43 |         visualize (bool): whether to visualize the policy rollouts (default True).
44 | """
45 | # when mounting remote folder with sftp, format path
46 | format_sftp_path(input_cfg)
47 | # load config used during training
48 | train_cfg_path = Path(input_cfg.train_folder) / ".hydra/config.yaml"
49 | train_cfg = OmegaConf.load(train_cfg_path)
50 |
51 | # merge configs to keep current cmd line overrides
52 | cfg = OmegaConf.merge(train_cfg, input_cfg)
53 | seed_everything(cfg.seed)
54 |
55 | # since we don't use the trainer during inference, manually set up data_module
56 | data_module = hydra.utils.instantiate(cfg.datamodule, num_workers=4)
57 | data_module.prepare_data()
58 | data_module.setup()
59 | dataloader = data_module.val_dataloader()
60 | dataset = dataloader.dataset.datasets["vis"]
61 | env = hydra.utils.instantiate(cfg.callbacks.rollout.env_cfg, dataset, torch.device("cuda:0"), show_gui=False)
62 |
63 | tasks = hydra.utils.instantiate(cfg.callbacks.rollout.tasks)
64 | checkpoint = get_checkpoint(cfg)
65 | logger.info("Loading model from checkpoint.")
66 | model = Hulc.load_from_checkpoint(checkpoint)
67 | model.freeze()
68 | # model.action_decoder._setup_action_bounds(cfg.datamodule.root_data_dir, None, None)
69 | model = model.cuda(0)
70 | logger.info("Successfully loaded model.")
71 |
72 | ep_start_end_ids = np.sort(np.load(dataset.abs_datasets_dir / "ep_start_end_ids.npy"), axis=0)
73 |
74 | for s, e in ep_start_end_ids:
75 | i = start_i = s
76 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz"
77 | data = np.load(file)
78 | obs = env.reset(scene_obs=data["scene_obs"], robot_obs=data["robot_obs"])
79 | start_info = env.get_info()
80 | current_img_obs = start_img_obs = obs["rgb_obs"]
81 | start_state_obs = obs["state_obs"]
82 | goal_imgs = obs["rgb_obs"]
83 | goal_state = obs["state_obs"]
84 | scene_obs = data["scene_obs"]
85 | robot_obs = data["robot_obs"]
86 | while 1:
87 | imshow_tensor("current_img", current_img_obs[0], wait=1)
88 | imshow_tensor("start", start_img_obs[0], wait=1)
89 | imshow_tensor("goal", goal_imgs[0], wait=1)
90 | cv2.imshow("keylistener", np.zeros((300, 300)))
91 | k = cv2.waitKey(0) % 256
92 | if k == ord("s"):
93 | start_info = env.get_info()
94 | start_img_obs = obs["rgb_obs"]
95 | start_state_obs = obs["state_obs"]
96 | scene_obs = data["scene_obs"]
97 | robot_obs = data["robot_obs"]
98 | start_i = i
99 | elif k == ord("w"):
100 | end_info = env.get_info()
101 | print(tasks.get_task_info(start_info, end_info))
102 | goal_imgs = obs["rgb_obs"]
103 | goal_state = obs["state_obs"]
104 | print(f"steps: {i - start_i}")
105 | elif k == ord("r"):
106 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz"
107 | data = np.load(file)
108 | obs = env.reset(scene_obs=data["scene_obs"])
109 | current_img_obs = obs["rgb_obs"]
110 | elif k == ord("a"):
111 | i -= 1
112 | i = np.clip(i, s, e)
113 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz"
114 | data = np.load(file)
115 | obs = env.reset(scene_obs=data["scene_obs"], robot_obs=data["robot_obs"])
116 | current_img_obs = obs["rgb_obs"]
117 |
118 | elif k == ord("d"):
119 | i += 1
120 | i = np.clip(i, s, e)
121 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz"
122 | data = np.load(file)
123 | obs = env.reset(scene_obs=data["scene_obs"], robot_obs=data["robot_obs"])
124 | current_img_obs = obs["rgb_obs"]
125 | elif k == ord("q"):
126 | i -= 100
127 | i = np.clip(i, s, e)
128 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz"
129 | data = np.load(file)
130 | obs = env.reset(scene_obs=data["scene_obs"], robot_obs=data["robot_obs"])
131 | current_img_obs = obs["rgb_obs"]
132 |
133 | elif k == ord("e"):
134 | i += 100
135 | i = np.clip(i, s, e)
136 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz"
137 | data = np.load(file)
138 | obs = env.reset(scene_obs=data["scene_obs"], robot_obs=data["robot_obs"])
139 | current_img_obs = obs["rgb_obs"]
140 |
141 | elif k == ord("f"):
142 | env.reset(scene_obs=scene_obs, robot_obs=robot_obs)
143 | rollout(model, env, tasks, cfg, start_info, start_img_obs, start_state_obs, goal_imgs, goal_state)
144 | obs = env.reset(scene_obs=scene_obs, robot_obs=robot_obs)
145 | current_img_obs = obs["rgb_obs"]
146 | i = start_i
147 |             elif k == ord("n"):  # next episode
148 | break
149 |
150 |
151 | def rollout(model, env, tasks, cfg, start_info, current_img_obs, current_state_obs, goal_imgs, goal_state):
152 | # goal image is last step of the episode
153 | # goal_imgs = [goal_img.unsqueeze(0).cuda() for goal_img in goal_imgs]
154 | goal_imgs = goal_imgs[0].contiguous()
155 | for step in range(cfg.ep_len):
156 |         # replan every replan_freq steps (default 30, i.e. every second)
157 | if step % cfg.replan_freq == 0:
158 | plan, latent_goal = model.get_pp_plan_vision(
159 | current_img_obs, goal_imgs, current_state_obs, goal_state
160 | ) # type: ignore
161 | imshow_tensor("current_img", current_img_obs[0], wait=1)
162 |
163 | # use plan to predict actions with current observations
164 | action = model.predict_with_plan(current_img_obs, current_state_obs, latent_goal, plan)
165 | obs, _, _, current_info = env.step(action)
166 | # check if current step solves a task
167 | current_task_info = tasks.get_task_info(start_info, current_info)
168 | if len(current_task_info) > 0:
169 | print(current_task_info)
170 | # update current observation
171 | current_img_obs = obs["rgb_obs"]
172 | current_state_obs = obs["state_obs"]
173 |
174 |
175 | if __name__ == "__main__":
176 | test_policy()
177 |
--------------------------------------------------------------------------------
/hulc/evaluation/run_multiple.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import multiprocessing
3 | import os
4 | from pathlib import Path
5 | import subprocess
6 |
7 | from calvin_agent.utils.utils import get_all_checkpoints
8 | import numpy as np
9 |
10 |
11 | def get_log_dir(log_dir):
12 | log_dir = Path(log_dir)
13 | os.makedirs(log_dir, exist_ok=True)
14 | return log_dir
15 |
16 |
17 | def intervals(parts, duration):
18 | part_duration = duration / parts
19 | return [str(int(i * part_duration)) + "-" + str(int(((i + 1) * part_duration) - 1)) for i in range(parts)]
20 |
21 |
22 | def main():
23 | """
24 |     This script launches evaluate_policy.py for the specified training dir 8 times in parallel, each with a different subset of checkpoints.
25 | """
26 | parser = argparse.ArgumentParser(description="Evaluate a trained model on multistep sequences with language goals.")
27 | parser.add_argument("--dataset_path", type=str, help="Path to the dataset root directory.")
28 |
29 | parser.add_argument(
30 | "--train_folder", type=str, help="If calvin_agent was used to train, specify path to the log dir."
31 | )
32 | parser.add_argument("--max_epoch", type=int, default=30, help="Evaluate until which epoch.")
33 | parser.add_argument(
34 | "--eval_log_dir", type=str, help="If calvin_agent was used to train, specify path to the log dir."
35 | )
36 |
37 | args = parser.parse_args()
38 | eval_log_dir = get_log_dir(args.eval_log_dir)
39 |
40 | eval_script = (Path(__file__).parent / "evaluate_policy.py").as_posix()
41 | training_dir = Path(args.train_folder)
42 | checkpoints = get_all_checkpoints(training_dir)
43 | epochs = [str(e) for chk in checkpoints if (e := int(chk.stem.split("=")[1])) <= args.max_epoch]
44 | split_epochs = np.array_split(epochs, 8)
45 | epoch_args = [",".join(arr) for arr in split_epochs]
46 | max_cpu_count = multiprocessing.cpu_count()
47 | local_cpus = intervals(8, max_cpu_count)
48 | for i, epoch_arg in enumerate(epoch_args):
49 | cmd = [
50 | "taskset",
51 | "--cpu-list",
52 | local_cpus[i],
53 | "python",
54 | eval_script,
55 | "--checkpoints",
56 | epoch_arg,
57 | "--dataset_path",
58 | args.dataset_path,
59 | "--train_folder",
60 | args.train_folder,
61 | "--eval_log_dir",
62 | args.eval_log_dir,
63 | "--device",
64 | str(i),
65 | ]
66 | std_out = eval_log_dir / f"stdout_{i}.out"
67 | std_err = eval_log_dir / f"stderr_{i}.err"
68 | with open(std_out, "wb") as out, open(std_err, "wb") as err:
69 | subprocess.Popen(cmd, stdout=out, stderr=err, preexec_fn=os.setpgrp)
70 |
71 |
72 | if __name__ == "__main__":
73 | main()
74 |
--------------------------------------------------------------------------------
/hulc/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/__init__.py
--------------------------------------------------------------------------------
/hulc/models/auxiliary_loss_networks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/auxiliary_loss_networks/__init__.py
--------------------------------------------------------------------------------
/hulc/models/auxiliary_loss_networks/bc_z_lang_decoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 |
5 | class BCZLangDecoder(nn.Module):
6 | def __init__(self, in_features: int, lang_dim: int):
7 | super().__init__()
8 | # include proprio info???
9 | self.mlp = nn.Sequential(
10 | nn.Linear(in_features=in_features, out_features=512),
11 | nn.ReLU(),
12 | nn.Linear(in_features=512, out_features=lang_dim),
13 | )
14 |
15 | def forward(self, x: torch.Tensor) -> torch.Tensor:
16 | x = self.mlp(x)
17 | return x
18 |
--------------------------------------------------------------------------------
/hulc/models/auxiliary_loss_networks/mia_lang_discriminator.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 |
5 | class MIALangDiscriminator(nn.Module):
6 | def __init__(self, in_features: int, lang_dim: int, dropout_p: float):
7 | super().__init__()
8 | self.mlp = nn.Sequential(
9 | nn.Linear(in_features=in_features + lang_dim, out_features=512),
10 | nn.ReLU(),
11 | nn.Dropout(dropout_p),
12 | nn.Linear(in_features=512, out_features=1),
13 | )
14 |
15 | def forward(self, vis_emb: torch.Tensor, lang_emb: torch.Tensor) -> torch.Tensor:
16 | x = torch.cat([vis_emb, lang_emb], dim=-1)
17 | x = self.mlp(x)
18 | return x
19 |
--------------------------------------------------------------------------------
/hulc/models/auxiliary_loss_networks/proj_vis_lang.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple
2 |
3 | import torch
4 | import torch.nn as nn
5 |
6 |
7 | class ProjVisLang(nn.Module):
8 | def __init__(self, im_dim: int, lang_dim: int, output_dim: int, proj_lang: bool = True):
9 | super().__init__()
10 | self.mlp_im = nn.Sequential(
11 | nn.Linear(in_features=im_dim, out_features=128),
12 | nn.ReLU(),
13 | nn.Linear(in_features=128, out_features=output_dim),
14 | )
15 | self.mlp_lang = None
16 | if proj_lang:
17 | self.mlp_lang = nn.Sequential(
18 | nn.Linear(in_features=lang_dim, out_features=128),
19 | nn.ReLU(),
20 | nn.Linear(in_features=128, out_features=output_dim),
21 | )
22 |
23 | def forward(self, vis_emb: torch.Tensor, lang_emb: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
24 | vis_emb = self.mlp_im(vis_emb)
25 | if self.mlp_lang is not None:
26 | lang_emb = self.mlp_lang(lang_emb)
27 | return vis_emb, lang_emb
28 |
--------------------------------------------------------------------------------
/hulc/models/auxiliary_loss_networks/state_decoder.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | import torch
4 | import torch.nn as nn
5 |
6 |
7 | class StateDecoder(nn.Module):
8 | def __init__(self, visual_features: int, n_state_obs: int):
9 | super().__init__()
10 | self.mlp = nn.Sequential(
11 | nn.Linear(in_features=visual_features, out_features=40),
12 | nn.ReLU(),
13 | nn.Linear(in_features=40, out_features=40),
14 | nn.ReLU(),
15 | nn.Linear(in_features=40, out_features=n_state_obs),
16 | )
17 |
18 | def forward(self, x: torch.Tensor) -> torch.Tensor:
19 | x = self.mlp(x)
20 | return x
21 |
--------------------------------------------------------------------------------
/hulc/models/decoders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/decoders/__init__.py
--------------------------------------------------------------------------------
/hulc/models/decoders/action_decoder.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Tuple
2 |
3 | import torch
4 | import torch.nn as nn
5 |
6 |
7 | class ActionDecoder(nn.Module):
8 | def act(
9 | self,
10 | latent_plan: torch.Tensor,
11 | perceptual_emb: torch.Tensor,
12 | latent_goal: torch.Tensor,
13 | robot_obs: Optional[torch.Tensor] = None,
14 | ) -> torch.Tensor:
15 | raise NotImplementedError
16 |
17 | def loss(
18 | self,
19 | latent_plan: torch.Tensor,
20 | perceptual_emb: torch.Tensor,
21 | latent_goal: torch.Tensor,
22 | actions: torch.Tensor,
23 | robot_obs: Optional[torch.Tensor] = None,
24 | ) -> torch.Tensor:
25 | raise NotImplementedError
26 |
27 | def loss_and_act(
28 | self,
29 | latent_plan: torch.Tensor,
30 | perceptual_emb: torch.Tensor,
31 | latent_goal: torch.Tensor,
32 | actions: torch.Tensor,
33 | robot_obs: Optional[torch.Tensor] = None,
34 | ) -> Tuple[torch.Tensor, torch.Tensor]:
35 | raise NotImplementedError
36 |
37 | def _sample(self, *args, **kwargs):
38 | raise NotImplementedError
39 |
40 | def forward(
41 | self, latent_plan: torch.Tensor, perceptual_emb: torch.Tensor, latent_goal: torch.Tensor
42 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
43 | raise NotImplementedError
44 |
45 | def clear_hidden_state(self) -> None:
46 | pass
47 |
--------------------------------------------------------------------------------
/hulc/models/decoders/deterministic_decoder.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import Optional, Tuple
3 |
4 | import torch
5 | import torch.nn as nn
6 |
7 | from hulc.models.decoders.action_decoder import ActionDecoder
8 | from hulc.models.decoders.utils.gripper_control import tcp_to_world_frame, world_to_tcp_frame
9 | from hulc.models.decoders.utils.rnn import gru_decoder, lstm_decoder, mlp_decoder, rnn_decoder # needed for eval(rnn_model)
10 | logger = logging.getLogger(__name__)
11 |
12 |
13 | class DeterministicDecoder(ActionDecoder):
14 | def __init__(
15 | self,
16 | perceptual_features: int,
17 | latent_goal_features: int,
18 | plan_features: int,
19 | hidden_size: int,
20 | out_features: int,
21 | policy_rnn_dropout_p: float,
22 | criterion: str,
23 | num_layers: int,
24 | rnn_model: str,
25 | perceptual_emb_slice: tuple,
26 | gripper_control: bool,
27 | ):
28 | super(DeterministicDecoder, self).__init__()
29 | self.plan_features = plan_features
30 | self.gripper_control = gripper_control
31 | self.out_features = out_features
32 | in_features = (perceptual_emb_slice[1] - perceptual_emb_slice[0]) + latent_goal_features + plan_features
33 | self.rnn = eval(rnn_model)
34 | self.rnn = self.rnn(in_features, hidden_size, num_layers, policy_rnn_dropout_p)
35 | self.actions = nn.Sequential(nn.Linear(hidden_size, out_features), nn.Tanh())
36 | self.criterion = getattr(nn, criterion)()
37 | self.perceptual_emb_slice = perceptual_emb_slice
38 | self.hidden_state = None
39 |
40 | def clear_hidden_state(self) -> None:
41 | self.hidden_state = None
42 |
43 | def forward( # type: ignore
44 | self,
45 | latent_plan: torch.Tensor,
46 | perceptual_emb: torch.Tensor,
47 | latent_goal: torch.Tensor,
48 | h_0: Optional[torch.Tensor] = None,
49 | ) -> Tuple[torch.Tensor, torch.Tensor]:
50 | perceptual_emb = perceptual_emb[..., slice(*self.perceptual_emb_slice)]
51 | batch_size, seq_len = perceptual_emb.shape[0], perceptual_emb.shape[1]
52 | latent_plan = latent_plan.unsqueeze(1).expand(-1, seq_len, -1) if latent_plan.nelement() > 0 else latent_plan
53 | latent_goal = latent_goal.unsqueeze(1).expand(-1, seq_len, -1)
54 | x = torch.cat([latent_plan, perceptual_emb, latent_goal], dim=-1) # b, s, (plan + visuo-proprio + goal)
55 | if not isinstance(self.rnn, nn.Sequential) and isinstance(self.rnn, nn.RNNBase):
56 | x, h_n = self.rnn(x, h_0)
57 | else:
58 | x = self.rnn(x)
59 | h_n = None
60 | actions = self.actions(x)
61 | return actions, h_n
62 |
63 | def loss_and_act(
64 | self,
65 | latent_plan: torch.Tensor,
66 | perceptual_emb: torch.Tensor,
67 | latent_goal: torch.Tensor,
68 | actions: torch.Tensor,
69 | robot_obs: Optional[torch.Tensor] = None,
70 | ) -> Tuple[torch.Tensor, torch.Tensor]:
71 | pred_actions, _ = self(latent_plan, perceptual_emb, latent_goal)
72 | # loss
73 | if self.gripper_control:
74 | actions_tcp = world_to_tcp_frame(actions, robot_obs)
75 | loss = self.criterion(pred_actions, actions_tcp)
76 | pred_actions_world = tcp_to_world_frame(pred_actions, robot_obs)
77 | return loss, pred_actions_world
78 | else:
79 | loss = self.criterion(pred_actions, actions)
80 | return loss, pred_actions
81 |
82 | def loss(
83 | self,
84 | latent_plan: torch.Tensor,
85 | perceptual_emb: torch.Tensor,
86 | latent_goal: torch.Tensor,
87 | actions: torch.Tensor,
88 | robot_obs: Optional[torch.Tensor] = None,
89 | ) -> torch.Tensor:
90 | pred_actions, _ = self(latent_plan, perceptual_emb, latent_goal)
91 | if self.gripper_control:
92 | actions_tcp = world_to_tcp_frame(actions, robot_obs)
93 | return self.criterion(pred_actions, actions_tcp)
94 | return self.criterion(pred_actions, actions)
95 |
96 | def act(
97 | self,
98 | latent_plan: torch.Tensor,
99 | perceptual_emb: torch.Tensor,
100 | latent_goal: torch.Tensor,
101 | robot_obs: Optional[torch.Tensor] = None,
102 | ) -> torch.Tensor:
103 | pred_actions, self.hidden_state = self(latent_plan, perceptual_emb, latent_goal, self.hidden_state)
104 | if self.gripper_control:
105 | pred_actions_world = tcp_to_world_frame(pred_actions, robot_obs)
106 | return pred_actions_world
107 | else:
108 | return pred_actions
109 |
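Because `rnn_model` is resolved with `eval`, the config string must name a callable that is importable in this module, i.e. one of the factories in `hulc/models/decoders/utils/rnn.py`. A rough instantiation sketch with made-up dimensions (the real values come from the Hydra configs):

import torch

from hulc.models.decoders.deterministic_decoder import DeterministicDecoder

decoder = DeterministicDecoder(
    perceptual_features=128,      # not used directly; in_features is derived from perceptual_emb_slice
    latent_goal_features=32,
    plan_features=32,
    hidden_size=256,
    out_features=7,               # 3 pos + 3 orn + 1 gripper
    policy_rnn_dropout_p=0.0,
    criterion="MSELoss",          # resolved via getattr(nn, ...)
    num_layers=2,
    rnn_model="lstm_decoder",     # eval'd to the factory from utils/rnn.py
    perceptual_emb_slice=(0, 128),
    gripper_control=False,
)
latent_plan = torch.randn(4, 32)
perceptual_emb = torch.randn(4, 16, 128)   # (batch, seq, features)
latent_goal = torch.randn(4, 32)
actions, h_n = decoder(latent_plan, perceptual_emb, latent_goal)   # actions: (4, 16, 7)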
--------------------------------------------------------------------------------
/hulc/models/decoders/logistic_decoder_rnn.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from pathlib import Path
3 | from typing import List, Optional, Tuple, Union
4 |
5 | import numpy as np
6 | from omegaconf import ListConfig, OmegaConf
7 | import torch
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 |
11 | import hulc
12 | from hulc.models.decoders.action_decoder import ActionDecoder
13 | from hulc.models.decoders.utils.gripper_control import tcp_to_world_frame, world_to_tcp_frame
14 | from hulc.models.decoders.utils.rnn import gru_decoder, lstm_decoder, mlp_decoder, rnn_decoder # needed for the eval(rnn_model) call in __init__
15 |
16 | logger = logging.getLogger(__name__)
17 |
18 |
19 | def log_sum_exp(x):
20 | """numerically stable log_sum_exp implementation that prevents overflow"""
21 | axis = len(x.size()) - 1
22 | m, _ = torch.max(x, dim=axis)
23 | m2, _ = torch.max(x, dim=axis, keepdim=True)
24 | return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis))
25 |
26 |
27 | class LogisticDecoderRNN(ActionDecoder):
28 | def __init__(
29 | self,
30 | perceptual_features: int,
31 | latent_goal_features: int,
32 | plan_features: int,
33 | n_mixtures: int,
34 | hidden_size: int,
35 | out_features: int,
36 | log_scale_min: float,
37 | act_max_bound: Union[List[float], ListConfig],
38 | act_min_bound: Union[List[float], ListConfig],
39 | dataset_dir: str,
40 | load_action_bounds: bool,
41 | num_classes: int,
42 | gripper_alpha: float,
43 | policy_rnn_dropout_p: float,
44 | num_layers: int,
45 | rnn_model: str,
46 | gripper_control: bool,
47 | discrete_gripper: bool,
48 | perceptual_emb_slice: Optional[tuple] = None,
49 | ):
50 | super(LogisticDecoderRNN, self).__init__()
51 | self.n_dist = n_mixtures
52 | self.gripper_control = gripper_control
53 | self.discrete_gripper = discrete_gripper
54 | self.log_scale_min = log_scale_min
55 | self.num_classes = num_classes
56 | self.plan_features = plan_features
57 | if perceptual_emb_slice is not None:
58 | in_features = (perceptual_emb_slice[1] - perceptual_emb_slice[0]) + latent_goal_features + plan_features
59 | else:
60 | in_features = perceptual_features + latent_goal_features + plan_features
61 | self.out_features = out_features - 1 if discrete_gripper else out_features # for discrete gripper act
62 | self.gripper_alpha = gripper_alpha
63 | self.rnn = eval(rnn_model)
64 | self.rnn = self.rnn(in_features, hidden_size, num_layers, policy_rnn_dropout_p)
65 | self.mean_fc = nn.Linear(hidden_size, self.out_features * self.n_dist)
66 | self.log_scale_fc = nn.Linear(hidden_size, self.out_features * self.n_dist)
67 | self.prob_fc = nn.Linear(hidden_size, self.out_features * self.n_dist)
68 | self.register_buffer("one_hot_embedding_eye", torch.eye(self.n_dist))
69 | self.register_buffer("ones", torch.ones(1, 1, self.n_dist))
70 | self._setup_action_bounds(dataset_dir, act_max_bound, act_min_bound, load_action_bounds)
71 | # hack for mypy
72 | self.one_hot_embedding_eye: torch.Tensor = self.one_hot_embedding_eye
73 | self.action_max_bound: torch.Tensor = self.action_max_bound
74 | self.action_min_bound: torch.Tensor = self.action_min_bound
75 | if self.discrete_gripper:
76 | self.gripper_bounds: torch.Tensor = self.gripper_bounds
77 | self.gripper_fc = nn.Linear(hidden_size, 2)
78 | self.criterion = nn.CrossEntropyLoss()
79 | self.perceptual_emb_slice = perceptual_emb_slice
80 | self.hidden_state = None
81 |
82 | def clear_hidden_state(self) -> None:
83 | self.hidden_state = None
84 |
85 | def loss_and_act( # type: ignore
86 | self,
87 | latent_plan: torch.Tensor,
88 | perceptual_emb: torch.Tensor,
89 | latent_goal: torch.Tensor,
90 | actions: torch.Tensor,
91 | robot_obs: torch.Tensor,
92 | ) -> Tuple[torch.Tensor, torch.Tensor]:
93 | logit_probs, log_scales, means, gripper_act, _ = self(latent_plan, perceptual_emb, latent_goal)
94 | pred_actions = self._sample(logit_probs, log_scales, means, gripper_act)
95 | if self.gripper_control:
96 | actions_tcp = world_to_tcp_frame(actions, robot_obs)
97 | loss = self._loss(logit_probs, log_scales, means, gripper_act, actions_tcp)
98 | pred_actions_world = tcp_to_world_frame(pred_actions, robot_obs)
99 | return loss, pred_actions_world
100 | else:
101 | loss = self._loss(logit_probs, log_scales, means, gripper_act, actions)
102 | return loss, pred_actions
103 |
104 | def act( # type: ignore
105 | self,
106 | latent_plan: torch.Tensor,
107 | perceptual_emb: torch.Tensor,
108 | latent_goal: torch.Tensor,
109 | robot_obs: torch.Tensor,
110 | ) -> torch.Tensor:
111 | logit_probs, log_scales, means, gripper_act, self.hidden_state = self(
112 | latent_plan, perceptual_emb, latent_goal, self.hidden_state
113 | )
114 | pred_actions = self._sample(logit_probs, log_scales, means, gripper_act)
115 | if self.gripper_control:
116 | pred_actions_world = tcp_to_world_frame(pred_actions, robot_obs)
117 | return pred_actions_world
118 | else:
119 | return pred_actions
120 |
121 | def loss( # type: ignore
122 | self,
123 | latent_plan: torch.Tensor,
124 | perceptual_emb: torch.Tensor,
125 | latent_goal: torch.Tensor,
126 | actions: torch.Tensor,
127 | robot_obs: torch.Tensor,
128 | ) -> torch.Tensor: # type: ignore
129 | logit_probs, log_scales, means, gripper_act, _ = self(latent_plan, perceptual_emb, latent_goal)
130 | if self.gripper_control:
131 | actions_tcp = world_to_tcp_frame(actions, robot_obs)
132 | return self._loss(logit_probs, log_scales, means, gripper_act, actions_tcp)
133 | else:
134 | return self._loss(logit_probs, log_scales, means, gripper_act, actions)
135 |
136 | def _loss(
137 | self,
138 | logit_probs: torch.Tensor,
139 | log_scales: torch.Tensor,
140 | means: torch.Tensor,
141 | gripper_act: torch.Tensor,
142 | actions: torch.Tensor,
143 | ) -> torch.Tensor:
144 | if self.discrete_gripper:
145 | logistics_loss = self._logistic_loss(logit_probs, log_scales, means, actions[:, :, :-1])
146 | gripper_gt = actions[:, :, -1].clone()
147 | # @fixme: hack because discrete actions are now -1 and 1, but we need 0, 1 for crossentropy loss
148 | m = gripper_gt == -1
149 | gripper_gt[m] = 0
150 | gripper_act_loss = self.criterion(gripper_act.view(-1, 2), gripper_gt.view(-1).long())
151 | total_loss = logistics_loss + self.gripper_alpha * gripper_act_loss
152 | return total_loss
153 | else:
154 | logistics_loss = self._logistic_loss(logit_probs, log_scales, means, actions)
155 | return logistics_loss
156 |
157 | def _setup_action_bounds(self, dataset_dir, act_max_bound, act_min_bound, load_action_bounds):
158 | if load_action_bounds:
159 | try:
160 | statistics_path = Path(hulc.__file__).parent / dataset_dir / "training/statistics.yaml"
161 | statistics = OmegaConf.load(statistics_path)
162 | act_max_bound = statistics.act_max_bound
163 | act_min_bound = statistics.act_min_bound
164 | logger.info(f"Loaded action bounds from {statistics_path}")
165 | except FileNotFoundError:
166 | logger.info(
167 | f"Could not load statistics.yaml in {statistics_path}, taking action bounds defined in hydra conf"
168 | )
169 | if self.discrete_gripper:
170 | self.register_buffer("gripper_bounds", torch.Tensor([act_min_bound[-1], act_max_bound[-1]]))
171 | act_max_bound = act_max_bound[:-1] # for discrete grasp
172 | act_min_bound = act_min_bound[:-1]
173 | action_max_bound = torch.Tensor(act_max_bound).float()
174 | action_min_bound = torch.Tensor(act_min_bound).float()
175 | assert action_max_bound.shape[0] == self.out_features
176 | assert action_min_bound.shape[0] == self.out_features
177 | action_max_bound = action_max_bound.unsqueeze(0).unsqueeze(0) # [1, 1, action_space]
178 | action_min_bound = action_min_bound.unsqueeze(0).unsqueeze(0) # [1, 1, action_space]
179 | action_max_bound = action_max_bound.unsqueeze(-1) * self.ones # broadcast to [1, 1, action_space, N_DIST]
180 | action_min_bound = action_min_bound.unsqueeze(-1) * self.ones # broadcast to [1, 1, action_space, N_DIST]
181 | self.register_buffer("action_max_bound", action_max_bound)
182 | self.register_buffer("action_min_bound", action_min_bound)
183 |
184 | def _logistic_loss(
185 | self,
186 | logit_probs: torch.Tensor,
187 | log_scales: torch.Tensor,
188 | means: torch.Tensor,
189 | actions: torch.Tensor,
190 | ) -> torch.Tensor:
191 | # Appropriate scale
192 | log_scales = torch.clamp(log_scales, min=self.log_scale_min)
193 | # Broadcast actions (B, A, N_DIST)
194 | actions = actions.unsqueeze(-1) * self.ones
195 | # Approximation of CDF derivative (PDF)
196 | centered_actions = actions - means
197 | inv_stdv = torch.exp(-log_scales)
198 | assert torch.is_tensor(self.action_max_bound)
199 | assert torch.is_tensor(self.action_min_bound)
200 | act_range = (self.action_max_bound - self.action_min_bound) / 2.0
201 | plus_in = inv_stdv * (centered_actions + act_range / (self.num_classes - 1))
202 | cdf_plus = torch.sigmoid(plus_in)
203 | min_in = inv_stdv * (centered_actions - act_range / (self.num_classes - 1))
204 | cdf_min = torch.sigmoid(min_in)
205 |
206 | # Corner Cases
207 | log_cdf_plus = plus_in - F.softplus(plus_in) # log probability at the lower edge (minimum action bound)
208 | log_one_minus_cdf_min = -F.softplus(min_in) # log probability at the upper edge (maximum action bound)
209 | # Log probability in the center of the bin
210 | mid_in = inv_stdv * centered_actions
211 | log_pdf_mid = mid_in - log_scales - 2.0 * F.softplus(mid_in)
212 | # Probability for all other cases
213 | cdf_delta = cdf_plus - cdf_min
214 |
215 | # Log probability
216 | log_probs = torch.where(
217 | actions < self.action_min_bound + 1e-3,
218 | log_cdf_plus,
219 | torch.where(
220 | actions > self.action_max_bound - 1e-3,
221 | log_one_minus_cdf_min,
222 | torch.where(
223 | cdf_delta > 1e-5,
224 | torch.log(torch.clamp(cdf_delta, min=1e-12)),
225 | log_pdf_mid - np.log((self.num_classes - 1) / 2),
226 | ),
227 | ),
228 | )
229 | log_probs = log_probs + F.log_softmax(logit_probs, dim=-1)
230 | loss = -torch.sum(log_sum_exp(log_probs), dim=-1).mean()
231 | return loss
232 |
233 | # Sampling from logistic distribution
234 | def _sample( # type: ignore
235 | self, logit_probs: torch.Tensor, log_scales: torch.Tensor, means: torch.Tensor, gripper_act: torch.Tensor
236 | ) -> torch.Tensor: # type: ignore
237 | # Selecting Logistic distribution (Gumbel Sample)
238 | r1, r2 = 1e-5, 1.0 - 1e-5
239 | temp = (r1 - r2) * torch.rand(means.shape, device=means.device) + r2
240 | temp = logit_probs - torch.log(-torch.log(temp))
241 | argmax = torch.argmax(temp, -1)
242 | # TODO: find out why mypy complains about type
243 | dist = self.one_hot_embedding_eye[argmax]
244 |
245 | # Select scales and means
246 | log_scales = (dist * log_scales).sum(dim=-1)
247 | means = (dist * means).sum(dim=-1)
248 |
249 | # Inversion sampling for logistic mixture sampling
250 | scales = torch.exp(log_scales) # Make positive
251 | u = (r1 - r2) * torch.rand(means.shape, device=means.device) + r2
252 | actions = means + scales * (torch.log(u) - torch.log(1.0 - u))
253 | if self.discrete_gripper:
254 | gripper_cmd = self.gripper_bounds[gripper_act.argmax(dim=-1)]
255 | full_action = torch.cat([actions, gripper_cmd.unsqueeze(-1)], 2)
256 | return full_action
257 | else:
258 | return actions
259 |
260 | def forward( # type: ignore
261 | self,
262 | latent_plan: torch.Tensor,
263 | perceptual_emb: torch.Tensor,
264 | latent_goal: torch.Tensor,
265 | h_0: Optional[torch.Tensor] = None,
266 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
267 | if self.perceptual_emb_slice is not None:
268 | perceptual_emb = perceptual_emb[..., slice(*self.perceptual_emb_slice)]
269 | batch_size, seq_len = perceptual_emb.shape[0], perceptual_emb.shape[1]
270 | latent_plan = latent_plan.unsqueeze(1).expand(-1, seq_len, -1)
271 | latent_goal = latent_goal.unsqueeze(1).expand(-1, seq_len, -1)
272 | x = torch.cat([latent_plan, perceptual_emb, latent_goal], dim=-1) # b, s, (plan + visuo-proprio + goal)
273 | if not isinstance(self.rnn, nn.Sequential) and isinstance(self.rnn, nn.RNNBase):
274 | x, h_n = self.rnn(x, h_0)
275 | else:
276 | x = self.rnn(x)
277 | h_n = None
278 | probs = self.prob_fc(x)
279 | means = self.mean_fc(x)
280 | log_scales = self.log_scale_fc(x)
281 | log_scales = torch.clamp(log_scales, min=self.log_scale_min)
282 | gripper_act = self.gripper_fc(x) if self.discrete_gripper else None
283 | # Appropriate dimensions
284 | logit_probs = probs.view(batch_size, seq_len, self.out_features, self.n_dist)
285 | means = means.view(batch_size, seq_len, self.out_features, self.n_dist)
286 | log_scales = log_scales.view(batch_size, seq_len, self.out_features, self.n_dist)
287 | return logit_probs, log_scales, means, gripper_act, h_n
288 |
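The helper `log_sum_exp` above is equivalent to `torch.logsumexp` over the last dimension; a quick sanity check (illustrative only):

import torch

from hulc.models.decoders.logistic_decoder_rnn import log_sum_exp

x = torch.randn(2, 16, 8)
assert torch.allclose(log_sum_exp(x), torch.logsumexp(x, dim=-1), atol=1e-6)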
--------------------------------------------------------------------------------
/hulc/models/decoders/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/decoders/utils/__init__.py
--------------------------------------------------------------------------------
/hulc/models/decoders/utils/gripper_control.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import numpy as np
4 | from hulc.models.decoders.utils.pytorch3d_transforms import (
5 | euler_angles_to_matrix,
6 | matrix_to_euler_angles,
7 | matrix_to_quaternion,
8 | quaternion_to_matrix,
9 | )
10 | import torch
11 | from torch.cuda.amp import autocast
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | def world_to_tcp_frame(action, robot_obs):
17 | with autocast(dtype=torch.float32):
18 | b, s, _ = action.shape
19 | world_T_tcp = euler_angles_to_matrix(robot_obs[..., 3:6], convention="XYZ").float().view(-1, 3, 3)
20 | tcp_T_world = torch.inverse(world_T_tcp)
21 | pos_w_rel = action[..., :3].view(-1, 3, 1)
22 | pos_tcp_rel = tcp_T_world @ pos_w_rel
23 | # downscaling is necessary here to get pseudo infinitesimal rotation
24 | orn_w_rel = action[..., 3:6] * 0.01
25 | world_T_tcp_new = (
26 | euler_angles_to_matrix(robot_obs[..., 3:6] + orn_w_rel, convention="XYZ").float().view(-1, 3, 3)
27 | )
28 | tcp_new_T_tcp_old = torch.inverse(world_T_tcp_new) @ world_T_tcp
29 | orn_tcp_rel = matrix_to_euler_angles(tcp_new_T_tcp_old, convention="XYZ").float()
30 | orn_tcp_rel = torch.where(orn_tcp_rel < -np.pi, orn_tcp_rel + 2 * np.pi, orn_tcp_rel)
31 | orn_tcp_rel = torch.where(orn_tcp_rel > np.pi, orn_tcp_rel - 2 * np.pi, orn_tcp_rel)
32 | # upscaling again
33 | orn_tcp_rel *= 100
34 | action_tcp = torch.cat([pos_tcp_rel.view(b, s, -1), orn_tcp_rel.view(b, s, -1), action[..., -1:]], dim=-1)
35 | assert not torch.any(action_tcp.isnan())
36 | return action_tcp
37 |
38 |
39 | def tcp_to_world_frame(action, robot_obs):
40 | with autocast(dtype=torch.float32):
41 | b, s, _ = action.shape
42 | world_T_tcp = euler_angles_to_matrix(robot_obs[..., 3:6], convention="XYZ").float().view(-1, 3, 3)
43 | pos_tcp_rel = action[..., :3].view(-1, 3, 1)
44 | pos_w_rel = world_T_tcp @ pos_tcp_rel
45 | # downscaling is necessary here to get pseudo infinitesimal rotation
46 | orn_tcp_rel = action[..., 3:6] * 0.01
47 | tcp_new_T_tcp_old = euler_angles_to_matrix(orn_tcp_rel, convention="XYZ").float().view(-1, 3, 3)
48 | world_T_tcp_new = world_T_tcp @ torch.inverse(tcp_new_T_tcp_old)
49 |
50 | orn_w_new = matrix_to_euler_angles(world_T_tcp_new, convention="XYZ").float()
51 | if torch.any(orn_w_new.isnan()):
52 | logger.warning("NaN value in euler angles.")
53 | orn_w_new = matrix_to_euler_angles(
54 | quaternion_to_matrix(matrix_to_quaternion(world_T_tcp_new)), convention="XYZ"
55 | ).float()
56 | orn_w_rel = orn_w_new - robot_obs[..., 3:6].view(-1, 3)
57 | orn_w_rel = torch.where(orn_w_rel < -np.pi, orn_w_rel + 2 * np.pi, orn_w_rel)
58 | orn_w_rel = torch.where(orn_w_rel > np.pi, orn_w_rel - 2 * np.pi, orn_w_rel)
59 | # upscaling again
60 | orn_w_rel *= 100
61 | action_w = torch.cat([pos_w_rel.view(b, s, -1), orn_w_rel.view(b, s, -1), action[..., -1:]], dim=-1)
62 | assert not torch.any(action_w.isnan())
63 | return action_w
64 |
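`world_to_tcp_frame` and `tcp_to_world_frame` are (approximate) inverses of each other for small relative motions; a rough round-trip sanity check, assuming the repo's pytorch3d_transforms helpers are available:

import torch

from hulc.models.decoders.utils.gripper_control import tcp_to_world_frame, world_to_tcp_frame

action = torch.cat([torch.randn(2, 8, 6) * 0.1, torch.ones(2, 8, 1)], dim=-1)  # small relative action + gripper
robot_obs = torch.randn(2, 8, 15) * 0.1                                        # euler angles are read from [..., 3:6]
action_tcp = world_to_tcp_frame(action, robot_obs)
action_back = tcp_to_world_frame(action_tcp, robot_obs)
print((action - action_back).abs().max())   # should be close to zero for small rotations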
--------------------------------------------------------------------------------
/hulc/models/decoders/utils/rnn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 |
5 | def rnn_decoder(in_features: int, hidden_size: int, num_layers: int, policy_rnn_dropout_p: float) -> torch.nn.Module:
6 | return nn.RNN(
7 | input_size=in_features,
8 | hidden_size=hidden_size,
9 | num_layers=num_layers,
10 | nonlinearity="relu",
11 | bidirectional=False,
12 | batch_first=True,
13 | dropout=policy_rnn_dropout_p,
14 | )
15 |
16 |
17 | def lstm_decoder(in_features: int, hidden_size: int, num_layers: int, policy_rnn_dropout_p: float) -> torch.nn.Module:
18 | return nn.LSTM(
19 | input_size=in_features,
20 | hidden_size=hidden_size,
21 | num_layers=num_layers,
22 | bidirectional=False,
23 | batch_first=True,
24 | dropout=policy_rnn_dropout_p,
25 | )
26 |
27 |
28 | def gru_decoder(in_features: int, hidden_size: int, num_layers: int, policy_rnn_dropout_p: float) -> torch.nn.Module:
29 | return nn.GRU(
30 | input_size=in_features,
31 | hidden_size=hidden_size,
32 | num_layers=num_layers,
33 | bidirectional=False,
34 | batch_first=True,
35 | dropout=policy_rnn_dropout_p,
36 | )
37 |
38 |
39 | def mlp_decoder(in_features: int, hidden_size: int, num_layers: int, policy_rnn_dropout_p: float) -> torch.nn.Module:
40 | return nn.Sequential(
41 | nn.Linear(in_features=in_features, out_features=hidden_size),
42 | nn.ReLU(),
43 | nn.Linear(in_features=hidden_size, out_features=hidden_size),
44 | nn.ReLU(),
45 | nn.Linear(in_features=hidden_size, out_features=hidden_size),
46 | )
47 |
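All four factories share the same signature so the action decoders can select one by name from the config; note that `mlp_decoder` simply ignores `num_layers` and the dropout argument. A short sketch:

import torch

from hulc.models.decoders.utils.rnn import lstm_decoder, mlp_decoder

rnn = lstm_decoder(in_features=160, hidden_size=256, num_layers=2, policy_rnn_dropout_p=0.0)
x = torch.randn(4, 16, 160)        # (batch, seq, features); all RNN factories use batch_first=True
out, (h_n, c_n) = rnn(x)           # out: (4, 16, 256)

mlp = mlp_decoder(in_features=160, hidden_size=256, num_layers=2, policy_rnn_dropout_p=0.0)
out_mlp = mlp(x)                   # (4, 16, 256)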
--------------------------------------------------------------------------------
/hulc/models/encoders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/encoders/__init__.py
--------------------------------------------------------------------------------
/hulc/models/encoders/clip_lang_encoder.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import torch
4 | import torch.nn as nn
5 |
6 | from hulc.models.perceptual_encoders.clip import build_model, load_clip, tokenize
7 |
8 |
9 | class LangClip(nn.Module):
10 | def __init__(self, freeze_backbone: bool = True, model_name: str = "RN50"):
11 | super(LangClip, self).__init__()
12 | self.device = "cuda" if torch.cuda.is_available() else "cpu"
13 | # Load CLIP model
14 | print(f"loading language CLIP model with backbone: {model_name}")
15 | self._load_clip(model_name)
16 | if freeze_backbone:
17 | for param in self.clip_rn50.parameters():
18 | param.requires_grad = False
19 |
20 | def _load_clip(self, model_name: str) -> None:
21 | model, _ = load_clip(model_name, device=self.device)
22 | self.clip_rn50 = build_model(model.state_dict()).to(self.device)
23 |
24 | def forward(self, x: List) -> torch.Tensor:
25 | with torch.no_grad():
26 | tokens = tokenize(x).to(self.device)
27 | emb = self.clip_rn50.encode_text(tokens)
28 | return torch.unsqueeze(emb, 1)
29 |
--------------------------------------------------------------------------------
/hulc/models/encoders/goal_encoders.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 |
8 | class VisualGoalEncoder(nn.Module):
9 | def __init__(
10 | self,
11 | hidden_size: int,
12 | latent_goal_features: int,
13 | in_features: int,
14 | l2_normalize_goal_embeddings: bool,
15 | activation_function: str,
16 | ):
17 | super().__init__()
18 | self.l2_normalize_output = l2_normalize_goal_embeddings
19 | self.act_fn = getattr(nn, activation_function)()
20 | self.mlp = nn.Sequential(
21 | nn.Linear(in_features=in_features, out_features=hidden_size),
22 | # nn.BatchNorm1d(hidden_size),
23 | self.act_fn,
24 | nn.Linear(in_features=hidden_size, out_features=hidden_size),
25 | # nn.BatchNorm1d(hidden_size),
26 | self.act_fn,
27 | nn.Linear(in_features=hidden_size, out_features=latent_goal_features),
28 | )
29 | self.ln = nn.LayerNorm(latent_goal_features)
30 |
31 | def forward(self, x: torch.Tensor) -> torch.Tensor:
32 | x = self.mlp(x)
33 | if self.l2_normalize_output:
34 | x = F.normalize(x, p=2, dim=1)
35 | x = self.ln(x)
36 | return x
37 |
38 |
39 | class LanguageGoalEncoder(nn.Module):
40 | def __init__(
41 | self,
42 | in_features: int,
43 | hidden_size: int,
44 | latent_goal_features: int,
45 | l2_normalize_goal_embeddings: bool,
46 | word_dropout_p: float,
47 | activation_function: str,
48 | ):
49 | super().__init__()
50 | self.l2_normalize_output = l2_normalize_goal_embeddings
51 | self.act_fn = getattr(nn, activation_function)()
52 | self.mlp = nn.Sequential(
53 | nn.Dropout(word_dropout_p),
54 | nn.Linear(in_features=in_features, out_features=hidden_size),
55 | # nn.BatchNorm1d(hidden_size),
56 | self.act_fn,
57 | nn.Linear(in_features=hidden_size, out_features=hidden_size),
58 | # nn.BatchNorm1d(hidden_size),
59 | self.act_fn,
60 | nn.Linear(in_features=hidden_size, out_features=latent_goal_features),
61 | )
62 | self.ln = nn.LayerNorm(latent_goal_features)
63 |
64 | def forward(self, x: torch.Tensor) -> torch.Tensor:
65 | x = self.mlp(x)
66 | if self.l2_normalize_output:
67 | x = F.normalize(x, p=2, dim=1)
68 | x = self.ln(x)
69 | return x
70 |
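Both goal encoders map an input embedding into the shared latent goal space with the same MLP + LayerNorm recipe; the language variant only adds dropout on its input. A sketch with illustrative sizes (the real ones come from the Hydra configs):

import torch

from hulc.models.encoders.goal_encoders import LanguageGoalEncoder, VisualGoalEncoder

visual_goal = VisualGoalEncoder(
    hidden_size=256,
    latent_goal_features=32,
    in_features=128,
    l2_normalize_goal_embeddings=False,
    activation_function="ReLU",
)
language_goal = LanguageGoalEncoder(
    in_features=384,
    hidden_size=256,
    latent_goal_features=32,
    l2_normalize_goal_embeddings=False,
    word_dropout_p=0.0,
    activation_function="ReLU",
)
z_vis = visual_goal(torch.randn(4, 128))     # (4, 32)
z_lang = language_goal(torch.randn(4, 384))  # (4, 32)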
--------------------------------------------------------------------------------
/hulc/models/encoders/lang_encoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 |
5 | class LanguageEncoder(nn.Module):
6 | def __init__(
7 | self,
8 | language_features: int,
9 | hidden_size: int,
10 | out_features: int,
11 | word_dropout_p: float,
12 | activation_function: str,
13 | ):
14 | super().__init__()
15 | self.act_fn = getattr(nn, activation_function)()
16 | self.mlp = nn.Sequential(
17 | nn.Dropout(word_dropout_p),
18 | nn.Linear(in_features=language_features, out_features=hidden_size),
19 | # nn.BatchNorm1d(hidden_size),
20 | self.act_fn,
21 | nn.Linear(in_features=hidden_size, out_features=hidden_size),
22 | # nn.BatchNorm1d(hidden_size),
23 | self.act_fn,
24 | nn.Linear(in_features=hidden_size, out_features=out_features),
25 | )
26 |
27 | def forward(self, x: torch.Tensor) -> torch.Tensor:
28 | x = self.mlp(x)
29 | return x
30 |
--------------------------------------------------------------------------------
/hulc/models/encoders/language_network.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from sentence_transformers import SentenceTransformer
4 | import torch
5 | import torch.nn as nn
6 |
7 |
8 | class SBert(nn.Module):
9 | def __init__(self, nlp_model: str):
10 | # choose model from https://www.sbert.net/docs/pretrained_models.html
11 | super().__init__()
12 | assert isinstance(nlp_model, str)
13 | self.model = SentenceTransformer(nlp_model)
14 |
15 | def forward(self, x: List) -> torch.Tensor:
16 | emb = self.model.encode(x, convert_to_tensor=True)
17 | return torch.unsqueeze(emb, 1)
18 |
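`SBert` just wraps a pretrained sentence-transformers model and returns embeddings with an extra sequence dimension. A minimal sketch (the model name is only an example from the SBERT model zoo and is downloaded on first use):

from hulc.models.encoders.language_network import SBert

sbert = SBert("paraphrase-MiniLM-L3-v2")
emb = sbert(["open the drawer", "turn on the led"])
print(emb.shape)   # (2, 1, embedding_dim)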
--------------------------------------------------------------------------------
/hulc/models/gcbc.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import Dict
3 |
4 | import torch
5 |
6 | from hulc.models.hulc import Hulc
7 |
8 | logger = logging.getLogger(__name__)
9 |
10 |
11 | class GCBC(Hulc):
12 | """
13 | Goal-conditioned behavior cloning.
14 | """
15 |
16 | @staticmethod
17 | def setup_input_sizes(
18 | perceptual_encoder,
19 | plan_proposal,
20 | plan_recognition,
21 | visual_goal,
22 | action_decoder,
23 | distribution,
24 | ):
25 | """
26 | Configure the input feature sizes of the respective parts of the network.
27 |
28 | Args:
29 | perceptual_encoder: DictConfig for perceptual encoder.
30 | plan_proposal: DictConfig for plan proposal network.
31 | plan_recognition: DictConfig for plan recognition network.
32 | visual_goal: DictConfig for visual goal encoder.
33 | action_decoder: DictConfig for action decoder network.
34 | distribution: DictConfig for plan distribution (continuous or discrete).
35 | """
36 | plan_proposal.perceptual_features = perceptual_encoder.latent_size
37 | plan_recognition.in_features = perceptual_encoder.latent_size
38 | visual_goal.in_features = perceptual_encoder.latent_size
39 | action_decoder.perceptual_features = perceptual_encoder.latent_size
40 |
41 | if distribution.dist == "discrete":
42 | plan_proposal.plan_features = distribution.class_size * distribution.category_size
43 | plan_recognition.plan_features = distribution.class_size * distribution.category_size
44 | action_decoder.plan_features = 0
45 | elif distribution.dist == "continuous":
46 | plan_proposal.plan_features = distribution.plan_features
47 | plan_recognition.plan_features = distribution.plan_features
48 | action_decoder.plan_features = 0
49 |
50 | def training_step(self, batch: Dict[str, Dict], batch_idx: int) -> torch.Tensor: # type: ignore
51 | """
52 | Compute and return the training loss.
53 |
54 | Args:
55 | batch (dict):
56 | - 'vis' (dict):
57 | - 'rgb_obs' (dict):
58 | - 'rgb_static' (Tensor): RGB camera image of static camera
59 | - ...
60 | - 'depth_obs' (dict):
61 | - 'depth_static' (Tensor): Depth camera image of depth camera
62 | - ...
63 | - 'robot_obs' (Tensor): Proprioceptive state observation.
64 | - 'actions' (Tensor): Ground truth actions.
65 | - 'state_info' (dict):
66 | - 'robot_obs' (Tensor): Unnormalized robot states.
67 | - 'scene_obs' (Tensor): Unnormalized scene states.
68 | - 'idx' (LongTensor): Episode indices.
69 | - 'lang' (dict):
70 | Like 'vis' but with additional keys:
71 | - 'language' (Tensor): Embedded Language labels.
72 | - 'use_for_aux_lang_loss' (BoolTensor): Mask of which sequences in the batch to consider for
73 | auxiliary loss.
74 | batch_idx (int): Index of this batch.
75 |
76 |
77 | Returns:
78 | loss tensor
79 | """
80 | action_loss, proprio_loss, lang_pred_loss, lang_contrastive_loss, lang_clip_loss, total_loss = (
81 | torch.tensor(0.0).to(self.device),
82 | torch.tensor(0.0).to(self.device),
83 | torch.tensor(0.0).to(self.device),
84 | torch.tensor(0.0).to(self.device),
85 | torch.tensor(0.0).to(self.device),
86 | torch.tensor(0.0).to(self.device),
87 | )
88 |
89 | batch_size: Dict[str, int] = {}
90 | total_bs = 0
91 | for self.modality_scope, dataset_batch in batch.items():
92 | perceptual_emb = self.perceptual_encoder(
93 | dataset_batch["rgb_obs"], dataset_batch["depth_obs"], dataset_batch["robot_obs"]
94 | )
95 | if self.state_recons:
96 | proprio_loss += self.perceptual_encoder.state_reconstruction_loss()
97 | if "lang" in self.modality_scope:
98 | latent_goal = self.language_goal(dataset_batch["lang"])
99 | else:
100 | latent_goal = self.visual_goal(perceptual_emb[:, -1])
101 |
102 | robot_obs = dataset_batch["state_info"]["robot_obs"]
103 | actions = dataset_batch["actions"]
104 | empty_plan = torch.empty((dataset_batch["actions"].shape[0]), 0).to(self.device)
105 | act_loss = self.action_decoder.loss(empty_plan, perceptual_emb, latent_goal, actions, robot_obs)
106 | _, seq_feat = self.plan_recognition(perceptual_emb)
107 |
108 | if "lang" in self.modality_scope:
109 | if not torch.any(dataset_batch["use_for_aux_lang_loss"]):
110 | batch_size["aux_lang"] = 1
111 | else:
112 | batch_size["aux_lang"] = torch.sum(dataset_batch["use_for_aux_lang_loss"]).detach() # type:ignore
113 | if self.use_bc_z_auxiliary_loss:
114 | lang_pred_loss += self.bc_z_auxiliary_loss(
115 | seq_feat, dataset_batch["lang"], dataset_batch["use_for_aux_lang_loss"]
116 | )
117 | if self.use_clip_auxiliary_loss:
118 | lang_clip_loss += self.clip_auxiliary_loss(
119 | seq_feat, latent_goal, dataset_batch["use_for_aux_lang_loss"]
120 | )
121 | if self.use_mia_auxiliary_loss:
122 | lang_contrastive_loss += self.mia_auxiliary_loss(
123 | seq_feat, latent_goal, dataset_batch["use_for_aux_lang_loss"]
124 | )
125 | action_loss += act_loss
126 | total_loss += act_loss
127 | batch_size[self.modality_scope] = dataset_batch["actions"].shape[0]
128 | total_bs += dataset_batch["actions"].shape[0]
129 |
130 | self.log(
131 | f"train/action_loss_{self.modality_scope}",
132 | act_loss,
133 | on_step=False,
134 | on_epoch=True,
135 | batch_size=batch_size[self.modality_scope],
136 | )
137 | total_loss = total_loss / len(batch) # divide accumulated gradients by number of datasets
138 | action_loss = action_loss / len(batch)
139 | if self.state_recons:
140 | proprio_loss = proprio_loss / len(batch)
141 | total_loss = total_loss + self.st_recon_beta * proprio_loss
142 | self.log(
143 | "train/pred_proprio",
144 | self.st_recon_beta * proprio_loss,
145 | on_step=False,
146 | on_epoch=True,
147 | batch_size=total_bs,
148 | )
149 | if self.use_bc_z_auxiliary_loss:
150 | total_loss = total_loss + self.bc_z_auxiliary_loss_beta * lang_pred_loss
151 | self.log(
152 | "train/pred_lang",
153 | self.bc_z_auxiliary_loss_beta * lang_pred_loss,
154 | on_step=False,
155 | on_epoch=True,
156 | batch_size=batch_size["aux_lang"],
157 | sync_dist=True,
158 | )
159 | if self.use_mia_auxiliary_loss:
160 | total_loss = total_loss + self.mia_auxiliary_loss_beta * lang_contrastive_loss
161 | self.log(
162 | "train/lang_contrastive",
163 | self.mia_auxiliary_loss_beta * lang_contrastive_loss,
164 | on_step=False,
165 | on_epoch=True,
166 | batch_size=batch_size["aux_lang"],
167 | sync_dist=True,
168 | )
169 | if self.use_clip_auxiliary_loss:
170 | total_loss = total_loss + self.clip_auxiliary_loss_beta * lang_clip_loss
171 | self.log(
172 | "train/lang_clip_loss",
173 | self.clip_auxiliary_loss_beta * lang_clip_loss,
174 | on_step=False,
175 | on_epoch=True,
176 | batch_size=batch_size["aux_lang"],
177 | sync_dist=True,
178 | )
179 | self.log("train/action_loss", action_loss, on_step=False, on_epoch=True, batch_size=total_bs)
180 | self.log("train/total_loss", total_loss, on_step=False, on_epoch=True, batch_size=total_bs)
181 | return total_loss
182 |
183 | def validation_step(self, batch: Dict[str, Dict], batch_idx: int) -> Dict[str, torch.Tensor]: # type: ignore
184 | """
185 | Compute and log the validation losses and additional metrics.
186 |
187 | Args:
188 | batch (dict):
189 | - 'vis' (dict):
190 | - 'rgb_obs' (dict):
191 | - 'rgb_static' (Tensor): RGB camera image of static camera
192 | - ...
193 | - 'depth_obs' (dict):
194 | - 'depth_static' (Tensor): Depth camera image of depth camera
195 | - ...
196 | - 'robot_obs' (Tensor): Proprioceptive state observation.
197 | - 'actions' (Tensor): Ground truth actions.
198 | - 'state_info' (dict):
199 | - 'robot_obs' (Tensor): Unnormalized robot states.
200 | - 'scene_obs' (Tensor): Unnormalized scene states.
201 | - 'idx' (LongTensor): Episode indices.
202 | - 'lang' (dict):
203 | Like 'vis' but with additional keys:
204 | - 'language' (Tensor): Embedded Language labels.
205 | - 'use_for_aux_lang_loss' (BoolTensor): Mask of which sequences in the batch to consider for
206 | auxiliary loss.
207 | batch_idx (int): Index of this batch.
208 |
209 | Returns:
210 | Dictionary containing the episode indices of each modality; unlike Hulc, GCBC does not sample
211 | latent plans here.
212 | """
213 | output = {}
214 | val_total_act_loss = torch.tensor(0.0).to(self.device)
215 | for self.modality_scope, dataset_batch in batch.items():
216 | perceptual_emb = self.perceptual_encoder(
217 | dataset_batch["rgb_obs"], dataset_batch["depth_obs"], dataset_batch["robot_obs"]
218 | )
219 | if self.state_recons:
220 | state_recon_loss = self.perceptual_encoder.state_reconstruction_loss()
221 | self.log(f"val/proprio_loss_{self.modality_scope}", state_recon_loss, sync_dist=True)
222 | if "lang" in self.modality_scope:
223 | latent_goal = self.language_goal(dataset_batch["lang"])
224 | else:
225 | latent_goal = self.visual_goal(perceptual_emb[:, -1])
226 |
227 | robot_obs = dataset_batch["state_info"]["robot_obs"]
228 | actions = dataset_batch["actions"]
229 | empty_plan = torch.empty((dataset_batch["actions"].shape[0]), 0).to(self.device)
230 | action_loss, sample_act = self.action_decoder.loss_and_act( # type: ignore
231 | empty_plan, perceptual_emb, latent_goal, actions, robot_obs
232 | )
233 | mae = torch.nn.functional.l1_loss(
234 | sample_act[..., :-1], actions[..., :-1], reduction="none"
235 | ) # (batch, seq, 6)
236 | mae = torch.mean(mae, 1) # (batch, 6)
237 | # gripper action
238 | gripper_discrete = sample_act[..., -1]
239 | gt_gripper_act = actions[..., -1]
240 | m = gripper_discrete > 0
241 | gripper_discrete[m] = 1
242 | gripper_discrete[~m] = -1
243 | gripper_sr = torch.mean((gt_gripper_act == gripper_discrete).float())
244 | _, seq_feat = self.plan_recognition(perceptual_emb)
245 |
246 | if "lang" in self.modality_scope:
247 | if self.use_bc_z_auxiliary_loss:
248 | val_pred_lang_loss = self.bc_z_auxiliary_loss(
249 | seq_feat, dataset_batch["lang"], dataset_batch["use_for_aux_lang_loss"]
250 | )
251 | self.log("val/lang_pred_loss", val_pred_lang_loss, sync_dist=True)
252 | if self.use_clip_auxiliary_loss:
253 | val_pred_clip_loss = self.clip_auxiliary_loss(
254 | seq_feat, latent_goal, dataset_batch["use_for_aux_lang_loss"]
255 | )
256 | self.log("val/val_pred_clip_loss", val_pred_clip_loss, sync_dist=True)
257 | self.clip_groundtruth(seq_feat, dataset_batch["idx"], dataset_batch["use_for_aux_lang_loss"])
258 | if self.use_mia_auxiliary_loss:
259 | val_pred_contrastive_loss = self.mia_auxiliary_loss(
260 | seq_feat, latent_goal, dataset_batch["use_for_aux_lang_loss"]
261 | )
262 | self.log("val/lang_contrastive_loss", val_pred_contrastive_loss, sync_dist=True)
263 | val_total_act_loss += action_loss
264 | mae_mean = mae.mean()
265 | pos_mae = mae[..., :3].mean()
266 | orn_mae = mae[..., 3:6].mean()
267 | self.log(f"val_total_mae/{self.modality_scope}_total_mae", mae_mean, sync_dist=True)
268 | self.log(f"val_pos_mae/{self.modality_scope}_pos_mae", pos_mae, sync_dist=True)
269 | self.log(f"val_orn_mae/{self.modality_scope}_orn_mae", orn_mae, sync_dist=True)
270 | self.log(f"val_act/{self.modality_scope}_act_loss", action_loss, sync_dist=True)
271 | self.log(f"val_grip/{self.modality_scope}_grip_sr", gripper_sr, sync_dist=True)
272 | self.log(
273 | "val_act/action_loss",
274 | val_total_act_loss / len(self.trainer.datamodule.modalities), # type:ignore
275 | sync_dist=True,
276 | )
277 | output[f"idx_{self.modality_scope}"] = dataset_batch["idx"]
278 |
279 | return output
280 |
281 | def reset(self):
282 | """
283 | Call this at the beginning of a new rollout when doing inference.
284 | """
285 | self.latent_goal = None
286 |
287 | def step(self, obs, goal):
288 | """
289 | Do one step of inference with the model.
290 |
291 | Args:
292 | obs (dict): Observation from environment.
293 | goal (dict): Goal as visual observation or embedded language instruction.
294 |
295 | Returns:
296 | Predicted action.
297 | """
298 | with torch.no_grad():
299 | if self.latent_goal is None:
300 | if isinstance(goal, str):
301 | embedded_lang = torch.from_numpy(self.lang_embeddings[goal]).to(self.device).squeeze(0).float()
302 | self.latent_goal = self.language_goal(embedded_lang)
303 | else:
304 | imgs = {
305 | k: torch.cat([v, goal["rgb_obs"][k]], dim=1) for k, v in obs["rgb_obs"].items()
306 | } # (1, 2, C, H, W)
307 | depth_imgs = {k: torch.cat([v, goal["depth_obs"][k]], dim=1) for k, v in obs["depth_obs"].items()}
308 | state = torch.cat([obs["robot_obs"], goal["robot_obs"]], dim=1)
309 | perceptual_emb = self.perceptual_encoder(imgs, depth_imgs, state)
310 | self.latent_goal = self.visual_goal(perceptual_emb[:, -1])
311 |
312 | perceptual_emb = self.perceptual_encoder(obs["rgb_obs"], obs["depth_obs"], obs["robot_obs"])
313 | empty_plan = torch.empty(1, 0).to(self.device)
314 | action = self.action_decoder.act(
315 | empty_plan, perceptual_emb, self.latent_goal, obs["robot_obs_raw"]
316 | ) # type: ignore
317 | return action
318 |
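GCBC inherits the full Hulc training logic but sets `action_decoder.plan_features = 0`, so the decoder receives an empty plan tensor that contributes no features. A rough sketch of what happens to that empty plan inside the (logistic) decoder's forward pass:

import torch

batch_size, seq_len = 4, 16
empty_plan = torch.empty(batch_size, 0)                          # as built in training_step / step
latent_plan = empty_plan.unsqueeze(1).expand(-1, seq_len, -1)    # (4, 16, 0)
perceptual_emb = torch.randn(batch_size, seq_len, 128)
latent_goal = torch.randn(batch_size, 32).unsqueeze(1).expand(-1, seq_len, -1)
x = torch.cat([latent_plan, perceptual_emb, latent_goal], dim=-1)  # (4, 16, 160): the plan adds nothing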
--------------------------------------------------------------------------------
/hulc/models/perceptual_encoders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/perceptual_encoders/__init__.py
--------------------------------------------------------------------------------
/hulc/models/perceptual_encoders/concat_encoders.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Optional
2 |
3 | import hydra
4 | from omegaconf import DictConfig
5 | import torch
6 | import torch.nn as nn
7 | from torch.nn.functional import mse_loss
8 |
9 |
10 | class ConcatEncoders(nn.Module):
11 | def __init__(
12 | self,
13 | rgb_static: DictConfig,
14 | proprio: DictConfig,
15 | device: torch.device,
16 | depth_static: Optional[DictConfig] = None,
17 | rgb_gripper: Optional[DictConfig] = None,
18 | depth_gripper: Optional[DictConfig] = None,
19 | tactile: Optional[DictConfig] = None,
20 | state_decoder: Optional[DictConfig] = None,
21 | ):
22 | super().__init__()
23 | self._latent_size = rgb_static.visual_features
24 | if rgb_gripper:
25 | self._latent_size += rgb_gripper.visual_features
26 | if depth_static:
27 | self._latent_size += depth_static.visual_features
28 | if depth_gripper:
29 | self._latent_size += depth_gripper.visual_features
30 | if tactile:
31 | self._latent_size += tactile.visual_features
32 | visual_features = self._latent_size
33 | # super ugly, fix this clip ddp thing in a better way
34 | if "clip" in rgb_static["_target_"]:
35 | self.rgb_static_encoder = hydra.utils.instantiate(rgb_static, device=device)
36 | else:
37 | self.rgb_static_encoder = hydra.utils.instantiate(rgb_static)
38 | self.depth_static_encoder = hydra.utils.instantiate(depth_static) if depth_static else None
39 | self.rgb_gripper_encoder = hydra.utils.instantiate(rgb_gripper) if rgb_gripper else None
40 | self.depth_gripper_encoder = hydra.utils.instantiate(depth_gripper) if depth_gripper else None
41 | self.tactile_encoder = hydra.utils.instantiate(tactile)
42 | self.proprio_encoder = hydra.utils.instantiate(proprio)
43 | if self.proprio_encoder:
44 | self._latent_size += self.proprio_encoder.out_features
45 |
46 | self.state_decoder = None
47 | if state_decoder:
48 | state_decoder.visual_features = visual_features
49 | state_decoder.n_state_obs = self.proprio_encoder.out_features
50 | self.state_decoder = hydra.utils.instantiate(state_decoder)
51 |
52 | self.current_visual_embedding = None
53 | self.current_state_obs = None
54 |
55 | @property
56 | def latent_size(self):
57 | return self._latent_size
58 |
59 | def forward(
60 | self, imgs: Dict[str, torch.Tensor], depth_imgs: Dict[str, torch.Tensor], state_obs: torch.Tensor
61 | ) -> torch.Tensor:
62 | rgb_static = imgs["rgb_static"]
63 | rgb_gripper = imgs["rgb_gripper"] if "rgb_gripper" in imgs else None
64 | rgb_tactile = imgs["rgb_tactile"] if "rgb_tactile" in imgs else None
65 | depth_static = depth_imgs["depth_static"] if "depth_static" in depth_imgs else None
66 | depth_gripper = depth_imgs["depth_gripper"] if "depth_gripper" in depth_imgs else None
67 |
68 | b, s, c, h, w = rgb_static.shape
69 | rgb_static = rgb_static.reshape(-1, c, h, w) # (batch_size * sequence_length, 3, 200, 200)
70 | # ------------ Vision Network ------------ #
71 | encoded_imgs = self.rgb_static_encoder(rgb_static) # (batch*seq_len, 64)
72 | encoded_imgs = encoded_imgs.reshape(b, s, -1) # (batch, seq, 64)
73 |
74 | if depth_static is not None:
75 | depth_static = torch.unsqueeze(depth_static, 2)
76 | depth_static = depth_static.reshape(-1, 1, h, w) # (batch_size * sequence_length, 1, 200, 200)
77 | encoded_depth_static = self.depth_static_encoder(depth_static) # (batch*seq_len, 64)
78 | encoded_depth_static = encoded_depth_static.reshape(b, s, -1) # (batch, seq, 64)
79 | encoded_imgs = torch.cat([encoded_imgs, encoded_depth_static], dim=-1)
80 |
81 | if rgb_gripper is not None:
82 | b, s, c, h, w = rgb_gripper.shape
83 | rgb_gripper = rgb_gripper.reshape(-1, c, h, w) # (batch_size * sequence_length, 3, 84, 84)
84 | encoded_imgs_gripper = self.rgb_gripper_encoder(rgb_gripper) # (batch*seq_len, 64)
85 | encoded_imgs_gripper = encoded_imgs_gripper.reshape(b, s, -1) # (batch, seq, 64)
86 | encoded_imgs = torch.cat([encoded_imgs, encoded_imgs_gripper], dim=-1)
87 | if depth_gripper is not None:
88 | depth_gripper = torch.unsqueeze(depth_gripper, 2)
89 | depth_gripper = depth_gripper.reshape(-1, 1, h, w) # (batch_size * sequence_length, 1, 84, 84)
90 | encoded_depth_gripper = self.depth_gripper_encoder(depth_gripper)
91 | encoded_depth_gripper = encoded_depth_gripper.reshape(b, s, -1) # (batch, seq, 64)
92 | encoded_imgs = torch.cat([encoded_imgs, encoded_depth_gripper], dim=-1)
93 |
94 | if rgb_tactile is not None:
95 | b, s, c, h, w = rgb_tactile.shape
96 | rgb_tactile = rgb_tactile.reshape(-1, c, h, w) # (batch_size * sequence_length, 6, h, w)
97 | encoded_tactile = self.tactile_encoder(rgb_tactile)
98 | encoded_tactile = encoded_tactile.reshape(b, s, -1)
99 | encoded_imgs = torch.cat([encoded_imgs, encoded_tactile], dim=-1)
100 |
101 | self.current_visual_embedding = encoded_imgs
102 | self.current_state_obs = state_obs # type: ignore
103 | if self.proprio_encoder:
104 | state_obs_out = self.proprio_encoder(state_obs)
105 | perceptual_emb = torch.cat([encoded_imgs, state_obs_out], dim=-1)
106 | else:
107 | perceptual_emb = encoded_imgs
108 |
109 | return perceptual_emb
110 |
111 | def state_reconstruction_loss(self):
112 | assert self.state_decoder is not None
113 | proprio_pred = self.state_decoder(self.current_visual_embedding)
114 | return mse_loss(self.current_state_obs, proprio_pred)
115 |
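`latent_size` is simply the sum of the feature sizes of all enabled visual encoders plus the proprioception encoder's output; e.g. with purely illustrative sizes:

rgb_static_features = 64
rgb_gripper_features = 64
proprio_features = 8     # IdentityEncoder.out_features
latent_size = rgb_static_features + rgb_gripper_features + proprio_features  # 136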
--------------------------------------------------------------------------------
/hulc/models/perceptual_encoders/proprio_encoder.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from torch import nn
3 |
4 |
5 | class IdentityEncoder(nn.Module):
6 | def __init__(self, proprioception_dims):
7 | super(IdentityEncoder, self).__init__()
8 | # remove a dimension if we convert robot orientation quaternion to euler angles
9 | self.n_state_obs = int(np.sum(np.diff([list(x) for x in [list(y) for y in proprioception_dims.keep_indices]])))
10 | self.identity = nn.Identity()
11 |
12 | @property
13 | def out_features(self):
14 | return self.n_state_obs
15 |
16 | def forward(self, x):
17 | return self.identity(x)
18 |
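`n_state_obs` is derived from the `keep_indices` intervals of the proprioception config: each `[start, end)` pair contributes `end - start` dimensions. A hypothetical example:

import numpy as np

keep_indices = [[0, 7], [14, 15]]   # e.g. TCP pose + gripper width (purely illustrative)
n_state_obs = int(np.sum(np.diff([list(x) for x in keep_indices])))
print(n_state_obs)                  # 7 + 1 = 8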
--------------------------------------------------------------------------------
/hulc/models/perceptual_encoders/tactile_encoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torchvision.models as models
5 |
6 |
7 | class TactileEncoder(nn.Module):
8 | def __init__(self, visual_features: int, freeze_tactile_backbone: bool = True):
9 | super(TactileEncoder, self).__init__()
10 | # Load pre-trained resnet-18
11 | net = models.resnet18(pretrained=True)
12 | # Remove the last fc layer, and rebuild
13 | modules = list(net.children())[:-1]
14 | self.net = nn.Sequential(*modules)
15 | if freeze_tactile_backbone:
16 | for param in self.net.parameters():
17 | param.requires_grad = False
18 | self.fc1 = nn.Linear(1024, 512)
19 | self.fc2 = nn.Linear(512, visual_features)
20 |
21 | def forward(self, x: torch.Tensor) -> torch.Tensor:
22 | x_l = self.net(x[:, :3, :, :]).squeeze()
23 | x_r = self.net(x[:, 3:, :, :]).squeeze()
24 | x = torch.cat((x_l, x_r), dim=-1)
25 | # Add fc layer for final prediction
26 | output = F.relu(self.fc1(x)) # batch, 512
27 | output = self.fc2(output) # batch, 64
28 | return output
29 |
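The encoder expects a 6-channel input (left and right tactile RGB images stacked along the channel axis); each half passes through the shared, frozen ResNet-18 trunk, and the two 512-dim features are concatenated, which is where the 1024 in `fc1` comes from. A rough shape check (torchvision downloads the pretrained weights; the resolution is illustrative):

import torch

from hulc.models.perceptual_encoders.tactile_encoder import TactileEncoder

enc = TactileEncoder(visual_features=64)
x = torch.randn(4, 6, 160, 120)   # (batch, 2 x RGB, H, W)
print(enc(x).shape)               # torch.Size([4, 64])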
--------------------------------------------------------------------------------
/hulc/models/perceptual_encoders/vision_clip.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from hulc.models.perceptual_encoders.clip import load_clip
6 |
7 |
8 | class VisionClip(nn.Module):
9 | def __init__(
10 | self, device: torch.device, visual_features: int, freeze_backbone: bool = True, model_name: str = "RN50"
11 | ):
12 | super(VisionClip, self).__init__()
13 | # Load CLIP model
14 | print(f"loading vision CLIP model with backbone: {model_name}")
15 | self.clip_model, _ = load_clip(model_name, device=device)
16 | if freeze_backbone:
17 | for param in self.clip_model.parameters():
18 | param.requires_grad = False
19 | if "RN50" in model_name:
20 | self.fc1 = nn.Linear(1024, 512)
21 | self.fc2 = nn.Linear(512, visual_features)
22 | elif "ViT-B/32" in model_name:
23 | self.fc1 = nn.Linear(512, 256)
24 | self.fc2 = nn.Linear(256, visual_features)
25 |
26 | def forward(self, x: torch.Tensor) -> torch.Tensor:
27 | x = self.clip_model.encode_image(x) # type:ignore
28 | output = F.relu(self.fc1(x)) # batch, 512
29 | output = self.fc2(output) # batch, 64
30 | return output
31 |
--------------------------------------------------------------------------------
/hulc/models/perceptual_encoders/vision_network.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from typing import Optional, Tuple
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | from torch.nn.parameter import Parameter
9 |
10 |
11 | class VisionNetwork(nn.Module):
12 | # reference: https://arxiv.org/pdf/2005.07648.pdf
13 | def __init__(
14 | self,
15 | input_width: int,
16 | input_height: int,
17 | activation_function: str,
18 | dropout_vis_fc: float,
19 | l2_normalize_output: bool,
20 | visual_features: int,
21 | num_c: int,
22 | use_sinusoid: bool,
23 | spatial_softmax_temp: float,
24 | ):
25 | super(VisionNetwork, self).__init__()
26 | self.l2_normalize_output = l2_normalize_output
27 | self.act_fn = getattr(nn, activation_function)()
28 | # w,h,kernel_size,padding,stride
29 | w, h = self.calc_out_size(input_width, input_height, 8, 0, 4)
30 | w, h = self.calc_out_size(w, h, 4, 0, 2)
31 | w, h = self.calc_out_size(w, h, 3, 0, 1)
32 | self.use_sinusoid = use_sinusoid
33 | temp = None if not isinstance(spatial_softmax_temp, float) else spatial_softmax_temp
34 | self.spatial_softmax = SpatialSoftmax(num_rows=w, num_cols=h, temperature=temp) # shape: [N, 128]
35 | # model
36 | self.conv_model = nn.Sequential(
37 | # input shape: [N, 3, 200, 200]
38 | nn.Conv2d(in_channels=num_c, out_channels=32, kernel_size=8, stride=4), # shape: [N, 32, 49, 49]
39 | # nn.BatchNorm2d(32),
40 | self.act_fn,
41 | nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2), # shape: [N, 64, 23, 23]
42 | # nn.BatchNorm2d(64),
43 | self.act_fn,
44 | nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1), # shape: [N, 64, 21, 21]
45 | # nn.BatchNorm2d(64),
46 | self.act_fn,
47 | )
48 | k = 3 if self.use_sinusoid else 1
49 | self.fc1 = nn.Sequential(
50 | nn.Linear(in_features=128 * k, out_features=512), self.act_fn, nn.Dropout(dropout_vis_fc)
51 | ) # shape: [N, 512]
52 | self.fc2 = nn.Linear(in_features=512, out_features=visual_features) # shape: [N, 64]
53 | self.ln = nn.LayerNorm(visual_features)
54 |
55 | def forward(self, x: torch.Tensor) -> torch.Tensor:
56 | x = self.conv_model(x)
57 | x = self.spatial_softmax(x)
58 | if self.use_sinusoid:
59 | x = torch.cat([x, torch.sin(x), torch.cos(x)], 1)
60 | x = self.fc1(x)
61 | x = self.fc2(x)
62 | if self.l2_normalize_output:
63 | x = F.normalize(x, p=2, dim=1)
64 | x = self.ln(x)
65 | return x # shape: [N, 64]
66 |
67 | @staticmethod
68 | def calc_out_size(w: int, h: int, kernel_size: int, padding: int, stride: int) -> Tuple[int, int]:
69 | width = (w - kernel_size + 2 * padding) // stride + 1
70 | height = (h - kernel_size + 2 * padding) // stride + 1
71 | return width, height
72 |
73 |
74 | class SpatialSoftmax(nn.Module):
75 | def __init__(self, num_rows: int, num_cols: int, temperature: Optional[float] = None):
76 | """
77 | Computes the spatial softmax of a convolutional feature map.
78 | Read more here:
79 | "Learning visual feature spaces for robotic manipulation with
80 | deep spatial autoencoders." Finn et al., http://arxiv.org/abs/1509.06113.
81 | :param num_rows: size related to original image width
82 | :param num_cols: size related to original image height
83 | :param temperature: Softmax temperature (optional). If None, a learnable temperature is created.
84 | """
85 | super(SpatialSoftmax, self).__init__()
86 | self.num_rows = num_rows
87 | self.num_cols = num_cols
88 | grid_x, grid_y = torch.meshgrid(
89 | torch.linspace(-1.0, 1.0, num_cols), torch.linspace(-1.0, 1.0, num_rows), indexing="ij"
90 | )
91 | x_map = grid_x.reshape(-1)
92 | y_map = grid_y.reshape(-1)
93 | self.register_buffer("x_map", x_map)
94 | self.register_buffer("y_map", y_map)
95 | if temperature:
96 | self.register_buffer("temperature", torch.ones(1) * temperature)
97 | else:
98 | self.temperature = Parameter(torch.ones(1))
99 |
100 | def forward(self, x: torch.Tensor) -> torch.Tensor:
101 | n, c, h, w = x.shape
102 | x = x.contiguous().view(-1, h * w) # (batch * C, H * W)
103 | softmax_attention = F.softmax(x / self.temperature, dim=1) # (batch * C, H * W)
104 | expected_x = torch.sum(self.x_map * softmax_attention, dim=1, keepdim=True)
105 | expected_y = torch.sum(self.y_map * softmax_attention, dim=1, keepdim=True)
106 | expected_xy = torch.cat((expected_x, expected_y), 1)
107 | self.coords = expected_xy.view(-1, c * 2)
108 | return self.coords # batch, C*2
109 |
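The shape comments in `conv_model` follow from `calc_out_size` with the 200x200 static-camera input mentioned in the comments, and the spatial softmax then turns the 64 feature maps into 64 (x, y) keypoints, i.e. the 128 features consumed by `fc1` (times 3 with the sinusoidal augmentation). A quick check:

from hulc.models.perceptual_encoders.vision_network import VisionNetwork

w, h = 200, 200
for kernel, stride in [(8, 4), (4, 2), (3, 1)]:
    w, h = VisionNetwork.calc_out_size(w, h, kernel, 0, stride)
    print(w, h)   # 49 49 -> 23 23 -> 21 21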
--------------------------------------------------------------------------------
/hulc/models/perceptual_encoders/vision_network_gripper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from typing import Tuple
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 |
9 |
10 | def nature_cnn(act_fn, num_c):
11 | return nn.Sequential(
12 | nn.Conv2d(num_c, 32, 8, stride=4),
13 | act_fn,
14 | nn.Conv2d(32, 64, 4, stride=2),
15 | act_fn,
16 | nn.Conv2d(64, 64, 3, stride=1),
17 | act_fn,
18 | nn.Flatten(start_dim=1),
19 | nn.Linear(64 * 7 * 7, 128),
20 | act_fn,
21 | )
22 |
23 |
24 | class VisionNetwork(nn.Module):
25 | def __init__(
26 | self,
27 | input_width: int,
28 | input_height: int,
29 | conv_encoder: str,
30 | activation_function: str,
31 | dropout_vis_fc: float,
32 | l2_normalize_output: bool,
33 | visual_features: int,
34 | num_c: int,
35 | ):
36 | super(VisionNetwork, self).__init__()
37 | self.l2_normalize_output = l2_normalize_output
38 | self.act_fn = getattr(nn, activation_function)()
39 | # model
40 | # this calls the method with the name conv_encoder
41 | self.conv_model = eval(conv_encoder)
42 | self.conv_model = self.conv_model(self.act_fn, num_c)
43 | self.fc1 = nn.Sequential(
44 | nn.Linear(in_features=128, out_features=512), self.act_fn, nn.Dropout(dropout_vis_fc)
45 | ) # shape: [N, 512]
46 | self.fc2 = nn.Linear(in_features=512, out_features=visual_features) # shape: [N, 64]
47 | self.ln = nn.LayerNorm(visual_features)
48 |
49 | def forward(self, x: torch.Tensor) -> torch.Tensor:
50 | x = self.conv_model(x)
51 | x = self.fc1(x)
52 | x = self.fc2(x)
53 | if self.l2_normalize_output:
54 | x = F.normalize(x, p=2, dim=1)
55 | x = self.ln(x)
56 | return x # shape: [N, 64]
57 |
58 | @staticmethod
59 | def calc_out_size(w: int, h: int, kernel_size: int, padding: int, stride: int) -> Tuple[int, int]:
60 | width = (w - kernel_size + 2 * padding) // stride + 1
61 | height = (h - kernel_size + 2 * padding) // stride + 1
62 | return width, height
63 |
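A minimal usage sketch for the gripper-camera encoder, not part of the repository; the constructor arguments are illustrative stand-ins for the Hydra config. Note that nature_cnn hard-codes nn.Linear(64 * 7 * 7, 128), which matches an 84x84 input (84 -> 20 -> 9 -> 7 through the three conv layers).

import torch

from hulc.models.perceptual_encoders.vision_network_gripper import VisionNetwork

encoder = VisionNetwork(
    input_width=84,
    input_height=84,
    conv_encoder="nature_cnn",      # resolved via eval() in __init__
    activation_function="ReLU",
    dropout_vis_fc=0.0,
    l2_normalize_output=False,
    visual_features=64,
    num_c=3,
)
rgb_gripper = torch.randn(2, 3, 84, 84)  # [N, C, H, W]
print(encoder(rgb_gripper).shape)        # torch.Size([2, 64])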
--------------------------------------------------------------------------------
/hulc/models/plan_encoders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/plan_encoders/__init__.py
--------------------------------------------------------------------------------
/hulc/models/plan_encoders/plan_proposal_net.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import torch
3 | import torch.nn as nn
4 |
5 | from hulc.utils.distributions import Distribution, State
6 |
7 |
8 | class PlanProposalNetwork(nn.Module):
9 | def __init__(
10 | self,
11 | perceptual_features: int,
12 | latent_goal_features: int,
13 | plan_features: int,
14 | activation_function: str,
15 | hidden_size: int,
16 | dist: Distribution,
17 | ):
18 | super(PlanProposalNetwork, self).__init__()
19 | self.perceptual_features = perceptual_features
20 | self.latent_goal_features = latent_goal_features
21 | self.plan_features = plan_features
22 | self.hidden_size = hidden_size
23 | self.in_features = self.perceptual_features + self.latent_goal_features
24 | self.act_fn = getattr(nn, activation_function)()
25 | self.dist = dist
26 | self.fc_model = nn.Sequential(
27 | nn.Linear(in_features=self.in_features, out_features=hidden_size), # shape: [N, 136]
28 | # nn.BatchNorm1d(hidden_size),
29 | self.act_fn,
30 | nn.Linear(in_features=hidden_size, out_features=hidden_size),
31 | # nn.BatchNorm1d(hidden_size),
32 | self.act_fn,
33 | nn.Linear(in_features=hidden_size, out_features=hidden_size),
34 | # nn.BatchNorm1d(hidden_size),
35 | self.act_fn,
36 | nn.Linear(in_features=hidden_size, out_features=hidden_size),
37 | # nn.BatchNorm1d(hidden_size),
38 | self.act_fn,
39 | )
40 | self.fc_state = self.dist.build_state(self.hidden_size, self.plan_features)
41 |
42 | def forward(self, initial_percep_emb: torch.Tensor, latent_goal: torch.Tensor) -> State:
43 | x = torch.cat([initial_percep_emb, latent_goal], dim=-1)
44 | x = self.fc_model(x)
45 | my_state = self.fc_state(x)
46 | state = self.dist.forward_dist(my_state)
47 | return state
48 |
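A minimal usage sketch for PlanProposalNetwork, not part of the repository, with illustrative feature sizes and the continuous plan distribution from hulc/utils/distributions.py.

import torch

from hulc.models.plan_encoders.plan_proposal_net import PlanProposalNetwork
from hulc.utils.distributions import Distribution

dist = Distribution(dist="continuous")
plan_proposal = PlanProposalNetwork(
    perceptual_features=64,
    latent_goal_features=32,
    plan_features=256,
    activation_function="ReLU",
    hidden_size=2048,
    dist=dist,
)
percep_emb = torch.randn(4, 64)                        # perceptual embedding of the current observation
latent_goal = torch.randn(4, 32)                       # latent visual or language goal
state = plan_proposal(percep_emb, latent_goal)         # ContState(mean, std), each [4, 256]
plan = dist.sample_latent_plan(dist.get_dist(state))   # sampled latent plan, [4, 256]
print(plan.shape)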
--------------------------------------------------------------------------------
/hulc/models/plan_encoders/plan_recognition_net.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import math
4 | from typing import Tuple
5 |
6 | import torch
7 | import torch.nn as nn
8 |
9 | from hulc.utils.distributions import Distribution, State
10 |
11 |
12 | class PlanRecognitionBiRNNNetwork(nn.Module):
13 | def __init__(
14 | self,
15 | in_features: int,
16 | plan_features: int,
17 | action_space: int,
18 | birnn_dropout_p: float,
19 | dist: Distribution,
20 | rnn_type: str,
21 | ):
22 | super(PlanRecognitionBiRNNNetwork, self).__init__()
23 | self.plan_features = plan_features
24 | self.action_space = action_space
25 | self.in_features = in_features
26 | self.dist = dist
27 | self.birnn_model = eval(rnn_type)(
28 | input_size=self.in_features,
29 | hidden_size=2048,
30 | num_layers=2,
31 | bidirectional=True,
32 | batch_first=True,
33 | dropout=birnn_dropout_p,
34 | ) # shape: [N, seq_len, feat]
35 | self.fc_state = self.dist.build_state(4096, self.plan_features)
36 |
37 | def forward(self, perceptual_emb: torch.Tensor) -> Tuple[State, torch.Tensor]:
38 | x, hn = self.birnn_model(perceptual_emb)
39 |         x = x[:, -1]  # we only need the output of the last time step
40 | my_state = self.fc_state(x)
41 | state = self.dist.forward_dist(my_state)
42 | return state, x
43 |
44 |
45 | class PlanRecognitionTransformersNetwork(nn.Module):
46 | def __init__(
47 | self,
48 | num_heads: int,
49 | num_layers: int,
50 | encoder_hidden_size: int,
51 | fc_hidden_size: int,
52 | plan_features: int,
53 | in_features: int,
54 | action_space: int,
55 | encoder_normalize: bool,
56 | positional_normalize: bool,
57 | position_embedding: bool,
58 | max_position_embeddings: int,
59 |         dropout_p: float,
60 | dist: Distribution,
61 | ):
62 |
63 | super().__init__()
64 | self.in_features = in_features
65 | self.plan_features = plan_features
66 | self.action_space = action_space
67 | self.padding = False
68 | self.dist = dist
69 | self.hidden_size = fc_hidden_size
70 | self.position_embedding = position_embedding
71 | self.encoder_normalize = encoder_normalize
72 | self.positional_normalize = positional_normalize
73 | mod = self.in_features % num_heads
74 | if mod != 0:
75 | print(f"Padding for Num of Heads : {num_heads}")
76 | self.padding = True
77 | self.pad = num_heads - mod
78 | self.in_features += self.pad
79 | if position_embedding:
80 | self.position_embeddings = nn.Embedding(max_position_embeddings, self.in_features)
81 | else:
82 | self.positional_encoder = PositionalEncoding(self.in_features) # TODO: with max window_size
83 | encoder_layer = nn.TransformerEncoderLayer(
84 | self.in_features, num_heads, dim_feedforward=encoder_hidden_size, dropout=dropout_p
85 | )
86 | encoder_norm = nn.LayerNorm(self.in_features) if encoder_normalize else None
87 | if self.positional_normalize:
88 | self.layernorm = nn.LayerNorm(self.in_features)
89 | self.dropout = nn.Dropout(p=dropout_p)
90 | self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers, norm=encoder_norm)
91 | self.fc = nn.Linear(in_features=self.in_features, out_features=fc_hidden_size)
92 | self.fc_state = self.dist.build_state(fc_hidden_size, self.plan_features)
93 |
94 | def forward(self, perceptual_emb: torch.Tensor) -> Tuple[State, torch.Tensor]:
95 | batch_size, seq_len = perceptual_emb.shape[0], perceptual_emb.shape[1]
96 | perceptual_emb = (
97 | torch.cat([perceptual_emb, torch.zeros((batch_size, seq_len, self.pad)).to(perceptual_emb.device)], dim=-1)
98 | if self.padding
99 | else perceptual_emb
100 | )
101 | if self.position_embedding:
102 | position_ids = torch.arange(seq_len, dtype=torch.long, device=perceptual_emb.device).unsqueeze(0)
103 | position_embeddings = self.position_embeddings(position_ids)
104 | x = perceptual_emb + position_embeddings
105 | x = x.permute(1, 0, 2)
106 | else:
107 |             # add sinusoidal positional encoding to the perceptual embedding
108 | x = self.positional_encoder(perceptual_emb.permute(1, 0, 2)) # [s, b, emb]
109 | if self.positional_normalize:
110 | x = self.layernorm(x)
111 | x = self.dropout(x)
112 | x = self.transformer_encoder(x)
113 | x = self.fc(x.permute(1, 0, 2))
114 | x = torch.mean(x, dim=1) # gather all the sequence info
115 | my_state = self.fc_state(x)
116 | state = self.dist.forward_dist(my_state)
117 | return state, x
118 |
119 |
120 | class PositionalEncoding(nn.Module):
121 | """Implementation from: https://pytorch.org/tutorials/beginner/transformer_tutorial.html"""
122 |
123 | def __init__(self, d_model, max_len=5000):
124 | super(PositionalEncoding, self).__init__()
125 |
126 | pe = torch.zeros(max_len, d_model)
127 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
128 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
129 | pe[:, 0::2] = torch.sin(position * div_term)
130 | pe[:, 1::2] = torch.cos(position * div_term) if d_model % 2 == 0 else torch.cos(position * div_term[:-1])
131 | pe = pe.unsqueeze(0).transpose(0, 1)
132 | self.register_buffer("pe", pe)
133 |
134 | def forward(self, x):
135 | x = x + self.pe[: x.size(0), :]
136 | return x
137 |
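A minimal usage sketch for PlanRecognitionTransformersNetwork, not part of the repository; all hyperparameters are illustrative. With in_features divisible by num_heads no padding is added, and with position_embedding=True the sequence length must not exceed max_position_embeddings.

import torch

from hulc.models.plan_encoders.plan_recognition_net import PlanRecognitionTransformersNetwork
from hulc.utils.distributions import Distribution

plan_recognition = PlanRecognitionTransformersNetwork(
    num_heads=8,
    num_layers=2,
    encoder_hidden_size=2048,
    fc_hidden_size=4096,
    plan_features=256,
    in_features=72,
    action_space=7,
    encoder_normalize=False,
    positional_normalize=False,
    position_embedding=True,
    max_position_embeddings=16,
    dropout_p=0.1,
    dist=Distribution(dist="continuous"),
)
perceptual_emb = torch.randn(4, 16, 72)      # [N, seq_len, feat]
state, x = plan_recognition(perceptual_emb)
print(state.mean.shape, x.shape)             # torch.Size([4, 256]) torch.Size([4, 4096])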
--------------------------------------------------------------------------------
/hulc/training.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 | import logging
3 | from pathlib import Path
4 | import sys
5 | from typing import List, Union
6 |
7 | from lightning_lite.accelerators.cuda import num_cuda_devices
8 | from pytorch_lightning.strategies import DDPStrategy
9 |
10 | # This is for using the locally installed repo clone when using slurm
11 | sys.path.insert(0, Path(__file__).absolute().parents[1].as_posix())
12 | from calvin_agent.utils.utils import get_git_commit_hash, get_last_checkpoint, print_system_env_info
13 | import hydra
14 | from omegaconf import DictConfig, ListConfig, OmegaConf
15 | from pytorch_lightning import Callback, LightningModule, seed_everything, Trainer
16 | from pytorch_lightning.callbacks import LearningRateMonitor
17 | from pytorch_lightning.loggers import Logger
18 | from pytorch_lightning.utilities import rank_zero_only
19 |
20 | import hulc
21 | import hulc.models.hulc as models_m
22 | from hulc.utils.utils import initialize_pretrained_weights
23 |
24 | logger = logging.getLogger(__name__)
25 |
26 |
27 | @hydra.main(config_path="../conf", config_name="config")
28 | def train(cfg: DictConfig) -> None:
29 | """
30 | This is called to start a training.
31 |
32 | Args:
33 | cfg: hydra config
34 | """
35 | # sets seeds for numpy, torch, python.random and PYTHONHASHSEED.
36 | seed_everything(cfg.seed, workers=True) # type: ignore
37 | datamodule = hydra.utils.instantiate(cfg.datamodule, training_repo_root=Path(hulc.__file__).parents[1])
38 | chk = get_last_checkpoint(Path.cwd())
39 |
40 | # Load Model
41 | if chk is not None:
42 | model = getattr(models_m, cfg.model["_target_"].split(".")[-1]).load_from_checkpoint(chk.as_posix())
43 | else:
44 | model = hydra.utils.instantiate(cfg.model)
45 | if "pretrain_chk" in cfg:
46 | initialize_pretrained_weights(model, cfg)
47 |
48 | log_rank_0(f"Training with the following config:\n{OmegaConf.to_yaml(cfg)}")
49 | log_rank_0("Repo commit hash: {}".format(get_git_commit_hash(Path(hydra.utils.to_absolute_path(__file__)))))
50 | log_rank_0(print_system_env_info())
51 |
52 | train_logger = setup_logger(cfg, model)
53 | callbacks = setup_callbacks(cfg.callbacks)
54 | lr_logger = LearningRateMonitor(logging_interval="step")
55 | callbacks.append(lr_logger)
56 |
57 | trainer_args = {
58 | **cfg.trainer,
59 | "logger": train_logger,
60 | "callbacks": callbacks,
61 | "benchmark": False,
62 | }
63 |
64 | # Configure multi-GPU training
65 | if is_multi_gpu_training(trainer_args["devices"]):
66 | # increase default timeout for loading data into shared memory
67 | trainer_args["strategy"] = DDPStrategy(find_unused_parameters=False, timeout=timedelta(seconds=3600))
68 | if not cfg.slurm:
69 | modify_argv_hydra()
70 |
71 | trainer = Trainer(**trainer_args)
72 |
73 | # Start training
74 | trainer.fit(model, datamodule=datamodule, ckpt_path=chk) # type: ignore
75 |
76 |
77 | def setup_callbacks(callbacks_cfg: DictConfig) -> List[Callback]:
78 | """
79 | Instantiate all training callbacks.
80 |
81 | Args:
82 | callbacks_cfg: DictConfig with all callback params
83 |
84 | Returns:
85 | List of instantiated callbacks.
86 | """
87 | callbacks = [hydra.utils.instantiate(cb) for cb in callbacks_cfg.values()]
88 | return callbacks
89 |
90 |
91 | def setup_logger(cfg: DictConfig, model: LightningModule) -> Logger:
92 | """
93 | Set up the logger (tensorboard or wandb) from hydra config.
94 |
95 | Args:
96 | cfg: Hydra config
97 | model: LightningModule
98 |
99 | Returns:
100 | logger
101 | """
102 | pathlib_cwd = Path.cwd()
103 | if "group" in cfg.logger:
104 | cfg.logger.group = pathlib_cwd.parent.name
105 | cfg.logger.name = pathlib_cwd.parent.name + "/" + pathlib_cwd.name
106 | cfg.logger.id = cfg.logger.name.replace("/", "_")
107 | train_logger = hydra.utils.instantiate(cfg.logger)
108 | # train_logger.watch(model)
109 | else:
110 | train_logger = hydra.utils.instantiate(cfg.logger)
111 | return train_logger
112 |
113 |
114 | def modify_argv_hydra() -> None:
115 | """
116 | To make hydra work with pytorch-lightning and ddp, we modify sys.argv for the child processes spawned with ddp.
117 | This is only used when NOT using slurm.
118 | """
119 | cwd = Path.cwd().as_posix()
120 | cwd = f'"{cwd}"'
121 | sys.argv = sys.argv[:1]
122 | sys.argv.extend(
123 | [
124 | f"hydra.run.dir={cwd}",
125 | "hydra/hydra_logging=disabled",
126 | "hydra/job_logging=disabled",
127 | ]
128 | )
129 | overrides = OmegaConf.load(".hydra/overrides.yaml")
130 | for o in overrides:
131 | if "hydra/sweeper" in o: # type: ignore
132 | continue
133 |
134 | if "hydra/launcher" in o: # type: ignore
135 | continue
136 |
137 | sys.argv.append(o) # type: ignore
138 |
139 |
140 | def is_multi_gpu_training(devices: Union[int, str, ListConfig]) -> bool:
141 | """
142 | Check if training on multiple GPUs.
143 | See https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html#devices
144 |
145 | Args:
146 | devices: int, str or ListConfig specifying devices
147 |
148 | Returns:
149 | True if multi-gpu training (ddp), False otherwise.
150 | """
151 | num_gpu_available = num_cuda_devices()
152 | if isinstance(devices, int):
153 | return devices > 1 or (devices == -1 and num_gpu_available > 1)
154 | elif isinstance(devices, str) and devices == "auto":
155 | return num_gpu_available > 1
156 | elif isinstance(devices, str):
157 | return len(devices) > 1
158 | elif isinstance(devices, ListConfig):
159 | return len(devices) > 1
160 | else:
161 | raise ValueError
162 |
163 |
164 | @rank_zero_only
165 | def log_rank_0(*args, **kwargs):
166 | # when using ddp, only log with rank 0 process
167 | logger.info(*args, **kwargs)
168 |
169 |
170 | if __name__ == "__main__":
171 | train()
172 |
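A small sketch, not part of the repository, of how different trainer.devices values map to multi-GPU (DDP) training via is_multi_gpu_training; the GPU-dependent results depend on what num_cuda_devices() reports on the machine running it.

from omegaconf import ListConfig

from hulc.training import is_multi_gpu_training

print(is_multi_gpu_training(1))                   # False: a single device never triggers DDP
print(is_multi_gpu_training(4))                   # True: more than one device requested
print(is_multi_gpu_training(-1))                  # True only if more than one GPU is visible
print(is_multi_gpu_training(ListConfig([0, 1])))  # True: explicit list of two devices
print(is_multi_gpu_training("auto"))              # True only if more than one GPU is visible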
--------------------------------------------------------------------------------
/hulc/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/utils/__init__.py
--------------------------------------------------------------------------------
/hulc/utils/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/utils/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/hulc/utils/clip_tokenizer.py:
--------------------------------------------------------------------------------
1 | from functools import lru_cache
2 | import gzip
3 | import html
4 | import os
5 |
6 | import ftfy
7 | import regex as re
8 |
9 |
10 | @lru_cache()
11 | def default_bpe():
12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
13 |
14 |
15 | @lru_cache()
16 | def bytes_to_unicode():
17 | """
18 | Returns list of utf-8 byte and a corresponding list of unicode strings.
19 | The reversible bpe codes work on unicode strings.
20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
22 |     This is a significant percentage of your normal, say, 32K bpe vocab.
23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
24 | And avoids mapping to whitespace/control characters the bpe code barfs on.
25 | """
26 | bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
27 | cs = bs[:]
28 | n = 0
29 | for b in range(2 ** 8):
30 | if b not in bs:
31 | bs.append(b)
32 | cs.append(2 ** 8 + n)
33 | n += 1
34 | cs = [chr(n) for n in cs]
35 | return dict(zip(bs, cs))
36 |
37 |
38 | def get_pairs(word):
39 | """Return set of symbol pairs in a word.
40 | Word is represented as tuple of symbols (symbols being variable-length strings).
41 | """
42 | pairs = set()
43 | prev_char = word[0]
44 | for char in word[1:]:
45 | pairs.add((prev_char, char))
46 | prev_char = char
47 | return pairs
48 |
49 |
50 | def basic_clean(text):
51 | text = ftfy.fix_text(text)
52 | text = html.unescape(html.unescape(text))
53 | return text.strip()
54 |
55 |
56 | def whitespace_clean(text):
57 | text = re.sub(r"\s+", " ", text)
58 | text = text.strip()
59 | return text
60 |
61 |
62 | class SimpleTokenizer(object):
63 | def __init__(self, bpe_path: str = default_bpe()):
64 | self.byte_encoder = bytes_to_unicode()
65 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
66 | merges = gzip.open(bpe_path).read().decode("utf-8").split("\n")
67 | merges = merges[1 : 49152 - 256 - 2 + 1]
68 | merges = [tuple(merge.split()) for merge in merges] # type:ignore
69 | vocab = list(bytes_to_unicode().values())
70 |         vocab = vocab + [v + "</w>" for v in vocab]
71 | for merge in merges:
72 | vocab.append("".join(merge))
73 | vocab.extend(["<|startoftext|>", "<|endoftext|>"])
74 | self.encoder = dict(zip(vocab, range(len(vocab))))
75 | self.decoder = {v: k for k, v in self.encoder.items()}
76 | self.bpe_ranks = dict(zip(merges, range(len(merges))))
77 | self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"}
78 | self.pat = re.compile(
79 | r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
80 | re.IGNORECASE,
81 | )
82 |
83 | def bpe(self, token):
84 | if token in self.cache:
85 | return self.cache[token]
86 |         word = tuple(token[:-1]) + (token[-1] + "</w>",)
87 | pairs = get_pairs(word)
88 |
89 | if not pairs:
90 |             return token + "</w>"
91 |
92 | while True:
93 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
94 | if bigram not in self.bpe_ranks:
95 | break
96 | first, second = bigram
97 | new_word = []
98 | i = 0
99 | while i < len(word):
100 | try:
101 | j = word.index(first, i)
102 | new_word.extend(word[i:j])
103 | i = j
104 | except Exception as ex:
105 | new_word.extend(word[i:])
106 |                     print(ex.args)
107 | break
108 |
109 | if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
110 | new_word.append(first + second)
111 | i += 2
112 | else:
113 | new_word.append(word[i])
114 | i += 1
115 | new_word = tuple(new_word)
116 | word = new_word
117 | if len(word) == 1:
118 | break
119 | else:
120 | pairs = get_pairs(word)
121 | word = " ".join(word)
122 | self.cache[token] = word
123 | return word
124 |
125 | def encode(self, text):
126 | bpe_tokens = []
127 | text = whitespace_clean(basic_clean(text)).lower()
128 | for token in re.findall(self.pat, text):
129 | token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
130 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" "))
131 | return bpe_tokens
132 |
133 | def decode(self, tokens):
134 | text = "".join([self.decoder[token] for token in tokens])
135 |         text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors="replace").replace("</w>", " ")
136 | return text
137 |
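A minimal usage sketch for SimpleTokenizer, not part of the repository; the sentence is an arbitrary example. decode replaces the "</w>" end-of-word markers with spaces, so the round trip returns the input with a trailing space.

from hulc.utils.clip_tokenizer import SimpleTokenizer

tokenizer = SimpleTokenizer()              # uses the bundled bpe_simple_vocab_16e6.txt.gz by default
token_ids = tokenizer.encode("lift the red block")
print(token_ids)                           # list of BPE token ids
print(tokenizer.decode(token_ids))         # "lift the red block "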
--------------------------------------------------------------------------------
/hulc/utils/distributions.py:
--------------------------------------------------------------------------------
1 | from collections import namedtuple
2 | from typing import Union
3 |
4 | import torch
5 | from torch.distributions import Independent, Normal, OneHotCategoricalStraightThrough # type: ignore
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 |
9 | DiscState = namedtuple("DiscState", ["logit"])
10 | ContState = namedtuple("ContState", ["mean", "std"])
11 |
12 | State = Union[DiscState, ContState]
13 |
14 |
15 | class Distribution:
16 | def __init__(self, **kwargs):
17 | self.dist = kwargs.get("dist")
18 | assert self.dist == "discrete" or self.dist == "continuous"
19 | if self.dist == "discrete":
20 | self.category_size = kwargs.get("category_size")
21 | self.class_size = kwargs.get("class_size")
22 |
23 | def get_dist(self, state):
24 | if self.dist == "discrete":
25 | shape = state.logit.shape
26 | logits = torch.reshape(state.logit, shape=(*shape[:-1], self.category_size, self.class_size))
27 | return Independent(OneHotCategoricalStraightThrough(logits=logits), 1)
28 | elif self.dist == "continuous":
29 | return Independent(Normal(state.mean, state.std), 1)
30 |
31 | def detach_state(self, state):
32 | if self.dist == "discrete":
33 | return DiscState(state.logit.detach())
34 | elif self.dist == "continuous":
35 | return ContState(state.mean.detach(), state.std.detach())
36 |
37 | def sample_latent_plan(self, distribution):
38 | sampled_plan = distribution.sample()
39 | if self.dist == "discrete":
40 | sampled_plan = torch.flatten(sampled_plan, start_dim=-2, end_dim=-1)
41 | return sampled_plan
42 |
43 | def build_state(self, hidden_size, plan_features):
44 | fc_state = []
45 | if self.dist == "discrete":
46 | fc_state += [nn.Linear(hidden_size, plan_features)]
47 | elif self.dist == "continuous":
48 | fc_state += [nn.Linear(hidden_size, 2 * plan_features)]
49 | return nn.Sequential(*fc_state)
50 |
51 | def forward_dist(self, x):
52 | if self.dist == "discrete":
53 | prior_logit = x
54 | state = DiscState(prior_logit) # type: State
55 | elif self.dist == "continuous":
56 | mean, var = torch.chunk(x, 2, dim=-1)
57 | min_std = 0.0001
58 | std = F.softplus(var) + min_std
59 | state = ContState(mean, std)
60 | return state
61 |
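A minimal usage sketch for both plan distributions, not part of the repository; sizes are illustrative. For the discrete case, plan_features must equal category_size * class_size.

import torch

from hulc.utils.distributions import Distribution

# Continuous latent plan: build_state outputs 2 * plan_features, split into mean and std.
cont = Distribution(dist="continuous")
fc_state = cont.build_state(hidden_size=32, plan_features=8)
state = cont.forward_dist(fc_state(torch.randn(4, 32)))      # ContState(mean, std), each [4, 8]
plan = cont.sample_latent_plan(cont.get_dist(state))         # [4, 8]

# Discrete latent plan: logits are reshaped to (category_size, class_size) one-hot categoricals.
disc = Distribution(dist="discrete", category_size=4, class_size=8)
fc_state_d = disc.build_state(hidden_size=32, plan_features=4 * 8)
state_d = disc.forward_dist(fc_state_d(torch.randn(4, 32)))  # DiscState(logit), [4, 32]
plan_d = disc.sample_latent_plan(disc.get_dist(state_d))     # flattened straight-through sample, [4, 32]
print(plan.shape, plan_d.shape)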
--------------------------------------------------------------------------------
/hulc/utils/kl_callbacks.py:
--------------------------------------------------------------------------------
1 | from pytorch_lightning import Callback, LightningModule, Trainer
2 | import torch
3 |
4 |
5 | def sigmoid(scale: float, shift: float, x: int) -> float:
6 | return torch.sigmoid(torch.Tensor([(x - shift) / (scale / 12)])).item()
7 |
8 |
9 | class KLSchedule(Callback):
10 | """
11 | Base class for KL Annealing
12 | """
13 |
14 | def __init__(self, start_epoch: int, end_epoch: int, max_kl_beta: float):
15 | self.start_epoch = start_epoch
16 | self.end_epoch = end_epoch
17 | self.max_kl_beta = max_kl_beta
18 |
19 | def on_train_epoch_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
20 | epoch = pl_module.current_epoch
21 | kl_beta = self._anneal_fn(epoch)
22 | pl_module.set_kl_beta(kl_beta) # type: ignore
23 |
24 | def _anneal_fn(self, epoch):
25 | raise NotImplementedError
26 |
27 |
28 | class KLConstantSchedule(KLSchedule):
29 | def __init__(self):
30 | pass
31 |
32 | def on_train_epoch_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
33 | pass
34 |
35 | def _anneal_fn(self, epoch: int) -> None:
36 | pass
37 |
38 |
39 | class KLSigmoidSchedule(KLSchedule):
40 | def _anneal_fn(self, epoch: int) -> float:
41 | if epoch < self.start_epoch:
42 | kl_beta = 0.0
43 | elif epoch > self.end_epoch:
44 | kl_beta = self.max_kl_beta
45 | else:
46 | scale = self.end_epoch - self.start_epoch
47 | shift = (self.end_epoch + self.start_epoch) / 2
48 | kl_beta = sigmoid(scale=scale, shift=shift, x=epoch) * self.max_kl_beta
49 | return kl_beta
50 |
51 |
52 | class KLLinearSchedule(KLSchedule):
53 | def _anneal_fn(self, epoch: int) -> float:
54 | if epoch < self.start_epoch:
55 | kl_beta = 0.0
56 | elif epoch > self.end_epoch:
57 | kl_beta = self.max_kl_beta
58 | else:
59 | kl_beta = self.max_kl_beta * (epoch - self.start_epoch) / (self.end_epoch - self.start_epoch)
60 | return kl_beta
61 |
62 |
63 | if __name__ == "__main__":
64 | import matplotlib
65 | import matplotlib.pyplot as plt
66 |
67 | matplotlib.use("TkAgg")
68 | import numpy as np
69 |
70 | kl = KLLinearSchedule(10, 50, 0.1)
71 | x = np.arange(200)
72 | y = [kl._anneal_fn(i) for i in x]
73 | plt.plot(x, y)
74 |
75 | kl2 = KLSigmoidSchedule(10, 50, 0.1)
76 | x = np.arange(200)
77 | y = [kl2._anneal_fn(i) for i in x]
78 | plt.plot(x, y)
79 |
80 | plt.show()
81 |
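A small worked example of the linear schedule, not part of the repository and with illustrative values, complementing the plotting demo above: the KL weight stays at zero before start_epoch, ramps linearly, and is clamped at max_kl_beta afterwards.

from hulc.utils.kl_callbacks import KLLinearSchedule

kl_schedule = KLLinearSchedule(start_epoch=10, end_epoch=50, max_kl_beta=0.1)
print(kl_schedule._anneal_fn(5))    # 0.0   -> before start_epoch
print(kl_schedule._anneal_fn(30))   # 0.05  -> halfway up the ramp: 0.1 * (30 - 10) / (50 - 10)
print(kl_schedule._anneal_fn(100))  # 0.1   -> clamped to max_kl_beta after end_epoch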
--------------------------------------------------------------------------------
/hulc/utils/transforms.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 |
6 |
7 | # source: https://github.com/facebookresearch/drqv2/blob/main/drqv2.py
8 | class RandomShiftsAug(nn.Module):
9 | def __init__(self, pad):
10 | super().__init__()
11 | self.pad = pad
12 |
13 | def forward(self, x):
14 | x = x.float()
15 | n, c, h, w = x.size()
16 | assert h == w
17 | padding = tuple([self.pad] * 4)
18 | x = F.pad(x, padding, "replicate")
19 | eps = 1.0 / (h + 2 * self.pad)
20 | arange = torch.linspace(-1.0 + eps, 1.0 - eps, h + 2 * self.pad, device=x.device, dtype=x.dtype)[:h]
21 | arange = arange.unsqueeze(0).repeat(h, 1).unsqueeze(2)
22 | base_grid = torch.cat([arange, arange.transpose(1, 0)], dim=2)
23 | base_grid = base_grid.unsqueeze(0).repeat(n, 1, 1, 1)
24 |
25 | shift = torch.randint(0, 2 * self.pad + 1, size=(n, 1, 1, 2), device=x.device, dtype=x.dtype)
26 | shift *= 2.0 / (h + 2 * self.pad)
27 |
28 | grid = base_grid + shift
29 | return F.grid_sample(x, grid, padding_mode="zeros", align_corners=False)
30 |
31 |
32 | class RelativeActions(object):
33 | """Transform absolute actions to relative"""
34 |
35 | def __init__(self, max_pos, max_orn):
36 | self.max_pos = max_pos
37 | self.max_orn = max_orn
38 |
39 | @staticmethod
40 | def batch_angle_between(a, b):
41 | diff = b - a
42 | return (diff + np.pi) % (2 * np.pi) - np.pi
43 |
44 | def __call__(self, action_and_obs):
45 | actions, robot_obs = action_and_obs
46 | assert isinstance(actions, np.ndarray)
47 | assert isinstance(robot_obs, np.ndarray)
48 |
49 | rel_pos = actions[:, :3] - robot_obs[:, :3]
50 | rel_pos = np.clip(rel_pos, -self.max_pos, self.max_pos) / self.max_pos
51 |
52 | rel_orn = self.batch_angle_between(robot_obs[:, 3:6], actions[:, 3:6])
53 | rel_orn = np.clip(rel_orn, -self.max_orn, self.max_orn) / self.max_orn
54 |
55 | gripper = actions[:, -1:]
56 | return np.concatenate([rel_pos, rel_orn, gripper], axis=1)
57 |
58 | def __repr__(self):
59 | return self.__class__.__name__ + f"(max_pos={self.max_pos}, max_orn={self.max_orn})"
60 |
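A minimal usage sketch for the two transforms, not part of the repository; image size, batch size, and the max_pos/max_orn thresholds are illustrative. RelativeActions only assumes that the first three columns of robot_obs are the end-effector position and the next three its orientation, mirroring the slicing in __call__.

import numpy as np
import torch

from hulc.utils.transforms import RandomShiftsAug, RelativeActions

# Random pixel shifts on a batch of square images.
aug = RandomShiftsAug(pad=4)
images = torch.rand(8, 3, 84, 84)
print(aug(images).shape)                 # torch.Size([8, 3, 84, 84])

# Absolute world-frame actions -> clipped, normalized relative actions.
rel = RelativeActions(max_pos=0.02, max_orn=0.05)
actions = np.random.uniform(-0.3, 0.3, (32, 7))     # (xyz position, euler angles, gripper)
robot_obs = np.random.uniform(-0.3, 0.3, (32, 15))  # proprioceptive state
print(rel((actions, robot_obs)).shape)              # (32, 7)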
--------------------------------------------------------------------------------
/hulc/utils/utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from calvin_agent.utils.utils import format_sftp_path
4 | from pytorch_lightning.utilities.cloud_io import load as pl_load
5 |
6 |
7 | def initialize_pretrained_weights(model, cfg):
8 | pretrain_chk = pl_load(format_sftp_path(Path(cfg.pretrain_chk)), map_location=lambda storage, loc: storage)
9 | batch_size = model.plan_recognition.position_embeddings.weight.shape[0]
10 | weight = "plan_recognition.position_embeddings.weight"
11 | pretrain_chk["state_dict"][weight] = pretrain_chk["state_dict"][weight][:batch_size]
12 | if "pretrain_exclude_pr" in cfg and cfg.pretrain_exclude_pr:
13 | for key in list(pretrain_chk["state_dict"].keys()):
14 | if key.startswith("plan_recognition"):
15 | del pretrain_chk["state_dict"][key]
16 | model.load_state_dict(pretrain_chk["state_dict"], strict=False)
17 |
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | cd calvin_env/tacto
4 | pip install -e .
5 | cd ..
6 | pip install -e .
7 | cd ..
8 | pip install -e .
9 |
--------------------------------------------------------------------------------
/media/hulc_rollout.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/media/hulc_rollout.gif
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | # https://github.com/psf/black
3 | line-length = 120
4 | target-version = ["py38"]
5 | exclude = "(.eggs|.git|.hg|.mypy_cache|.nox|.tox|.venv|.svn|_build|buck-out|build|dist)"
6 |
7 | [tool.isort]
8 | profile = "black"
9 | line_length = 120
10 | force_sort_within_sections = "True"
11 | order_by_type = "False"
12 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | black
2 | flake8
3 | isort
4 | pre-commit
5 | mypy
6 | pytest
7 | pytest-cov
8 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cmake
2 | wheel
3 | numpy>1.2
4 | hydra-core==1.1.1
5 | hydra-colorlog
6 | matplotlib
7 | opencv-python
8 | omegaconf
9 | plotly
10 | ftfy
11 | pytorch-lightning==1.8.6
12 | lightning_lite
13 | torch==1.13.1
14 | torchvision
15 | MulticoreTSNE
16 | gitpython
17 | scipy
18 | sentence-transformers
19 | gym
20 | moviepy
21 | tqdm
22 | termcolor
23 | wandb
24 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """Setup hulc installation."""
4 |
5 | from os import path as op
6 | import re
7 |
8 | from setuptools import find_packages, setup
9 |
10 |
11 | def _read(f):
12 | return open(op.join(op.dirname(__file__), f)).read() if op.exists(f) else ""
13 |
14 |
15 | _meta = _read("hulc/__init__.py")
16 |
17 |
18 | def find_meta(_meta, string):
19 | l_match = re.search(r"^" + string + r'\s*=\s*"(.*)"', _meta, re.M)
20 | if l_match:
21 | return l_match.group(1)
22 | raise RuntimeError(f"Unable to find {string} string.")
23 |
24 |
25 | install_requires = [
26 | l for l in _read("requirements.txt").split("\n") if l and not l.startswith("#") and not l.startswith("-")
27 | ]
28 |
29 | meta = dict(
30 | name=find_meta(_meta, "__project__"),
31 | version=find_meta(_meta, "__version__"),
32 | license=find_meta(_meta, "__license__"),
33 | description="Hierarchical Universal Language Conditioned Policies",
34 | platforms=("Any"),
35 | zip_safe=False,
36 | keywords="pytorch hulc".split(),
37 | author=find_meta(_meta, "__author__"),
38 | author_email=find_meta(_meta, "__email__"),
39 |     url="https://github.com/mees/hulc",
40 | packages=find_packages(exclude=["tests"]),
41 | install_requires=install_requires,
42 | )
43 |
44 | if __name__ == "__main__":
45 | print("find_package", find_packages(exclude=["tests"]))
46 | setup(**meta)
47 |
--------------------------------------------------------------------------------
/setup_local.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """Setup hulc installation."""
4 |
5 | from os import path as op
6 | import re
7 |
8 | from setuptools import find_packages, setup
9 |
10 |
11 | def _read(f):
12 | return open(op.join(op.dirname(__file__), f)).read() if op.exists(f) else ""
13 |
14 |
15 | _meta = _read("hulc/__init__.py")
16 |
17 |
18 | def find_meta(_meta, string):
19 | l_match = re.search(r"^" + string + r'\s*=\s*"(.*)"', _meta, re.M)
20 | if l_match:
21 | return l_match.group(1)
22 | raise RuntimeError(f"Unable to find {string} string.")
23 |
24 |
25 | meta = dict(
26 | name=find_meta(_meta, "__project__"),
27 | version=find_meta(_meta, "__version__"),
28 | license=find_meta(_meta, "__license__"),
29 | description="Hierarchical Universal Language Conditioned Policies",
30 | platforms=("Any"),
31 | zip_safe=False,
32 | keywords="pytorch Lfp".split(),
33 | author=find_meta(_meta, "__author__"),
34 | author_email=find_meta(_meta, "__email__"),
35 |     url="https://github.com/mees/hulc",
36 | packages=find_packages(exclude=["tests"]),
37 | )
38 |
39 | if __name__ == "__main__":
40 | print("find_package", find_packages(exclude=["tests"]))
41 | setup(**meta)
42 |
--------------------------------------------------------------------------------
/slurm_scripts/README.md:
--------------------------------------------------------------------------------
1 | ## Training CALVIN on a Slurm Cluster
2 | ### Starting a training
3 | ```bash
4 | $ cd $HULC_ROOT/slurm_scripts
5 | $ python slurm_training.py --venv hulc_venv datamodule.root_data_dir=/path/to/dataset/
6 | ```
7 | This assumes that `--venv hulc_venv` specifies a conda environment.
8 | To use virtualenv instead, change line 18 of sbatch_lfp.sh accordingly.
9 |
10 | All hydra arguments can be used as in the normal training.
11 |
12 | Use the following optional command line arguments for slurm:
13 | - `--log_dir`: slurm log directory
14 | - `--job_name`: slurm job name
15 | - `--gpus`: number of gpus
16 | - `--mem`: memory
17 | - `--cpus`: number of cpus
18 | - `--days`: time limit in days
19 | - `--partition`: name of slurm partition
20 |
21 | The script will create a new folder in the specified log dir with a date tag and the job name.
22 | This is done *before* the job is submitted to the slurm queue.
23 | In order to ensure reproducibility, the current state of the hulc repository
24 | is copied to the log directory at *submit time* and is
25 | locally installed, such that you can schedule multiple trainings and there is no interference with
26 | future changes to the repository.
27 |
28 | ### Resuming a training
29 | Every job submission creates a `resume_training.sh` script in the log folder. To resume a training,
30 | call `$ sh <path_to_log_dir>/resume_training.sh`. By default, the model loads the latest saved checkpoint.
31 |
32 | ### Evaluating a model
33 | To evaluate a trained model via slurm, run `$ sh <path_to_log_dir>/evaluate.sh`, which will automatically place a job on the
34 | same partition as it was trained on. Note that this script is also autogenerated.
35 |
--------------------------------------------------------------------------------
/slurm_scripts/sbatch_eval.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Print some information about the job to STDOUT
3 | echo "Workingdir: $PWD";
4 | echo "Started at $(date)";
5 | echo "Running job $SLURM_JOB_NAME";
6 | echo "cpus per node: $SLURM_JOB_CPUS_PER_NODE";
7 | echo "gres: $SLURM_GRES";
8 | echo "mem: $SLURM_MEM_PER_NODE";
9 | echo "ntasks: $SLURM_NTASKS";
10 | echo "JID $SLURM_JOB_ID on queue $SLURM_JOB_PARTITION";
11 |
12 | export NCCL_DEBUG=INFO
13 | export PYTHONFAULTHANDLER=1
14 | export HYDRA_FULL_ERROR=1
15 |
16 | # Job to perform
17 | source ~/.bashrc
18 | conda activate $1
19 | srun python ${@:2}
20 |
21 | # Print some Information about the end-time to STDOUT
22 | echo "DONE";
23 | echo "Finished at $(date)";
24 |
--------------------------------------------------------------------------------
/slurm_scripts/sbatch_lfp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Print some information about the job to STDOUT
3 | echo "Workingdir: $PWD";
4 | echo "Started at $(date)";
5 | echo "Running job $SLURM_JOB_NAME";
6 | echo "cpus per node: $SLURM_JOB_CPUS_PER_NODE";
7 | echo "gres: $SLURM_GRES";
8 | echo "mem: $SLURM_MEM_PER_NODE";
9 | echo "ntasks: $SLURM_NTASKS";
10 | echo "JID $SLURM_JOB_ID on queue $SLURM_JOB_PARTITION";
11 |
12 | export NCCL_DEBUG=INFO
13 | export PYTHONFAULTHANDLER=1
14 | export HYDRA_FULL_ERROR=1
15 |
16 | # Job to perform
17 | source ~/.bashrc
18 | conda activate $1
19 | timeout 23.8h srun python $3 slurm=true hydra.run.dir=$4 trainer.devices=$5 ${@:6}
20 |
21 | if [[ $? -eq 124 ]]; then
22 | echo "Time limit exceeded. Resubmit job.";
23 |     ssh ${USER}@$2 <
--------------------------------------------------------------------------------
/slurm_scripts/slurm_eval.py:
--------------------------------------------------------------------------------
15 | 2 else np.inf
16 |
17 | checkpoints = get_all_checkpoints(training_dir)
18 | epochs = [str(e) for chk in checkpoints if (e := int(chk.stem.split("=")[1])) <= max_epoch]
19 | split_epochs = np.array_split(epochs, 8)
20 | epoch_args = [",".join(arr) for arr in split_epochs if len(arr)]
21 | for epoch_arg in epoch_args:
22 | cmd = [(training_dir / "evaluate.sh").as_posix(), "--checkpoints", epoch_arg, "--eval_log_dir", eval_log_dir]
23 | output = subprocess.check_output(cmd)
24 | print(output.decode("utf-8"))
25 |
26 |
27 | if __name__ == "__main__":
28 | main()
29 |
--------------------------------------------------------------------------------
/slurm_scripts/slurm_training.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import datetime
3 | import os
4 | from pathlib import Path
5 | import stat
6 | import subprocess
7 |
8 | from git import Repo
9 | import numpy as np
10 | from setuptools import sandbox
11 |
12 | default_log_dir = f"/home/{os.environ['USER']}/logs" if "USER" in os.environ else "/tmp"
13 | if default_log_dir == "/tmp":
14 | print("CAUTION: logging to /tmp")
15 | parser = argparse.ArgumentParser(description="Parse slurm parameters and hydra config overrides")
16 |
17 | parser.add_argument("--script", type=str, default="./sbatch_lfp.sh")
18 | parser.add_argument("--train_file", type=str, default="../hulc/training.py")
19 | parser.add_argument("-l", "--log_dir", type=str, default=default_log_dir)
20 | parser.add_argument("-j", "--job_name", type=str, default="play_training")
21 | parser.add_argument("-g", "--gpus", type=int, default=1)
22 | parser.add_argument("--mem", type=int, default=0) # 0 means no memory limit
23 | parser.add_argument("--cpus", type=int, default=8)
24 | parser.add_argument("--days", type=int, default=1)
25 | parser.add_argument("-v", "--venv", type=str)
26 | parser.add_argument("-p", "--partition", type=str, default="alldlc_gpu-rtx2080")
27 | parser.add_argument("--login_node", type=str, default="kis3bat1")
28 | parser.add_argument("-x", "--exclude", type=str)
29 | parser.add_argument("--no_clone", action="store_true")
30 | args, unknownargs = parser.parse_known_args()
31 |
32 |
33 | assert np.all(["gpu" not in arg for arg in unknownargs])
34 | assert np.all(["hydra.run.dir" not in arg for arg in unknownargs])
35 | assert np.all(["log_dir" not in arg for arg in unknownargs])
36 | assert np.all(["hydra.sweep.dir" not in arg for arg in unknownargs])
37 |
38 | log_dir = Path(args.log_dir).absolute() / f'{datetime.datetime.now().strftime("%Y-%m-%d/%H-%M-%S")}_{args.job_name}'
39 | os.makedirs(log_dir)
40 | args.script = Path(args.script).absolute()
41 | args.train_file = Path(args.train_file).absolute()
42 |
43 |
44 | def create_git_copy(repo_src_dir, repo_target_dir):
45 | repo = Repo(repo_src_dir)
46 | repo.clone(repo_target_dir)
47 | orig_cwd = os.getcwd()
48 | os.chdir(repo_target_dir)
49 | os.environ["PYTHONPATH"] = os.getcwd() + ":" + os.environ.get("PYTHONPATH", "")
50 | sandbox.run_setup("setup_local.py", ["develop", "--install-dir", "."])
51 | os.chdir(orig_cwd)
52 |
53 |
54 | if not args.no_clone:
55 | repo_src_dir = Path(__file__).absolute().parents[1]
56 | repo_target_dir = log_dir / "hulc"
57 | create_git_copy(repo_src_dir, repo_target_dir)
58 |
59 | args.script = repo_target_dir / os.path.relpath(args.script, repo_src_dir)
60 | args.train_file = repo_target_dir / os.path.relpath(args.train_file, repo_src_dir)
61 |
62 | if args.partition == "test":
63 | args.partition = "testdlc_gpu-rtx2080"
64 |
65 | args.time = f"{args.days}-00:00"
66 | if args.partition == "testdlc_gpu-rtx2080":
67 | args.time = "01:00:00"
68 |
69 | job_opts = {
70 | "script": f"{args.script.as_posix()} {args.venv} {args.login_node} {args.train_file.as_posix()} {log_dir.as_posix()} {args.gpus} {' '.join(unknownargs)}",
71 | "partition": args.partition,
72 | "mem": args.mem,
73 | "ntasks-per-node": args.gpus,
74 | "cpus-per-task": args.cpus,
75 | "gres": f"gpu:{args.gpus}",
76 | "output": os.path.join(log_dir, "%x.%N.%j.out"),
77 | "error": os.path.join(log_dir, "%x.%N.%j.err"),
78 | "job-name": args.job_name,
79 | "mail-type": "END,FAIL",
80 | "time": args.time,
81 | }
82 |
83 | if args.exclude is not None:
84 | job_opts["exclude"] = ",".join(map(lambda x: f"dlcgpu{int(x):02d}", args.exclude.split(",")))
85 |
86 |
87 | def submit_job(job_info):
88 | # Construct sbatch command
89 | slurm_cmd = ["sbatch"]
90 | for key, value in job_info.items():
91 | # Check for special case keys
92 | if key == "script":
93 | continue
94 | slurm_cmd.append(f"--{key}={value}")
95 | slurm_cmd.append(job_info["script"])
96 | print("Generated slurm batch command: '%s'" % slurm_cmd)
97 |
98 | # Run sbatch command as subprocess.
99 | try:
100 | sbatch_output = subprocess.check_output(slurm_cmd)
101 | create_resume_script(slurm_cmd)
102 | except subprocess.CalledProcessError as e:
103 | # Print error message from sbatch for easier debugging, then pass on exception
104 |         if e.output is not None:
105 |             print("ERROR: Subprocess call output: %s" % e.output)
106 | raise e
107 |
108 | print(sbatch_output.decode("utf-8"))
109 |
110 |
111 | def create_resume_script(slurm_cmd):
112 | file_path = os.path.join(log_dir, "resume_training.sh")
113 | with open(file_path, "w") as file:
114 | file.write("#!/bin/bash\n")
115 | file.write(" ".join(slurm_cmd))
116 | st = os.stat(file_path)
117 | os.chmod(file_path, st.st_mode | stat.S_IEXEC)
118 |
119 |
120 | def create_eval_script():
121 | # Construct sbatch command
122 | eval_log_dir = log_dir / "evaluation"
123 | os.makedirs(eval_log_dir, exist_ok=True)
124 | eval_sbatch_script = Path("./sbatch_eval.sh").absolute()
125 | eval_file = args.train_file.parent / "evaluation/evaluate_policy.py"
126 |
127 | dataset_path = next(filter(lambda x: x.split("=")[0] == "datamodule.root_data_dir", unknownargs)).split("=")[1]
128 |
129 | eval_cmd = ["sbatch"]
130 | eval_job_opts = {
131 | "partition": args.partition,
132 | "mem": args.mem,
133 | "ntasks-per-node": 1,
134 | "cpus-per-task": 8,
135 | "gres": "gpu:1",
136 | "output": os.path.join(eval_log_dir, "%x.%N.%j.out"),
137 | "error": os.path.join(eval_log_dir, "%x.%N.%j.err"),
138 | "job-name": f"{args.job_name}_eval",
139 | "mail-type": "END,FAIL",
140 | "time": "1-00:00",
141 | }
142 | for key, value in eval_job_opts.items():
143 | eval_cmd.append(f"--{key}={value}")
144 | eval_args = f"{eval_sbatch_script.as_posix()} {args.venv} {eval_file.as_posix()}"
145 | eval_args += f" --dataset_path {dataset_path}"
146 | eval_args += f" --train_folder {log_dir}"
147 | eval_args += " ${@:1}"
148 | eval_cmd.append(eval_args)
149 |
150 | file_path = os.path.join(log_dir, "evaluate.sh")
151 | with open(file_path, "w") as file:
152 | file.write("#!/bin/bash\n")
153 | file.write(" ".join(eval_cmd))
154 | st = os.stat(file_path)
155 | os.chmod(file_path, st.st_mode | stat.S_IEXEC)
156 |
157 |
158 | submit_job(job_opts)
159 | create_eval_script()
160 |
--------------------------------------------------------------------------------