├── .flake8 ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── checkpoints └── download_model_weights.sh ├── conf ├── __init__.py ├── annotations │ ├── new_playtable.yaml │ └── new_playtable_validation.yaml ├── callbacks │ ├── checkpoint │ │ ├── all.yaml │ │ ├── clip_loss.yaml │ │ ├── kl.yaml │ │ ├── lh_sr.yaml │ │ ├── state_recon.yaml │ │ ├── task_sr.yaml │ │ └── val_action.yaml │ ├── default.yaml │ ├── kl_schedule │ │ ├── constant.yaml │ │ ├── linear.yaml │ │ └── sigmoid.yaml │ ├── rollout │ │ ├── default.yaml │ │ └── tasks │ │ │ └── new_playtable_tasks.yaml │ ├── rollout_lh │ │ └── default.yaml │ ├── shm_signal │ │ └── default.yaml │ └── tsne_plot │ │ └── default.yaml ├── config.yaml ├── datamodule │ ├── datasets │ │ ├── lang_dataset │ │ │ ├── lang.yaml │ │ │ └── lang_shm.yaml │ │ ├── lang_only.yaml │ │ ├── vision_dataset │ │ │ ├── vision.yaml │ │ │ └── vision_shm.yaml │ │ ├── vision_lang.yaml │ │ ├── vision_lang_shm.yaml │ │ └── vision_only.yaml │ ├── default.yaml │ ├── mcil.yaml │ ├── observation_space │ │ ├── all_mods_abs_act.yaml │ │ ├── lang_rgb_static_abs_act.yaml │ │ ├── lang_rgb_static_gripper_abs_act.yaml │ │ ├── lang_rgb_static_gripper_rel_act.yaml │ │ ├── lang_rgb_static_rel_act.yaml │ │ ├── lang_rgb_static_robot_scene_abs_act.yaml │ │ ├── lang_rgb_static_tactile_abs_act.yaml │ │ ├── lang_rgbd_both_abs_act.yaml │ │ ├── lang_rgbd_both_rel_act.yaml │ │ ├── lang_rgbd_static_gripper_rel_act.yaml │ │ ├── lang_rgbd_static_robot_abs_act.yaml │ │ ├── rgb_static_abs_act.yaml │ │ ├── rgb_static_robot_scene_abs_act.yaml │ │ └── state_only.yaml │ ├── proprioception_dims │ │ ├── none.yaml │ │ ├── robot_full.yaml │ │ ├── robot_no_joints.yaml │ │ ├── robot_no_joints_no_gripper_width.yaml │ │ └── robot_scene.yaml │ └── transforms │ │ ├── clip.yaml │ │ ├── play_basic.yaml │ │ └── rand_shift.yaml ├── inference │ └── config_inference.yaml ├── lang_ann.yaml ├── logger │ ├── tb_logger.yaml │ └── wandb.yaml ├── loss │ └── default.yaml ├── model │ ├── action_decoder │ │ ├── deterministic.yaml │ │ ├── hulc_default.yaml │ │ └── mcil_default.yaml │ ├── bc_z_lang_decoder │ │ ├── default.yaml │ │ └── none.yaml │ ├── clip_lang.yaml │ ├── distribution │ │ ├── continuous.yaml │ │ └── discrete.yaml │ ├── gcbc.yaml │ ├── hulc.yaml │ ├── language_encoder │ │ ├── default.yaml │ │ └── none.yaml │ ├── language_goal │ │ ├── default.yaml │ │ └── none.yaml │ ├── lr_scheduler │ │ ├── constant.yaml │ │ ├── cosine_schedule_with_warmup.yaml │ │ └── linear_schedule_with_warmup.yaml │ ├── mcil.yaml │ ├── mia_lang_discriminator │ │ ├── default.yaml │ │ └── none.yaml │ ├── optimizer │ │ ├── adam.yaml │ │ ├── adamw.yaml │ │ └── sgd.yaml │ ├── perceptual_encoder │ │ ├── default.yaml │ │ ├── depth_gripper │ │ │ ├── default.yaml │ │ │ └── none.yaml │ │ ├── depth_static │ │ │ ├── default.yaml │ │ │ └── none.yaml │ │ ├── gripper_cam.yaml │ │ ├── proprio │ │ │ ├── identity.yaml │ │ │ └── none.yaml │ │ ├── rgb_gripper │ │ │ ├── default.yaml │ │ │ └── none.yaml │ │ ├── rgb_static │ │ │ ├── clip.yaml │ │ │ └── default.yaml │ │ ├── state_decoder │ │ │ ├── default.yaml │ │ │ └── none.yaml │ │ └── tactile │ │ │ ├── default.yaml │ │ │ └── none.yaml │ ├── plan_proposal │ │ └── default.yaml │ ├── plan_recognition │ │ ├── birnn.yaml │ │ └── transformers.yaml │ ├── proj_vis_lang │ │ ├── default.yaml │ │ └── none.yaml │ ├── sbert.yaml │ └── visual_goal │ │ └── default.yaml ├── trainer │ └── play_trainer.yaml └── training │ └── default_training.yaml ├── dataset ├── README.md ├── download_data.sh └── 
download_lang_embeddings.sh ├── hulc ├── __init__.py ├── evaluation │ ├── __init__.py │ ├── create_plots.py │ ├── evaluate_policy.py │ ├── rollouts_interactive.py │ └── run_multiple.py ├── models │ ├── __init__.py │ ├── auxiliary_loss_networks │ │ ├── __init__.py │ │ ├── bc_z_lang_decoder.py │ │ ├── mia_lang_discriminator.py │ │ ├── proj_vis_lang.py │ │ └── state_decoder.py │ ├── decoders │ │ ├── __init__.py │ │ ├── action_decoder.py │ │ ├── deterministic_decoder.py │ │ ├── logistic_decoder_rnn.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── gripper_control.py │ │ │ ├── pytorch3d_transforms.py │ │ │ └── rnn.py │ ├── encoders │ │ ├── __init__.py │ │ ├── clip_lang_encoder.py │ │ ├── goal_encoders.py │ │ ├── lang_encoder.py │ │ └── language_network.py │ ├── gcbc.py │ ├── hulc.py │ ├── perceptual_encoders │ │ ├── __init__.py │ │ ├── clip.py │ │ ├── concat_encoders.py │ │ ├── proprio_encoder.py │ │ ├── tactile_encoder.py │ │ ├── vision_clip.py │ │ ├── vision_network.py │ │ └── vision_network_gripper.py │ └── plan_encoders │ │ ├── __init__.py │ │ ├── plan_proposal_net.py │ │ └── plan_recognition_net.py ├── training.py └── utils │ ├── __init__.py │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── clip_tokenizer.py │ ├── distributions.py │ ├── kl_callbacks.py │ ├── transforms.py │ └── utils.py ├── install.sh ├── media └── hulc_rollout.gif ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── setup.py ├── setup_local.py └── slurm_scripts ├── README.md ├── sbatch_eval.sh ├── sbatch_lfp.sh ├── slurm_eval.py └── slurm_training.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = .git 3 | # Default is 79 in PEP 8 4 | max-line-length = 120 5 | select = E,F,W,C 6 | ignore=W503, # line break before binary operator, need for black 7 | E203, # whitespace before ':'. Opposite convention enforced by black 8 | E731, # do not assign a lambda expression, use a def 9 | E722, 10 | F401, 11 | F841, 12 | E402, # module level import not at top of file 13 | E741, # ambiguous variable name 14 | E501, # line too long. Handled by black 15 | C406, # Unnecessary list literal - rewrite as a dict literal 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Pycharm 132 | .idea 133 | 134 | # log files 135 | runs 136 | 137 | checkpoints/HULC* 138 | 139 | dataset/calvin_debug_dataset/ 140 | dataset/task* 141 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "calvin_env"] 2 | path = calvin_env 3 | url = https://github.com/mees/calvin_env.git 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3.8 3 | repos: 4 | - repo: https://github.com/psf/black 5 | rev: 21.5b2 6 | hooks: 7 | - id: black 8 | language_version: python3.8 9 | 10 | - repo: https://gitlab.com/pycqa/flake8 11 | rev: 3.8.4 12 | hooks: 13 | - id: flake8 14 | additional_dependencies: [-e, "git+git://github.com/pycqa/pyflakes.git@c72d6cf#egg=pyflakes"] 15 | exclude: telegram_bot 16 | 17 | - repo: https://github.com/pycqa/isort 18 | rev: 5.7.0 19 | hooks: 20 | - id: isort 21 | 22 | - repo: https://github.com/pre-commit/mirrors-mypy 23 | rev: v0.812 24 | hooks: 25 | - id: mypy 26 | args: [--ignore-missing-imports, --warn-no-return, --warn-redundant-casts, --disallow-incomplete-defs] 27 | additional_dependencies: [pytorch-lightning==1.5.5, torch==1.10.0, numpy] 28 | exclude: telegram_bot 29 | 30 | - repo: https://github.com/pre-commit/pre-commit-hooks 31 | rev: v4.0.1 32 | hooks: 33 | - id: check-yaml 34 | - id: trailing-whitespace 35 | - id: end-of-file-fixer 36 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Oier Mees 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HULC 2 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 3 | [![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/mees/hulc.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/mees/hulc/context:python) 4 | [![Total alerts](https://img.shields.io/lgtm/alerts/g/mees/hulc.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/mees/hulc/alerts/) 5 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 6 | 7 | [What Matters in Language Conditioned Robotic Imitation Learning Over Unstructured Data](https://arxiv.org/pdf/2204.06252.pdf) 8 | 9 | [Oier Mees](https://www.oiermees.com/), [Lukas Hermann](https://lukashermann.github.io/), [Wolfram Burgard](http://www2.informatik.uni-freiburg.de/~burgard) 10 | 11 | We present **HULC** (**H**ierarchical **U**niversal **L**anguage **C**onditioned Policies), an end-to-end model that can 12 | learn a wide variety of language conditioned robot skills from offline free-form imitation datasets. HULC sets a new state of the art on the challenging CALVIN benchmark, 13 | learning a single 7-DoF policy that can perform long-horizon manipulation tasks in a 3D environment, directly from images, and specified only with natural language. 14 | This code accompanies the paper What Matters in Language Conditioned Robotic Imitation Learning Over Unstructured Data, which can be found [here](https://arxiv.org/pdf/2204.06252.pdf). 15 | We hope the code will be useful as a starting point for further research on language conditioned policy learning and will bring us closer to general-purpose robots that can relate human language to their perception and actions. 16 | 17 | ![](media/hulc_rollout.gif) 18 | ## Installation 19 | As a prerequisite, you need to have [calvin](https://github.com/mees/calvin) installed. 20 | This is needed because HULC builds upon calvin_agent and calvin_env.
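If you have not set up CALVIN yet, the sketch below reflects the setup described in the calvin README; treat it as a non-authoritative sketch and refer to that repository for the exact, up-to-date instructions.
```bash
# Sketch of the CALVIN setup (see https://github.com/mees/calvin for the authoritative steps).
git clone --recurse-submodules https://github.com/mees/calvin.git
export CALVIN_ROOT=$(pwd)/calvin
cd $CALVIN_ROOT
sh install.sh
```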
21 | 22 | Next, clone this repository locally: 23 | ```bash 24 | git clone https://github.com/mees/hulc.git 25 | export HULC_ROOT=$(pwd)/hulc 26 | 27 | ``` 28 | 29 | Install requirements: 30 | ```bash 31 | cd $HULC_ROOT 32 | conda create -n hulc_venv python=3.10 # or use virtualenv 33 | conda activate hulc_venv 34 | sh install.sh 35 | ``` 36 | We originally used Python 3.8, but Python 3.10 should also work. 37 | 38 | If you encounter problems installing pyhash, you might have to downgrade setuptools to a version below 58. 39 | 40 | ## Download 41 | ### CALVIN Dataset 42 | If you want to train on the [CALVIN](https://github.com/mees/calvin) dataset, choose a split with: 43 | ```bash 44 | cd $HULC_ROOT/dataset 45 | sh download_data.sh D | ABC | ABCD | debug 46 | ``` 47 | If you have previously downloaded the dataset in the calvin repo, you can just set the path to that folder via the command line when starting a training. 48 | If you want to get started without downloading the whole dataset, use the argument `debug` to download a small debug dataset (1.3 GB). 49 | ### Language Embeddings 50 | We provide precomputed embeddings for the different language models we evaluate in the paper. 51 | The script assumes the corresponding split has already been downloaded. 52 | ```bash 53 | cd $HULC_ROOT/dataset 54 | sh download_lang_embeddings.sh D | ABC | ABCD 55 | ``` 56 | 57 | ### Pre-trained Models 58 | We provide our final models for all three CALVIN splits. 59 | ```bash 60 | cd $HULC_ROOT/checkpoints 61 | sh download_model_weights.sh D | ABC | ABCD 62 | ``` 63 | For instructions on how to use the pretrained models, see the training and evaluation sections. 64 | 65 | ## Hardware Requirements 66 | 67 | We leverage [Pytorch Lightning's](https://www.pytorchlightning.ai/) DDP implementation to scale our training to 8x NVIDIA GPUs with **12GB** of memory each. 68 | Evaluating the models requires a single NVIDIA GPU with **8GB** of memory. As each GPU receives a batch of 64 sequences (32 language + 32 vision), the effective batch size is 512 for all our experiments. 69 | 70 | Trained with: 71 | - **GPU** - 8x NVIDIA RTX 2080Ti 72 | - **CPU** - AMD EPYC 7502 73 | - **RAM** - 512GB 74 | - **OS** - Ubuntu 20.04 75 | 76 | With this setup, one epoch takes around 1.5 hours and the whole training with 30 epochs can be completed in 45 hours (without the evaluation callbacks). 77 | 78 | ## Training 79 | To train our HULC model with the maximum number of available GPUs, run: 80 | ``` 81 | python hulc/training.py trainer.devices=-1 datamodule.root_data_dir=path/to/dataset datamodule/datasets=vision_lang_shm 82 | ``` 83 | The `vision_lang_shm` option loads the CALVIN dataset into shared memory at the beginning of training, 84 | speeding up data loading during training. 85 | The preparation of the shared memory cache will take some time 86 | (approx. 20 min on our SLURM cluster). \ 87 | If you want to use the original data loader (e.g. for debugging) just override the command with `datamodule/datasets=vision_lang`. \ 88 | For an additional speed up, you can disable the evaluation callbacks during training by adding `~callbacks/rollout` and `~callbacks/rollout_lh`. 89 | 90 | If you have access to a SLURM cluster, follow this [guide](https://github.com/mees/hulc/blob/main/slurm_scripts/README.md).
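For reference, a sketch combining the overrides described above (plain disk loader and disabled long-horizon rollout callback) could look like the following; the dataset path is a placeholder and `trainer.devices=-1` uses all visible GPUs:
```bash
# Hypothetical combination of the training overrides discussed above; adjust the dataset path.
python hulc/training.py \
    trainer.devices=-1 \
    datamodule.root_data_dir=/path/to/task_D_D \
    datamodule/datasets=vision_lang \
    ~callbacks/rollout_lh
```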
91 | 92 | You can use our [pre-trained models](#pre-trained-models) to initialize a training by running: 93 | ``` 94 | python hulc/training.py trainer.devices=-1 datamodule.root_data_dir=path/to/dataset hydra.run.dir=$HULC_ROOT/checkpoints/HULC_D_D 95 | ``` 96 | Note that this will log the training into the checkpoint folder. 97 | 98 | ### Ablations 99 | Multi-context imitation learning (MCIL; Lynch et al., 2019): 100 | ``` 101 | python hulc/training.py trainer.devices=-1 datamodule.root_data_dir=path/to/dataset datamodule/datasets=vision_lang_shm model=mcil 102 | datamodule=mcil 103 | ``` 104 | 105 | Goal-conditioned behavior cloning (GCBC; Lynch et al., 2019): 106 | ``` 107 | python hulc/training.py trainer.devices=-1 datamodule.root_data_dir=path/to/dataset datamodule/datasets=vision_lang_shm model=gcbc 108 | ~callbacks/tsne_plot 109 | ``` 110 | 111 | 112 | ## Evaluation 113 | See detailed inference instructions on the [CALVIN repo](https://github.com/mees/calvin#muscle-evaluation-the-calvin-challenge). 114 | ``` 115 | python hulc/evaluation/evaluate_policy.py --dataset_path <path/to/dataset> --train_folder <path/to/training/folder> 116 | ``` 117 | Set `--train_folder $HULC_ROOT/checkpoints/HULC_D_D` to evaluate our [pre-trained models](#pre-trained-models). 118 | 119 | Optional arguments: 120 | 121 | - `--checkpoint <path/to/checkpoint>`: by default, the evaluation loads the last checkpoint in the training log directory. 122 | You can instead specify the path to another checkpoint by adding this argument to the evaluation command. 123 | - `--debug`: print debug information and visualize the environment. 124 | 125 | ## Changelog 126 | 127 | ### 16 Sep 2022 128 | - **MAJOR BUG IN ABC and ABCD dataset:** If you downloaded these datasets before this date, you have to apply the following fixes: 129 | - Wrong language annotations in the ABC and ABCD datasets. You can download the corrected language embeddings [here](https://github.com/mees/calvin/blob/main/dataset/README.md#language-embeddings). 130 | - Bug in `calvin_env` that only affects the generation of language embeddings. 131 | - Wrong `scene_info.npy` in the ABC and ABCD datasets. Please replace it as follows: 132 | ``` 133 | cd task_ABCD_D 134 | wget http://calvin.cs.uni-freiburg.de/scene_info_fix/task_ABCD_D_scene_info.zip 135 | unzip task_ABCD_D_scene_info.zip && rm task_ABCD_D_scene_info.zip 136 | ``` 137 | ``` 138 | cd task_ABC_D 139 | wget http://calvin.cs.uni-freiburg.de/scene_info_fix/task_ABC_D_scene_info.zip 140 | unzip task_ABC_D_scene_info.zip && rm task_ABC_D_scene_info.zip 141 | ``` 142 | 143 | ### 1 Sep 2022 144 | - Updated the language embeddings for the splits ABC and ABCD due to a bug in switching scenes during the automatic language labeling. Additionally, added various precomputed language embeddings.
145 | 146 | ## Acknowledgements 147 | 148 | This work uses code from the following open-source projects and datasets: 149 | 150 | #### CALVIN 151 | Original: [https://github.com/mees/calvin](https://github.com/mees/calvin) 152 | License: [MIT](https://github.com/mees/calvin/blob/main/LICENSE) 153 | 154 | #### Sentence-Transformers 155 | Original: [https://github.com/UKPLab/sentence-transformers](https://github.com/UKPLab/sentence-transformers) 156 | License: [Apache 2.0](https://github.com/UKPLab/sentence-transformers/blob/master/LICENSE) 157 | 158 | #### OpenAI CLIP 159 | Original: [https://github.com/openai/CLIP](https://github.com/openai/CLIP) 160 | License: [MIT](https://github.com/openai/CLIP/blob/main/LICENSE) 161 | ## Citations 162 | 163 | If you find the code useful, please cite: 164 | 165 | **HULC** 166 | ```bibtex 167 | @article{mees2022hulc, 168 | author={Oier Mees and Lukas Hermann and Wolfram Burgard}, 169 | title={What Matters in Language Conditioned Robotic Imitation Learning Over Unstructured Data}, 170 | journal={IEEE Robotics and Automation Letters (RA-L)}, 171 | volume={7}, 172 | number={4}, 173 | pages={11205-11212}, 174 | year={2022} 175 | } 176 | ``` 177 | **CALVIN** 178 | ```bibtex 179 | @article{mees2022calvin, 180 | author = {Oier Mees and Lukas Hermann and Erick Rosete-Beas and Wolfram Burgard}, 181 | title = {CALVIN: A Benchmark for Language-Conditioned Policy Learning for Long-Horizon Robot Manipulation Tasks}, 182 | journal={IEEE Robotics and Automation Letters (RA-L)}, 183 | volume={7}, 184 | number={3}, 185 | pages={7327-7334}, 186 | year={2022} 187 | } 188 | ``` 189 | 190 | ## License 191 | 192 | MIT License 193 | -------------------------------------------------------------------------------- /checkpoints/download_model_weights.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Download, Unzip, and Remove zip 3 | if [ "$1" = "D" ] 4 | then 5 | 6 | echo "Downloading HULC Checkpoint for task_D_D ..." 7 | wget http://hulc.cs.uni-freiburg.de/model_weights/HULC_D_D.zip 8 | unzip HULC_D_D.zip && rm HULC_D_D.zip 9 | echo "finished!" 10 | elif [ "$1" = "ABC" ] 11 | then 12 | 13 | echo "Downloading HULC Checkpoint for task_ABC_D ..." 14 | wget http://hulc.cs.uni-freiburg.de/model_weights/HULC_ABC_D.zip 15 | unzip HULC_ABC_D.zip && rm HULC_ABC_D.zip 16 | echo "finished!" 17 | 18 | elif [ "$1" = "ABCD" ] 19 | then 20 | 21 | echo "Downloading HULC Checkpoint for task_ABCD_D ..." 22 | wget http://hulc.cs.uni-freiburg.de/model_weights/HULC_ABCD_D.zip 23 | unzip HULC_ABCD_D.zip && rm HULC_ABCD_D.zip 24 | echo "finished!" 
25 | 26 | else 27 | echo "Failed: Usage download_model_weights.sh D | ABC | ABCD" 28 | exit 1 29 | fi 30 | -------------------------------------------------------------------------------- /conf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/__init__.py -------------------------------------------------------------------------------- /conf/annotations/new_playtable_validation.yaml: -------------------------------------------------------------------------------- 1 | # rotation 2 | rotate_red_block_right: ["take the red block and rotate it to the right"] 3 | rotate_red_block_left: ["take the red block and rotate it to the left"] 4 | rotate_blue_block_right: ["take the blue block and rotate it to the right"] 5 | rotate_blue_block_left: ["take the blue block and rotate it to the left"] 6 | rotate_pink_block_right: ["take the pink block and rotate it to the right"] 7 | rotate_pink_block_left: ["take the pink block and rotate it to the left"] 8 | 9 | # sliding 10 | push_red_block_right: ["go push the red block right"] 11 | push_red_block_left: ["go push the red block left"] 12 | push_blue_block_right: ["go push the blue block right"] 13 | push_blue_block_left: ["go push the blue block left"] 14 | push_pink_block_right: ["go push the pink block right"] 15 | push_pink_block_left: ["go push the pink block left"] 16 | 17 | # open/close 18 | move_slider_left: [ "push the sliding door to the left side"] 19 | move_slider_right: [ "push the sliding door to the right side"] 20 | open_drawer: ["pull the handle to open the drawer"] 21 | close_drawer: ["push the handle to close the drawer"] 22 | 23 | # lifting 24 | lift_red_block_table: ["grasp and lift the red block"] 25 | lift_blue_block_table: ["grasp and lift the blue block"] 26 | lift_pink_block_table: ["grasp and lift the pink block"] 27 | 28 | lift_red_block_slider: [ "lift the red block from the sliding cabinet"] 29 | lift_blue_block_slider: [ "lift the blue block from the sliding cabinet"] 30 | lift_pink_block_slider: [ "lift the pink block from the sliding cabinet"] 31 | 32 | lift_red_block_drawer: ["Take the red block from the drawer"] 33 | lift_blue_block_drawer: ["Take the blue block from the drawer"] 34 | lift_pink_block_drawer: ["Take the pink block from the drawer"] 35 | 36 | place_in_slider: [ "store the grasped block in the sliding cabinet"] 37 | place_in_drawer: [ "store the grasped block in the drawer"] 38 | 39 | push_into_drawer: ["slide the block that it falls into the drawer"] 40 | 41 | stack_block: ["stack the grasped block"] 42 | unstack_block: ["remove the stacked block"] 43 | 44 | turn_on_lightbulb: ["use the switch to turn on the light bulb"] 45 | turn_off_lightbulb: ["use the switch to turn off the light bulb"] 46 | turn_on_led: ["press the button to turn on the led light"] 47 | turn_off_led: ["press the button to turn off the led light"] 48 | -------------------------------------------------------------------------------- /conf/callbacks/checkpoint/all.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 2 | save_top_k: -1 3 | verbose: True 4 | dirpath: saved_models 5 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}' 6 | -------------------------------------------------------------------------------- /conf/callbacks/checkpoint/clip_loss.yaml: 
-------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 2 | save_top_k: 3 3 | verbose: True 4 | monitor: val/val_pred_clip_loss 5 | mode: min 6 | dirpath: saved_models 7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}' 8 | -------------------------------------------------------------------------------- /conf/callbacks/checkpoint/kl.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 2 | save_top_k: 3 3 | verbose: True 4 | monitor: train/kl_loss 5 | mode: max 6 | dirpath: saved_models 7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}' 8 | -------------------------------------------------------------------------------- /conf/callbacks/checkpoint/lh_sr.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 2 | save_top_k: 3 3 | verbose: True 4 | monitor: eval_lh/avg_seq_len 5 | mode: max 6 | dirpath: saved_models 7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}' 8 | every_n_epochs: ${callbacks.rollout_lh.rollout_freq} 9 | -------------------------------------------------------------------------------- /conf/callbacks/checkpoint/state_recon.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 2 | save_top_k: 3 3 | verbose: True 4 | monitor: val/state_recon_loss 5 | mode: min 6 | dirpath: saved_models 7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}' 8 | -------------------------------------------------------------------------------- /conf/callbacks/checkpoint/task_sr.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 2 | save_top_k: 3 3 | verbose: True 4 | monitor: tasks/average_sr 5 | mode: max 6 | dirpath: saved_models 7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}' 8 | every_n_epochs: ${callbacks.rollout.rollout_freq} 9 | -------------------------------------------------------------------------------- /conf/callbacks/checkpoint/val_action.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 2 | save_top_k: -1 3 | verbose: True 4 | monitor: val_act/action_loss_pp 5 | mode: min 6 | dirpath: saved_models 7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}' 8 | -------------------------------------------------------------------------------- /conf/callbacks/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | # - rollout: default 3 | - rollout_lh: default 4 | - checkpoint: all 5 | - tsne_plot: default 6 | - kl_schedule: constant 7 | - shm_signal: default 8 | -------------------------------------------------------------------------------- /conf/callbacks/kl_schedule/constant.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.utils.kl_callbacks.KLConstantSchedule 2 | -------------------------------------------------------------------------------- /conf/callbacks/kl_schedule/linear.yaml: -------------------------------------------------------------------------------- 1 | _target_: 
hulc.utils.kl_callbacks.KLLinearSchedule 2 | start_epoch: 10 3 | end_epoch: 50 4 | max_kl_beta: ${loss.kl_beta} 5 | -------------------------------------------------------------------------------- /conf/callbacks/kl_schedule/sigmoid.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: hulc.utils.kl_callbacks.KLSigmoidSchedule 3 | start_epoch: 10 4 | end_epoch: 50 5 | max_kl_beta: ${loss.kl_beta} 6 | -------------------------------------------------------------------------------- /conf/callbacks/rollout/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - /callbacks/rollout/tasks@tasks: new_playtable_tasks 3 | - /annotations@val_annotations: new_playtable_validation 4 | _target_: calvin_agent.rollout.rollout.Rollout 5 | _recursive_: false 6 | env_cfg: 7 | _target_: calvin_agent.wrappers.calvin_env_wrapper.CalvinEnvWrapper 8 | skip_epochs: 1 9 | rollout_freq: 5 10 | video: true 11 | num_rollouts_per_task: 10 12 | check_percentage_of_batch: 1 # which percentage of sequences do we want to check for possible tasks 13 | ep_len: 120 14 | empty_cache: false 15 | log_video_to_file: false 16 | save_dir: ./videos 17 | add_goal_thumbnail: true 18 | min_window_size: ${datamodule.datasets.vision_dataset.min_window_size} 19 | max_window_size: ${datamodule.datasets.vision_dataset.max_window_size} 20 | id_selection_strategy: "select_longest" 21 | lang_folder: ${datamodule.datasets.lang_dataset.lang_folder} 22 | -------------------------------------------------------------------------------- /conf/callbacks/rollout/tasks/new_playtable_tasks.yaml: -------------------------------------------------------------------------------- 1 | _target_: calvin_env.envs.tasks.Tasks 2 | tasks: 3 | # rotation 4 | rotate_red_block_right: [rotate_object, 'block_red', -60] 5 | rotate_red_block_left: [rotate_object, 'block_red', 60] 6 | rotate_blue_block_right: [ rotate_object, 'block_blue', -60 ] 7 | rotate_blue_block_left: [ rotate_object, 'block_blue', 60 ] 8 | rotate_pink_block_right: [ rotate_object, 'block_pink', -60 ] 9 | rotate_pink_block_left: [ rotate_object, 'block_pink', 60 ] 10 | 11 | # pushing 12 | push_red_block_right: [ push_object, 'block_red', 0.1, 0] 13 | push_red_block_left: [ push_object, 'block_red', -0.1, 0] 14 | push_blue_block_right: [ push_object, 'block_blue', 0.1, 0] 15 | push_blue_block_left: [ push_object, 'block_blue', -0.1, 0] 16 | push_pink_block_right: [ push_object, 'block_pink', 0.1, 0] 17 | push_pink_block_left: [ push_object, 'block_pink', -0.1, 0] 18 | 19 | # open/close 20 | move_slider_left: [move_door_rel, 'base__slide', 0.15] # 0 - 0.56 21 | move_slider_right: [move_door_rel, 'base__slide', -0.15] 22 | open_drawer: [move_door_rel, 'base__drawer', 0.12] # 0 - 0.24 23 | close_drawer: [move_door_rel, 'base__drawer', -0.12] 24 | 25 | # lifting 26 | lift_red_block_table: [lift_object, 'block_red', 0.05, 'table', 'base_link'] 27 | lift_red_block_slider: [lift_object, 'block_red', 0.03, 'table', 'plank_link'] 28 | lift_red_block_drawer: [lift_object, 'block_red', 0.05, 'table', 'drawer_link'] 29 | lift_blue_block_table: [ lift_object, 'block_blue', 0.05, 'table', 'base_link' ] 30 | lift_blue_block_slider: [ lift_object, 'block_blue', 0.03, 'table', 'plank_link' ] 31 | lift_blue_block_drawer: [ lift_object, 'block_blue', 0.05, 'table', 'drawer_link' ] 32 | lift_pink_block_table: [ lift_object, 'block_pink', 0.05, 'table', 'base_link' ] 33 | lift_pink_block_slider: 
[ lift_object, 'block_pink', 0.03, 'table', 'plank_link' ] 34 | lift_pink_block_drawer: [ lift_object, 'block_pink', 0.05, 'table', 'drawer_link' ] 35 | 36 | # placing 37 | place_in_slider: [place_object, 'table', 'plank_link'] 38 | place_in_drawer: [place_object, 'table', 'drawer_link'] 39 | 40 | # stacking 41 | stack_block: [stack_objects] 42 | unstack_block: [unstack_objects] 43 | 44 | # lights 45 | turn_on_lightbulb: [toggle_light, 'lightbulb', 0, 1] 46 | turn_off_lightbulb: [toggle_light, 'lightbulb', 1, 0] 47 | turn_on_led: [ toggle_light, 'led', 0, 1 ] 48 | turn_off_led: [ toggle_light, 'led', 1, 0 ] 49 | 50 | # pushing into drawer 51 | push_into_drawer: [push_object_into, ['block_red', 'block_blue', 'block_pink'], 'table', 'base_link', 'table', 'drawer_link'] 52 | 53 | # signatures of available base tasks: 54 | # rotate_object(obj_name, degrees, x_y_threshold=30, z_treshold=180): 55 | # push_object(obj_name, x_direction, y_direction): 56 | # lift_object(obj_name, z_direction, surface_body=None, surface_link=None): 57 | # place_object(dest_body, dest_link=None): 58 | # push_object_into(obj_name, src_body, dest_body): 59 | # move_door_abs(start_info, end_info, obj_name, joint_name, start_threshold, end_threshold): 60 | # move_door_rel(obj_name, joint_name, threshold): 61 | -------------------------------------------------------------------------------- /conf/callbacks/rollout_lh/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - /callbacks/rollout/tasks@tasks: new_playtable_tasks 3 | - /annotations@val_annotations: new_playtable_validation 4 | _target_: calvin_agent.rollout.rollout_long_horizon.RolloutLongHorizon 5 | _recursive_: false 6 | env_cfg: 7 | _target_: calvin_agent.wrappers.calvin_env_wrapper.CalvinEnvWrapper 8 | skip_epochs: 1 9 | rollout_freq: 1 10 | num_videos: 16 11 | num_sequences: 128 12 | replan_freq: 30 13 | ep_len: 360 14 | empty_cache: false 15 | log_video_to_file: false 16 | save_dir: ./videos 17 | lang_folder: ${datamodule.datasets.lang_dataset.lang_folder} 18 | debug: false 19 | -------------------------------------------------------------------------------- /conf/callbacks/shm_signal/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: calvin_agent.datasets.utils.shared_memory_utils.SignalCallback 2 | -------------------------------------------------------------------------------- /conf/callbacks/tsne_plot/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: calvin_agent.visualization.tsne_plot.TSNEPlot 2 | perplexity: 40 3 | n_jobs: 8 4 | plot_percentage: 0.2 5 | opacity: 0.3 6 | marker_size: 5 7 | -------------------------------------------------------------------------------- /conf/config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - callbacks: default 3 | - datamodule: default 4 | - model: hulc 5 | - loss: default 6 | - training: default_training 7 | - trainer: play_trainer 8 | - logger: wandb 9 | - override hydra/job_logging: colorlog 10 | - override hydra/hydra_logging: colorlog 11 | - _self_ 12 | 13 | seed: 42 14 | log_dir: ../ 15 | slurm: false 16 | 17 | hydra: 18 | run: 19 | dir: ${log_dir}/runs/${now:%Y-%m-%d}/${now:%H-%M-%S} 20 | sweep: 21 | dir: ${log_dir}/runs/${now:%Y-%m-%d}/${now:%H-%M-%S} 22 | subdir: ${hydra.job.override_dirname} 23 | job: 24 | config: 25 | override_dirname: 26 | exclude_keys: 27 | - log_dir 28 
| - datamodule.root_data_dir 29 | - trainer.gpus 30 | - model.tsne_plot 31 | - datamodule.num_workers 32 | - trainer.limit_train_batches 33 | - trainer.limit_val_batches 34 | - model.action_decoder.load_action_bounds 35 | -------------------------------------------------------------------------------- /conf/datamodule/datasets/lang_dataset/lang.yaml: -------------------------------------------------------------------------------- 1 | _target_: calvin_agent.datasets.disk_dataset.DiskDataset 2 | key: "lang" 3 | save_format: "npz" 4 | batch_size: 32 5 | min_window_size: 20 6 | max_window_size: 32 7 | proprio_state: ${datamodule.proprioception_dims} 8 | obs_space: ${datamodule.observation_space} 9 | skip_frames: 1 10 | pad: true 11 | lang_folder: "lang_paraphrase-MiniLM-L3-v2" 12 | aux_lang_loss_window: 8 13 | num_workers: 2 14 | -------------------------------------------------------------------------------- /conf/datamodule/datasets/lang_dataset/lang_shm.yaml: -------------------------------------------------------------------------------- 1 | _target_: calvin_agent.datasets.shm_dataset.ShmDataset 2 | key: "lang" 3 | batch_size: 32 4 | min_window_size: 20 5 | max_window_size: 32 6 | proprio_state: ${datamodule.proprioception_dims} 7 | obs_space: ${datamodule.observation_space} 8 | pad: true 9 | lang_folder: "lang_paraphrase-MiniLM-L3-v2" 10 | aux_lang_loss_window: 8 11 | num_workers: 2 12 | -------------------------------------------------------------------------------- /conf/datamodule/datasets/lang_only.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - lang_dataset: lang 3 | -------------------------------------------------------------------------------- /conf/datamodule/datasets/vision_dataset/vision.yaml: -------------------------------------------------------------------------------- 1 | _target_: calvin_agent.datasets.disk_dataset.DiskDataset 2 | key: "vis" 3 | save_format: "npz" 4 | batch_size: 32 5 | min_window_size: 20 6 | max_window_size: 32 7 | proprio_state: ${datamodule.proprioception_dims} 8 | obs_space: ${datamodule.observation_space} 9 | pad: true 10 | lang_folder: "lang_paraphrase-MiniLM-L3-v2" 11 | num_workers: 2 12 | -------------------------------------------------------------------------------- /conf/datamodule/datasets/vision_dataset/vision_shm.yaml: -------------------------------------------------------------------------------- 1 | _target_: calvin_agent.datasets.shm_dataset.ShmDataset 2 | key: "vis" 3 | batch_size: 32 4 | min_window_size: 20 5 | max_window_size: 32 6 | proprio_state: ${datamodule.proprioception_dims} 7 | obs_space: ${datamodule.observation_space} 8 | pad: true 9 | lang_folder: "lang_paraphrase-MiniLM-L3-v2" 10 | num_workers: 2 11 | -------------------------------------------------------------------------------- /conf/datamodule/datasets/vision_lang.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - vision_dataset: vision 3 | - lang_dataset: lang 4 | -------------------------------------------------------------------------------- /conf/datamodule/datasets/vision_lang_shm.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - vision_dataset: vision_shm 3 | - lang_dataset: lang_shm 4 | -------------------------------------------------------------------------------- /conf/datamodule/datasets/vision_only.yaml: -------------------------------------------------------------------------------- 1 | 
defaults: 2 | - vision_dataset: vision 3 | -------------------------------------------------------------------------------- /conf/datamodule/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - datasets: vision_lang_shm 3 | - transforms: rand_shift 4 | - proprioception_dims: robot_no_joints #robot_full 5 | - observation_space: lang_rgb_static_gripper_rel_act 6 | _target_: calvin_agent.datasets.calvin_data_module.CalvinDataModule 7 | _recursive_: false 8 | root_data_dir: ??? 9 | action_space: 7 10 | action_max: [1., 1., 1., 1., 1., 1., 1.,] 11 | action_min: [-1., -1., -1., -1., -1., -1., -1] 12 | shuffle_val: false 13 | -------------------------------------------------------------------------------- /conf/datamodule/mcil.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - datasets: vision_lang_shm 3 | - transforms: play_basic 4 | - proprioception_dims: robot_no_joints #robot_full 5 | - observation_space: lang_rgb_static_gripper_abs_act 6 | _target_: calvin_agent.datasets.calvin_data_module.CalvinDataModule 7 | _recursive_: false 8 | root_data_dir: ??? 9 | action_space: 7 10 | action_max: [1., 1., 1., 1., 1., 1., 1.,] 11 | action_min: [-1., -1., -1., -1., -1., -1., -1] 12 | shuffle_val: false 13 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/all_mods_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static', 'rgb_gripper', 'rgb_tactile'] 2 | depth_obs: ['depth_static', 'depth_gripper', 'depth_tactile'] 3 | state_obs: ['robot_obs', 'scene_obs'] 4 | actions: ['actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgb_static_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static'] 2 | depth_obs: [] 3 | state_obs: ['robot_obs'] 4 | actions: ['actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgb_static_gripper_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static', 'rgb_gripper'] 2 | depth_obs: [] 3 | state_obs: ['robot_obs'] 4 | actions: ['actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgb_static_gripper_rel_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static', 'rgb_gripper'] 2 | depth_obs: [] 3 | state_obs: ['robot_obs'] 4 | actions: ['rel_actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgb_static_rel_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static'] 2 | depth_obs: [] 3 | state_obs: ['robot_obs'] 4 | actions: ['rel_actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgb_static_robot_scene_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static'] 2 | depth_obs: [] 3 | 
state_obs: ['robot_obs', 'scene_obs'] 4 | actions: ['actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgb_static_tactile_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static', 'rgb_tactile'] 2 | depth_obs: [] 3 | state_obs: ['robot_obs'] 4 | actions: ['actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgbd_both_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static', 'rgb_gripper'] 2 | depth_obs: ['depth_static', 'depth_gripper'] 3 | state_obs: ['robot_obs'] 4 | actions: ['actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgbd_both_rel_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static', 'rgb_gripper'] 2 | depth_obs: ['depth_static', 'depth_gripper'] 3 | state_obs: ['robot_obs'] 4 | actions: ['rel_actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgbd_static_gripper_rel_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static', 'rgb_gripper'] 2 | depth_obs: ['depth_gripper'] 3 | state_obs: ['robot_obs'] 4 | actions: ['rel_actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgbd_static_robot_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static'] 2 | depth_obs: ['depth_static'] 3 | state_obs: ['robot_obs'] 4 | actions: ['actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/rgb_static_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static'] 2 | depth_obs: [] 3 | state_obs: ['robot_obs'] 4 | actions: ['actions'] 5 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/rgb_static_robot_scene_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static'] 2 | depth_obs: [] 3 | state_obs: ['robot_obs', 'scene_obs'] 4 | actions: ['actions'] 5 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/state_only.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: [] 2 | depth_obs: [] 3 | state_obs: ['robot_obs'] 4 | actions: ['actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/proprioception_dims/none.yaml: -------------------------------------------------------------------------------- 1 | n_state_obs: 0 2 | keep_indices: [[0, 0]] 3 | robot_orientation_idx: [3, 6] 4 | normalize: False 5 | normalize_robot_orientation: False 6 | -------------------------------------------------------------------------------- 
/conf/datamodule/proprioception_dims/robot_full.yaml: -------------------------------------------------------------------------------- 1 | n_state_obs: 15 2 | keep_indices: [[0, 15]] 3 | robot_orientation_idx: [3, 6] 4 | normalize: True 5 | normalize_robot_orientation: True 6 | -------------------------------------------------------------------------------- /conf/datamodule/proprioception_dims/robot_no_joints.yaml: -------------------------------------------------------------------------------- 1 | n_state_obs: 8 2 | keep_indices: [[0, 7], [14,15]] 3 | robot_orientation_idx: [3, 6] 4 | normalize: True 5 | normalize_robot_orientation: True 6 | -------------------------------------------------------------------------------- /conf/datamodule/proprioception_dims/robot_no_joints_no_gripper_width.yaml: -------------------------------------------------------------------------------- 1 | n_state_obs: 7 2 | keep_indices: [[0, 6], [14,15]] 3 | robot_orientation_idx: [3, 6] 4 | normalize: True 5 | normalize_robot_orientation: True 6 | -------------------------------------------------------------------------------- /conf/datamodule/proprioception_dims/robot_scene.yaml: -------------------------------------------------------------------------------- 1 | n_state_obs: 54 2 | keep_indices: [[0, 54]] 3 | robot_orientation_idx: [3, 6] 4 | normalize: True 5 | normalize_robot_orientation: True 6 | -------------------------------------------------------------------------------- /conf/datamodule/transforms/clip.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | rgb_static: 3 | - _target_: torchvision.transforms.Resize 4 | size: 224 5 | - _target_: hulc.utils.transforms.RandomShiftsAug 6 | pad: 10 7 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 8 | - _target_: torchvision.transforms.Normalize 9 | mean: [0.48145466, 0.4578275, 0.40821073] 10 | std: [0.26862954, 0.26130258, 0.27577711] 11 | rgb_gripper: 12 | - _target_: torchvision.transforms.Resize 13 | size: 84 14 | - _target_: hulc.utils.transforms.RandomShiftsAug 15 | pad: 4 16 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 17 | - _target_: torchvision.transforms.Normalize 18 | mean: [0.48145466, 0.4578275, 0.40821073] 19 | std: [0.26862954, 0.26130258, 0.27577711] 20 | depth_static: 21 | - _target_: torchvision.transforms.Resize 22 | size: 200 23 | - _target_: calvin_agent.utils.transforms.AddDepthNoise 24 | shape: [1000.0] 25 | rate: [1000.0] 26 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 27 | mean: [0.0] 28 | std: [0.01] 29 | depth_gripper: 30 | - _target_: torchvision.transforms.Resize 31 | size: 84 32 | # - _target_: calvin.utils.transforms.AddDepthNoise 33 | # shape: [ 1000.0 ] 34 | # rate: [ 1000.0 ] 35 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 36 | mean: [ 0.0 ] 37 | std: [ 0.01 ] 38 | rgb_tactile: 39 | - _target_: torchvision.transforms.Resize 40 | size: 70 41 | - _target_: torchvision.transforms.RandomCrop 42 | size: 64 43 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 44 | - _target_: torchvision.transforms.Normalize 45 | mean: [0.5] 46 | std: [0.5] 47 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 48 | mean: [ 0.0 ] 49 | std: [ 0.01 ] 50 | depth_tactile: 51 | - _target_: torchvision.transforms.Resize 52 | size: 64 53 | - _target_: torchvision.transforms.Normalize 54 | mean: [0.1,] 55 | std: [0.2,] 56 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 57 | mean: [ 0.0 ] 58 | std: [ 0.01 ] 59 | 
robot_obs: 60 | - _target_: calvin_agent.utils.transforms.NormalizeVector 61 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 62 | mean: [ 0.0 ] 63 | std: [ 0.01 ] 64 | scene_obs: 65 | - _target_: calvin_agent.utils.transforms.NormalizeVector 66 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 67 | mean: [ 0.0 ] 68 | std: [ 0.01 ] 69 | language: 70 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 71 | mean: [ 0.0 ] 72 | std: [ 0.01 ] 73 | 74 | 75 | val: 76 | rgb_static: 77 | - _target_: torchvision.transforms.Resize 78 | size: 224 79 | - _target_: hulc.utils.transforms.ScaleImageTensor 80 | - _target_: torchvision.transforms.Normalize 81 | mean: [ 0.48145466, 0.4578275, 0.40821073 ] 82 | std: [ 0.26862954, 0.26130258, 0.27577711 ] 83 | rgb_gripper: 84 | - _target_: torchvision.transforms.Resize 85 | size: 84 86 | - _target_: hulc.utils.transforms.ScaleImageTensor 87 | - _target_: torchvision.transforms.Normalize 88 | mean: [ 0.48145466, 0.4578275, 0.40821073 ] 89 | std: [ 0.26862954, 0.26130258, 0.27577711 ] 90 | depth_static: 91 | - _target_: torchvision.transforms.Resize 92 | size: 200 93 | depth_gripper: 94 | - _target_: torchvision.transforms.Resize 95 | size: 84 96 | rgb_tactile: 97 | - _target_: torchvision.transforms.Resize 98 | size: 70 99 | - _target_: torchvision.transforms.RandomCrop 100 | size: 64 101 | - _target_: hulc.utils.transforms.ScaleImageTensor 102 | - _target_: torchvision.transforms.Normalize 103 | mean: [0.5] 104 | std: [0.5] 105 | depth_tactile: 106 | - _target_: torchvision.transforms.Resize 107 | size: 64 108 | - _target_: torchvision.transforms.Normalize 109 | mean: [0.1,] 110 | std: [0.2,] 111 | robot_obs: 112 | - _target_: hulc.utils.transforms.NormalizeVector 113 | scene_obs: 114 | - _target_: hulc.utils.transforms.NormalizeVector 115 | -------------------------------------------------------------------------------- /conf/datamodule/transforms/play_basic.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | rgb_static: 3 | - _target_: torchvision.transforms.Resize 4 | size: 200 5 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 6 | - _target_: torchvision.transforms.Normalize 7 | mean: [0.5,] 8 | std: [0.5,] 9 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 10 | # mean: [0.0] 11 | # std: [0.01] 12 | rgb_gripper: 13 | - _target_: torchvision.transforms.Resize 14 | size: 84 15 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 16 | - _target_: torchvision.transforms.Normalize 17 | mean: [0.5,] 18 | std: [0.5,] 19 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 20 | # mean: [0.0] 21 | # std: [0.01] 22 | depth_static: 23 | - _target_: torchvision.transforms.Resize 24 | size: 200 25 | - _target_: calvin_agent.utils.transforms.AddDepthNoise 26 | shape: [1000.0] 27 | rate: [1000.0] 28 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 29 | # mean: [0.0] 30 | # std: [0.01] 31 | depth_gripper: 32 | - _target_: torchvision.transforms.Resize 33 | size: 84 34 | # - _target_: calvin.utils.transforms.AddDepthNoise 35 | # shape: [ 1000.0 ] 36 | # rate: [ 1000.0 ] 37 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 38 | mean: [ 0.0 ] 39 | std: [ 0.01 ] 40 | rgb_tactile: 41 | - _target_: torchvision.transforms.Resize 42 | size: 70 43 | - _target_: torchvision.transforms.RandomCrop 44 | size: 64 45 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 46 | - _target_: torchvision.transforms.Normalize 47 | mean: 
[0.5] 48 | std: [0.5] 49 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 50 | # mean: [ 0.0 ] 51 | # std: [ 0.01 ] 52 | depth_tactile: 53 | - _target_: torchvision.transforms.Resize 54 | size: 64 55 | - _target_: torchvision.transforms.Normalize 56 | mean: [0.1,] 57 | std: [0.2,] 58 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 59 | # mean: [ 0.0 ] 60 | # std: [ 0.01 ] 61 | robot_obs: 62 | - _target_: calvin_agent.utils.transforms.NormalizeVector 63 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 64 | # mean: [ 0.0 ] 65 | # std: [ 0.01 ] 66 | scene_obs: 67 | - _target_: calvin_agent.utils.transforms.NormalizeVector 68 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 69 | # mean: [ 0.0 ] 70 | # std: [ 0.01 ] 71 | # language: 72 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 73 | # mean: [ 0.0 ] 74 | # std: [ 0.01 ] 75 | 76 | 77 | val: 78 | rgb_static: 79 | - _target_: torchvision.transforms.Resize 80 | size: 200 81 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 82 | - _target_: torchvision.transforms.Normalize 83 | mean: [0.5,] 84 | std: [0.5,] 85 | rgb_gripper: 86 | - _target_: torchvision.transforms.Resize 87 | size: 84 88 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 89 | - _target_: torchvision.transforms.Normalize 90 | mean: [0.5,] 91 | std: [0.5,] 92 | depth_static: 93 | - _target_: torchvision.transforms.Resize 94 | size: 200 95 | depth_gripper: 96 | - _target_: torchvision.transforms.Resize 97 | size: 84 98 | rgb_tactile: 99 | - _target_: torchvision.transforms.Resize 100 | size: 70 101 | - _target_: torchvision.transforms.RandomCrop 102 | size: 64 103 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 104 | - _target_: torchvision.transforms.Normalize 105 | mean: [0.5] 106 | std: [0.5] 107 | depth_tactile: 108 | - _target_: torchvision.transforms.Resize 109 | size: 64 110 | - _target_: torchvision.transforms.Normalize 111 | mean: [0.1,] 112 | std: [0.2,] 113 | robot_obs: 114 | - _target_: calvin_agent.utils.transforms.NormalizeVector 115 | scene_obs: 116 | - _target_: calvin_agent.utils.transforms.NormalizeVector 117 | -------------------------------------------------------------------------------- /conf/datamodule/transforms/rand_shift.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | rgb_static: 3 | - _target_: torchvision.transforms.Resize 4 | size: 200 5 | - _target_: hulc.utils.transforms.RandomShiftsAug 6 | pad: 10 7 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 8 | - _target_: torchvision.transforms.Normalize 9 | mean: [0.5,] 10 | std: [0.5,] 11 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 12 | # mean: [0.0] 13 | # std: [0.01] 14 | rgb_gripper: 15 | - _target_: torchvision.transforms.Resize 16 | size: 84 17 | - _target_: hulc.utils.transforms.RandomShiftsAug 18 | pad: 4 19 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 20 | - _target_: torchvision.transforms.Normalize 21 | mean: [0.5,] 22 | std: [0.5,] 23 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 24 | # mean: [0.0] 25 | # std: [0.01] 26 | depth_static: 27 | - _target_: torchvision.transforms.Resize 28 | size: 200 29 | - _target_: calvin_agent.utils.transforms.AddDepthNoise 30 | shape: [1000.0] 31 | rate: [1000.0] 32 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 33 | # mean: [0.0] 34 | # std: [0.01] 35 | depth_gripper: 36 | - _target_: torchvision.transforms.Resize 37 | size: 
84 38 | # - _target_: calvin.utils.transforms.AddDepthNoise 39 | # shape: [ 1000.0 ] 40 | # rate: [ 1000.0 ] 41 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 42 | mean: [ 0.0 ] 43 | std: [ 0.01 ] 44 | rgb_tactile: 45 | - _target_: torchvision.transforms.Resize 46 | size: 70 47 | - _target_: torchvision.transforms.RandomCrop 48 | size: 64 49 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 50 | - _target_: torchvision.transforms.Normalize 51 | mean: [0.5] 52 | std: [0.5] 53 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 54 | # mean: [ 0.0 ] 55 | # std: [ 0.01 ] 56 | depth_tactile: 57 | - _target_: torchvision.transforms.Resize 58 | size: 64 59 | - _target_: torchvision.transforms.Normalize 60 | mean: [0.1,] 61 | std: [0.2,] 62 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 63 | # mean: [ 0.0 ] 64 | # std: [ 0.01 ] 65 | robot_obs: 66 | - _target_: calvin_agent.utils.transforms.NormalizeVector 67 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 68 | # mean: [ 0.0 ] 69 | # std: [ 0.01 ] 70 | scene_obs: 71 | - _target_: calvin_agent.utils.transforms.NormalizeVector 72 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 73 | # mean: [ 0.0 ] 74 | # std: [ 0.01 ] 75 | # language: 76 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 77 | # mean: [ 0.0 ] 78 | # std: [ 0.01 ] 79 | 80 | 81 | val: 82 | rgb_static: 83 | - _target_: torchvision.transforms.Resize 84 | size: 200 85 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 86 | - _target_: torchvision.transforms.Normalize 87 | mean: [0.5,] 88 | std: [0.5,] 89 | rgb_gripper: 90 | - _target_: torchvision.transforms.Resize 91 | size: 84 92 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 93 | - _target_: torchvision.transforms.Normalize 94 | mean: [0.5,] 95 | std: [0.5,] 96 | depth_static: 97 | - _target_: torchvision.transforms.Resize 98 | size: 200 99 | depth_gripper: 100 | - _target_: torchvision.transforms.Resize 101 | size: 84 102 | rgb_tactile: 103 | - _target_: torchvision.transforms.Resize 104 | size: 70 105 | - _target_: torchvision.transforms.RandomCrop 106 | size: 64 107 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 108 | - _target_: torchvision.transforms.Normalize 109 | mean: [0.5] 110 | std: [0.5] 111 | depth_tactile: 112 | - _target_: torchvision.transforms.Resize 113 | size: 64 114 | - _target_: torchvision.transforms.Normalize 115 | mean: [0.1,] 116 | std: [0.2,] 117 | robot_obs: 118 | - _target_: calvin_agent.utils.transforms.NormalizeVector 119 | scene_obs: 120 | - _target_: calvin_agent.utils.transforms.NormalizeVector 121 | -------------------------------------------------------------------------------- /conf/inference/config_inference.yaml: -------------------------------------------------------------------------------- 1 | train_folder: ??? # config path to the config.yaml of the training folder (in .hydra) 2 | load_checkpoint: ??? 
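# Note: the two fields above are usually supplied as Hydra overrides on the command line
# (paths here are hypothetical, for illustration only), e.g.
#   python hulc/evaluation/rollouts_interactive.py train_folder=/path/to/training/logdir
# load_checkpoint is optional: if it is not set, the last checkpoint of the run is used.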
3 | seed: 42 4 | log_dir: /tmp 5 | visualize: True 6 | ep_len: 120 7 | replan_freq: 30 8 | processes: 1 9 | 10 | hydra: 11 | run: 12 | dir: ${log_dir}/inference_runs/${now:%Y-%m-%d}/${now:%H-%M-%S} 13 | 14 | defaults: 15 | - override hydra/job_logging: colorlog 16 | - override hydra/hydra_logging: colorlog 17 | -------------------------------------------------------------------------------- /conf/lang_ann.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - callbacks: default 3 | - datamodule: default 4 | - model: sbert 5 | - loss: default 6 | - training: default_training 7 | - trainer: play_trainer 8 | - logger: wandb 9 | - annotations@train_instructions: new_playtable 10 | - annotations@val_instructions: new_playtable_validation 11 | 12 | - override hydra/job_logging: colorlog 13 | - override hydra/hydra_logging: colorlog 14 | - override datamodule/observation_space: state_only 15 | seed: 42 16 | log_dir: ../ 17 | slurm: false 18 | eps: 0.01 19 | postprocessing: true 20 | lang_folder: "lang_annotations" 21 | with_text: false 22 | reannotate: false 23 | prior_steps_window: 16 24 | validation_scene: calvin_scene_D 25 | compute_tsne: false 26 | 27 | hydra: 28 | run: 29 | dir: ${log_dir}/runs/${now:%Y-%m-%d}/${now:%H-%M-%S}_${hydra.job.override_dirname} 30 | sweep: 31 | dir: ${log_dir}/runs/${now:%Y-%m-%d}/${now:%H-%M-%S} 32 | subdir: ${hydra.job.override_dirname} 33 | job: 34 | config: 35 | override_dirname: 36 | exclude_keys: 37 | - log_dir 38 | - datamodule.root_data_dir 39 | - trainer.gpus 40 | - model.tsne_plot 41 | - datamodule.num_workers 42 | - trainer.limit_train_batches 43 | - trainer.limit_val_batches 44 | - model.decoder.load_action_bounds 45 | -------------------------------------------------------------------------------- /conf/logger/tb_logger.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.loggers.TensorBoardLogger 2 | save_dir: . 3 | name: play_lmp 4 | version: "" 5 | -------------------------------------------------------------------------------- /conf/logger/wandb.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.loggers.WandbLogger 2 | save_dir: . 3 | name: play_lmp 4 | group: play_lmp 5 | log_model: false 6 | project: "multi_play" 7 | entity: "multimodal_control" 8 | id: ??? 9 | -------------------------------------------------------------------------------- /conf/loss/default.yaml: -------------------------------------------------------------------------------- 1 | kl_beta: 0.01 2 | state_recon_beta: 0.5 3 | kl_balancing_mix: 0.8 4 | bc_z_auxiliary_loss_beta: 1.0 5 | mia_auxiliary_loss_beta: 1.0 6 | clip_auxiliary_loss_beta: 3.0 7 | -------------------------------------------------------------------------------- /conf/model/action_decoder/deterministic.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.decoders.deterministic_decoder.DeterministicDecoder 2 | hidden_size: 2048 3 | out_features: ${datamodule.action_space} 4 | policy_rnn_dropout_p: 0.0 5 | perceptual_features: ?? 6 | latent_goal_features: ${model.visual_goal.latent_goal_features} 7 | plan_features: ??? 
8 | criterion: HuberLoss # MSELoss 9 | num_layers: 2 10 | rnn_model: rnn_decoder 11 | perceptual_emb_slice: [64, 128] 12 | gripper_control: true 13 | -------------------------------------------------------------------------------- /conf/model/action_decoder/hulc_default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.decoders.logistic_decoder_rnn.LogisticDecoderRNN 2 | n_mixtures: 10 3 | hidden_size: 2048 4 | out_features: ${datamodule.action_space} 5 | log_scale_min: -7.0 6 | act_max_bound: ${datamodule.action_max} 7 | act_min_bound: ${datamodule.action_min} 8 | dataset_dir: ${datamodule.root_data_dir} 9 | load_action_bounds: false 10 | num_classes: 10 11 | latent_goal_features: ${model.visual_goal.latent_goal_features} 12 | plan_features: ??? 13 | perceptual_features: ??? 14 | gripper_alpha: 1.0 15 | perceptual_emb_slice: [64, 128] 16 | policy_rnn_dropout_p: 0.0 17 | num_layers: 2 18 | rnn_model: rnn_decoder 19 | gripper_control: true 20 | discrete_gripper: true 21 | -------------------------------------------------------------------------------- /conf/model/action_decoder/mcil_default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.decoders.logistic_decoder_rnn.LogisticDecoderRNN 2 | n_mixtures: 10 3 | hidden_size: 2048 4 | out_features: ${datamodule.action_space} 5 | log_scale_min: -7.0 6 | act_max_bound: ${datamodule.action_max} 7 | act_min_bound: ${datamodule.action_min} 8 | dataset_dir: ${datamodule.root_data_dir} 9 | load_action_bounds: false 10 | num_classes: 256 11 | latent_goal_features: ${model.visual_goal.latent_goal_features} 12 | plan_features: ??? 13 | perceptual_features: ??? 14 | gripper_alpha: 1.0 15 | policy_rnn_dropout_p: 0.0 16 | num_layers: 2 17 | rnn_model: rnn_decoder 18 | gripper_control: false 19 | discrete_gripper: false 20 | -------------------------------------------------------------------------------- /conf/model/bc_z_lang_decoder/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.auxiliary_loss_networks.bc_z_lang_decoder.BCZLangDecoder 2 | in_features: ${model.plan_recognition.fc_hidden_size} 3 | lang_dim: ${model.language_goal.in_features} 4 | -------------------------------------------------------------------------------- /conf/model/bc_z_lang_decoder/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/bc_z_lang_decoder/none.yaml -------------------------------------------------------------------------------- /conf/model/clip_lang.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.encoders.clip_lang_encoder.LangClip 2 | freeze_backbone: true 3 | model_name: "RN50" # "RN101", "RN50x4", "RN50x16", "ViT-B/32", "ViT-B/16" 4 | -------------------------------------------------------------------------------- /conf/model/distribution/continuous.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.utils.distributions.Distribution 2 | dist: "continuous" 3 | plan_features: 256 4 | -------------------------------------------------------------------------------- /conf/model/distribution/discrete.yaml: -------------------------------------------------------------------------------- 1 | _target_: 
hulc.utils.distributions.Distribution 2 | dist: "discrete" 3 | category_size: 32 4 | class_size: 32 5 | -------------------------------------------------------------------------------- /conf/model/gcbc.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - perceptual_encoder: gripper_cam 3 | - plan_proposal: default 4 | - plan_recognition: transformers 5 | - distribution: discrete 6 | - visual_goal: default 7 | - language_goal: default 8 | - action_decoder: hulc_default 9 | - optimizer: adam 10 | - lr_scheduler: constant 11 | - bc_z_lang_decoder: none 12 | - mia_lang_discriminator: none 13 | - proj_vis_lang: default 14 | - /annotations@val_instructions: new_playtable_validation 15 | 16 | _target_: hulc.models.gcbc.GCBC 17 | _recursive_: false 18 | 19 | kl_beta: ${loss.kl_beta} 20 | kl_balancing_mix: ${loss.kl_balancing_mix} 21 | state_recons: false 22 | state_recon_beta: ${loss.state_recon_beta} 23 | use_bc_z_auxiliary_loss: false 24 | bc_z_auxiliary_loss_beta: ${loss.bc_z_auxiliary_loss_beta} 25 | use_mia_auxiliary_loss: false 26 | mia_auxiliary_loss_beta: ${loss.mia_auxiliary_loss_beta} 27 | replan_freq: 30 28 | use_clip_auxiliary_loss: true 29 | clip_auxiliary_loss_beta: ${loss.clip_auxiliary_loss_beta} 30 | -------------------------------------------------------------------------------- /conf/model/hulc.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - perceptual_encoder: gripper_cam 3 | - plan_proposal: default 4 | - plan_recognition: transformers 5 | - distribution: discrete 6 | - visual_goal: default 7 | - language_goal: default 8 | - action_decoder: hulc_default 9 | - optimizer: adam 10 | - lr_scheduler: constant 11 | - bc_z_lang_decoder: none 12 | - mia_lang_discriminator: none 13 | - proj_vis_lang: default 14 | - /annotations@val_instructions: new_playtable_validation 15 | 16 | _target_: hulc.models.hulc.Hulc 17 | _recursive_: false 18 | 19 | kl_beta: ${loss.kl_beta} 20 | kl_balancing_mix: ${loss.kl_balancing_mix} 21 | state_recons: false 22 | state_recon_beta: ${loss.state_recon_beta} 23 | use_bc_z_auxiliary_loss: false 24 | bc_z_auxiliary_loss_beta: ${loss.bc_z_auxiliary_loss_beta} 25 | use_mia_auxiliary_loss: false 26 | mia_auxiliary_loss_beta: ${loss.mia_auxiliary_loss_beta} 27 | replan_freq: 30 28 | use_clip_auxiliary_loss: true 29 | clip_auxiliary_loss_beta: ${loss.clip_auxiliary_loss_beta} 30 | -------------------------------------------------------------------------------- /conf/model/language_encoder/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.encoders.lang_encoder.LanguageEncoder 2 | language_features: 384 3 | hidden_size: 2048 4 | out_features: 256 5 | word_dropout_p: 0.0 6 | activation_function: ReLU #ELU 7 | -------------------------------------------------------------------------------- /conf/model/language_encoder/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/language_encoder/none.yaml -------------------------------------------------------------------------------- /conf/model/language_goal/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.encoders.goal_encoders.LanguageGoalEncoder 2 | in_features: 384 3 | hidden_size: 2048 4 | latent_goal_features: 32 5 | 
l2_normalize_goal_embeddings: False 6 | activation_function: ReLU #ELU 7 | word_dropout_p: 0.0 8 | -------------------------------------------------------------------------------- /conf/model/language_goal/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/language_goal/none.yaml -------------------------------------------------------------------------------- /conf/model/lr_scheduler/constant.yaml: -------------------------------------------------------------------------------- 1 | _target_: transformers.get_constant_schedule 2 | -------------------------------------------------------------------------------- /conf/model/lr_scheduler/cosine_schedule_with_warmup.yaml: -------------------------------------------------------------------------------- 1 | _target_: transformers.get_cosine_schedule_with_warmup 2 | num_training_steps: -1 # -1 specifies to infer number of training steps 3 | num_warmup_steps: 0.1 # float values determines percentage of training steps to use as warmup 4 | num_cycles: 0.5 5 | -------------------------------------------------------------------------------- /conf/model/lr_scheduler/linear_schedule_with_warmup.yaml: -------------------------------------------------------------------------------- 1 | _target_: transformers.get_linear_schedule_with_warmup 2 | num_training_steps: -1 # -1 specifies to infer number of training steps 3 | num_warmup_steps: 0.1 # float values determines percentage of training steps to use as warmup 4 | -------------------------------------------------------------------------------- /conf/model/mcil.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - perceptual_encoder: gripper_cam 3 | - plan_proposal: default 4 | - plan_recognition: birnn 5 | - distribution: continuous 6 | - visual_goal: default 7 | - language_goal: default 8 | - action_decoder: mcil_default 9 | - optimizer: adam 10 | - lr_scheduler: constant 11 | - bc_z_lang_decoder: none 12 | - mia_lang_discriminator: none 13 | - proj_vis_lang: none 14 | - /annotations@val_instructions: new_playtable_validation 15 | 16 | _target_: hulc.models.hulc.Hulc 17 | _recursive_: false 18 | 19 | kl_beta: ${loss.kl_beta} 20 | kl_balancing_mix: ${loss.kl_balancing_mix} 21 | state_recons: false 22 | state_recon_beta: ${loss.state_recon_beta} 23 | use_bc_z_auxiliary_loss: false 24 | bc_z_auxiliary_loss_beta: ${loss.bc_z_auxiliary_loss_beta} 25 | use_mia_auxiliary_loss: false 26 | mia_auxiliary_loss_beta: ${loss.mia_auxiliary_loss_beta} 27 | replan_freq: 30 28 | use_clip_auxiliary_loss: false 29 | clip_auxiliary_loss_beta: ${loss.clip_auxiliary_loss_beta} 30 | -------------------------------------------------------------------------------- /conf/model/mia_lang_discriminator/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.auxiliary_loss_networks.mia_lang_discriminator.MIALangDiscriminator 2 | in_features: ${model.proj_vis_lang.output_dim} 3 | lang_dim: ${model.proj_vis_lang.output_dim} 4 | dropout_p: 0.0 5 | -------------------------------------------------------------------------------- /conf/model/mia_lang_discriminator/none.yaml: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/mia_lang_discriminator/none.yaml -------------------------------------------------------------------------------- /conf/model/optimizer/adam.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.Adam 2 | lr: ${training.lr} 3 | #weight_decay: 1e-6 4 | -------------------------------------------------------------------------------- /conf/model/optimizer/adamw.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.AdamW 2 | lr: ${training.lr} 3 | weight_decay: 1e-6 4 | #amsgrad: False 5 | -------------------------------------------------------------------------------- /conf/model/optimizer/sgd.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.SGD 2 | lr: ${training.lr} 3 | momentum: 0.9 4 | #weight_decay: 0.0005 5 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.perceptual_encoders.concat_encoders.ConcatEncoders 2 | _recursive_: false 3 | 4 | defaults: 5 | - rgb_static: default 6 | - rgb_gripper: none 7 | - depth_static: none 8 | - depth_gripper: none 9 | - proprio: none 10 | - tactile: none 11 | - state_decoder: none 12 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/depth_gripper/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.perceptual_encoders.vision_network_gripper.VisionNetwork 2 | input_width: 84 3 | input_height: 84 4 | activation_function: ReLU #ELU 5 | dropout_vis_fc: 0.0 6 | l2_normalize_output: false 7 | visual_features: 64 8 | conv_encoder: nature_cnn 9 | num_c: 1 10 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/depth_gripper/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/depth_gripper/none.yaml -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/depth_static/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.perceptual_encoders.vision_network.VisionNetwork 2 | input_width: 200 3 | input_height: 200 4 | activation_function: ReLU #ELU 5 | dropout_vis_fc: 0.0 6 | l2_normalize_output: false 7 | visual_features: 64 8 | num_c: 1 9 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/depth_static/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/depth_static/none.yaml -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/gripper_cam.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.perceptual_encoders.concat_encoders.ConcatEncoders 2 | _recursive_: false 3 | 4 | defaults: 5 | - 
rgb_static: default 6 | - rgb_gripper: default 7 | - depth_static: none 8 | - depth_gripper: none 9 | - proprio: none 10 | - tactile: none 11 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/proprio/identity.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.perceptual_encoders.proprio_encoder.IdentityEncoder 2 | proprioception_dims: ${datamodule.proprioception_dims} 3 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/proprio/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/proprio/none.yaml -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/rgb_gripper/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.perceptual_encoders.vision_network_gripper.VisionNetwork 2 | input_width: 84 3 | input_height: 84 4 | activation_function: ReLU #ELU 5 | dropout_vis_fc: 0.0 6 | l2_normalize_output: false 7 | visual_features: 64 8 | conv_encoder: nature_cnn 9 | num_c: 3 10 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/rgb_gripper/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/rgb_gripper/none.yaml -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/rgb_static/clip.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.perceptual_encoders.vision_clip.VisionClip 2 | visual_features: 64 3 | freeze_backbone: true 4 | model_name: "RN50" # "RN101", "RN50x4", "RN50x16", "ViT-B/32", "ViT-B/16" 5 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/rgb_static/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.perceptual_encoders.vision_network.VisionNetwork 2 | input_width: 200 3 | input_height: 200 4 | activation_function: ReLU #ELU 5 | dropout_vis_fc: 0.0 6 | l2_normalize_output: false 7 | visual_features: 64 8 | num_c: 3 9 | use_sinusoid: false 10 | spatial_softmax_temp: 1.0 11 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/state_decoder/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.auxiliary_loss_networks.state_decoder.StateDecoder 2 | visual_features: 64 3 | n_state_obs: 8 4 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/state_decoder/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/state_decoder/none.yaml -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/tactile/default.yaml: 
-------------------------------------------------------------------------------- 1 | _target_: calvin.models.perceptual_encoders.tactile_encoder.TactileEncoder 2 | visual_features: 64 3 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/tactile/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/tactile/none.yaml -------------------------------------------------------------------------------- /conf/model/plan_proposal/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.plan_encoders.plan_proposal_net.PlanProposalNetwork 2 | perceptual_features: ??? 3 | latent_goal_features: ${model.visual_goal.latent_goal_features} 4 | plan_features: ??? 5 | activation_function: ReLU #ELU 6 | hidden_size: 2048 7 | -------------------------------------------------------------------------------- /conf/model/plan_recognition/birnn.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.plan_encoders.plan_recognition_net.PlanRecognitionBiRNNNetwork 2 | in_features: ??? 3 | plan_features: 256 4 | action_space: ${datamodule.action_space} 5 | birnn_dropout_p: 0.0 6 | rnn_type: nn.RNN # nn.GRU 7 | -------------------------------------------------------------------------------- /conf/model/plan_recognition/transformers.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.plan_encoders.plan_recognition_net.PlanRecognitionTransformersNetwork 2 | num_heads: 8 3 | num_layers: 2 4 | encoder_hidden_size: 2048 5 | fc_hidden_size: 4096 6 | in_features: ?? 7 | plan_features: ??? 8 | action_space: ${datamodule.action_space} 9 | dropout_p: 0.1 10 | encoder_normalize: false 11 | positional_normalize: false 12 | position_embedding: true 13 | max_position_embeddings: ${datamodule.datasets.lang_dataset.max_window_size} 14 | -------------------------------------------------------------------------------- /conf/model/proj_vis_lang/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.auxiliary_loss_networks.proj_vis_lang.ProjVisLang 2 | im_dim: ${model.plan_recognition.fc_hidden_size} 3 | lang_dim: ${model.language_goal.latent_goal_features} 4 | output_dim: ${model.language_goal.latent_goal_features} 5 | proj_lang: true 6 | -------------------------------------------------------------------------------- /conf/model/proj_vis_lang/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/proj_vis_lang/none.yaml -------------------------------------------------------------------------------- /conf/model/sbert.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.encoders.language_network.SBert 2 | nlp_model: "all-MiniLM-L6-v2" 3 | -------------------------------------------------------------------------------- /conf/model/visual_goal/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.encoders.goal_encoders.VisualGoalEncoder 2 | in_features: ??? 
3 | hidden_size: 2048 4 | latent_goal_features: 32 5 | l2_normalize_goal_embeddings: False 6 | activation_function: ReLU #ELU 7 | -------------------------------------------------------------------------------- /conf/trainer/play_trainer.yaml: -------------------------------------------------------------------------------- 1 | devices: 1 2 | accelerator: gpu 3 | precision: 16 4 | val_check_interval: 1.0 5 | max_epochs: 100 6 | sync_batchnorm: false 7 | -------------------------------------------------------------------------------- /conf/training/default_training.yaml: -------------------------------------------------------------------------------- 1 | lr: 0.0002 2 | -------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | The CALVIN dataset comes with 6 hours of teleoperated play data in each of the 4 environments. 3 | You can use [this script](scripts/visualize_dataset.py) to visualize the dataset. 4 | 5 | ## Download 6 | 7 | We provide a download script to download the three different splits or a small debug dataset: 8 | 9 | **1.** [Split D->D](http://calvin.cs.uni-freiburg.de/dataset/task_D_D.zip) (166 GB): 10 | ```bash 11 | $ cd $CALVIN_ROOT/dataset 12 | $ sh download_data.sh D 13 | ``` 14 | **2.** [Split ABC->D](http://calvin.cs.uni-freiburg.de/dataset/task_ABC_D.zip) (517 GB) 15 | ```bash 16 | $ cd $CALVIN_ROOT/dataset 17 | $ sh download_data.sh ABC 18 | ``` 19 | **3.** [Split ABCD->D](http://calvin.cs.uni-freiburg.de/dataset/task_ABCD_D.zip) (656 GB) 20 | ```bash 21 | $ cd $CALVIN_ROOT/dataset 22 | $ sh download_data.sh ABCD 23 | ``` 24 | 25 | **4.** [Small debug dataset](http://calvin.cs.uni-freiburg.de/dataset/calvin_debug_dataset.zip) (1.3 GB) 26 | ```bash 27 | $ cd $CALVIN_ROOT/dataset 28 | $ sh download_data.sh debug 29 | ``` 30 | 31 | ## Language Embeddings 32 | Since Sep 16 2022, additional language embeddings are part of the dataset on the server. If you downloaded the dataset before, 33 | you can manually download the embeddings by running 34 | ``` 35 | cd $CALVIN_ROOT/dataset 36 | sh download_lang_embeddings.sh D | ABC | ABCD 37 | ``` 38 | Currently, the available embeddings are: 39 | - lang_all-distilroberta-v1 40 | - lang_all-MiniLM-L6-v2 41 | - lang_all-mpnet-base-v2 42 | - lang_BERT 43 | - lang_clip_resnet50 44 | - lang_clip_ViTB32 45 | - lang_huggingface_distilroberta 46 | - lang_huggingface_mpnet 47 | - lang_msmarco-bert-base-dot-v5 48 | - lang_paraphrase-MiniLM-L3-v2 49 | 50 | ## Data Structure 51 | Each interaction timestep is stored in a dictionary inside a numpy file and contains all corresponding sensory observations, different action spaces, state information and language annotations. 52 | ### Camera Observations 53 | The keys to access the different camera observations are: 54 | ``` 55 | ['rgb_static'] (dtype=np.uint8, shape=(200, 200, 3)), 56 | ['rgb_gripper'] (dtype=np.uint8, shape=(84, 84, 3)), 57 | ['rgb_tactile'] (dtype=np.uint8, shape=(160, 120, 6)), 58 | ['depth_static'] (dtype=np.float32, shape=(200, 200)), 59 | ['depth_gripper'] (dtype=np.float32, shape=(84, 84)), 60 | ['depth_tactile'] (dtype=np.float32, shape=(160, 120, 2)) 61 | ``` 62 | ### Actions 63 | Actions are in Cartesian space and define the desired tcp pose with respect to the world frame and the binary gripper action. 64 | The keys to access the 7-DOF absolute and relative actions are: 65 | (tcp = tool center point, i.e.
a virtual frame between the gripper finger tips of the robot) 66 | ``` 67 | ['actions'] 68 | (dtype=np.float32, shape=(7,)) 69 | tcp position (3): x,y,z in absolute world coordinates 70 | tcp orientation (3): euler angles x,y,z in absolute world coordinates 71 | gripper_action (1): binary (close = -1, open = 1) 72 | 73 | ['rel_actions'] 74 | (dtype=np.float32, shape=(7,)) 75 | tcp position (3): x,y,z in relative world coordinates normalized and clipped to (-1, 1) with scaling factor 50 76 | tcp orientation (3): euler angles x,y,z in relative world coordinates normalized and clipped to (-1, 1) with scaling factor 20 77 | gripper_action (1): binary (close = -1, open = 1) 78 | ``` 79 | For inference, Calvin env accepts both absolute and relative actions. To use absolute actions, the action is specified as a 3-tuple 80 | `action = ((x,y,z), (euler_x, euler_y, euler_z), (gripper))`. To use relative actions, the action is specified as a 81 | 7-tuple `action = (x,y,z, euler_x, euler_y, euler_z, gripper)`. IMPORTANT: the environment expects the relative actions 82 | to be scaled like the `rel_actions` in the dataset. 83 | 84 | ### State Observation 85 | The keys to access the scene state information containing the position and orientation of all objects in the scenes 86 | (we do not use them, in order to better capture challenges present in real-world settings): 87 | ``` 88 | ['scene_obs'] 89 | (dtype=np.float32, shape=(24,)) 90 | sliding door (1): joint state 91 | drawer (1): joint state 92 | button (1): joint state 93 | switch (1): joint state 94 | lightbulb (1): on=1, off=0 95 | green light (1): on=1, off=0 96 | red block (6): (x, y, z, euler_x, euler_y, euler_z) 97 | blue block (6): (x, y, z, euler_x, euler_y, euler_z) 98 | pink block (6): (x, y, z, euler_x, euler_y, euler_z) 99 | ``` 100 | The robot proprioceptive information, which also includes the joint positions, can be accessed with: 101 | ``` 102 | ['robot_obs'] 103 | (dtype=np.float32, shape=(15,)) 104 | tcp position (3): x,y,z in world coordinates 105 | tcp orientation (3): euler angles x,y,z in world coordinates 106 | gripper opening width (1): in meter 107 | arm_joint_states (7): in rad 108 | gripper_action (1): binary (close = -1, open = 1) 109 | ``` 110 | ### Language Annotations 111 | The language annotations are in a subdirectory of the train and validation folders called `lang_annotations`. 112 | The file `auto_lang_ann.npy` contains the language annotations and their embeddings, along with additional metadata such as the task id and the sequence indices. 113 | ``` 114 | ['language']['ann']: list of raw language 115 | ['language']['task']: list of task_id 116 | ['language']['emb']: precomputed miniLM language embedding 117 | ['info']['indx']: list of start and end indices corresponding to the precomputed language embeddings 118 | ``` 119 | The `embeddings.npy` file is only present in the validation folder; it contains the embeddings used during the rollouts (test inference) to condition the policy. A minimal loading sketch is shown below. 120 | 121 | ## Visualize Language Annotations 122 | We provide a script to generate a video that visualizes the language annotations of the recorded play data. 123 | By default we visualize the first 100 sequences, but feel free to visualize more sequences (just change this [line](https://github.com/mees/calvin/blob/main/calvin_models/calvin_agent/utils/visualize_annotations.py#L57)). 124 | An example video can be generated with the command below.
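Before running that command, here is a minimal sketch of loading the files described above. The dataset path and frame index are hypothetical; the keys follow this README, the episode file naming follows the `episode_{i:06d}.npz` pattern used elsewhere in this repo, and `auto_lang_ann.npy` is assumed to hold a single pickled dict (hence `allow_pickle` / `.item()`):
```python
import numpy as np

root = "/path/to/task_D_D/training"  # hypothetical dataset split folder
idx = 360948                         # hypothetical frame index

# every timestep is stored as a dict-like .npz file
episode = np.load(f"{root}/episode_{idx:06d}.npz")
print(episode["rgb_static"].shape)   # (200, 200, 3), uint8
print(episode["robot_obs"].shape)    # (15,), float32
print(episode["rel_actions"])        # 7-DOF relative action, last entry is the binary gripper action

# language annotations live in the lang_annotations subdirectory
ann = np.load(f"{root}/lang_annotations/auto_lang_ann.npy", allow_pickle=True).item()
print(ann["language"]["ann"][0])     # raw instruction text
print(ann["info"]["indx"][0])        # (start, end) frame indices for that instruction
```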
125 | ``` 126 | cd $CALVIN_ROOT/calvin_models/calvin_agent 127 | python utils/visualize_annotations.py datamodule.root_data_dir=$CALVIN_ROOT/dataset/task_D_D/ datamodule/observation_space=lang_rgb_static 128 | ``` 129 | -------------------------------------------------------------------------------- /dataset/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Download, Unzip, and Remove zip 4 | if [ "$1" = "D" ] 5 | then 6 | 7 | echo "Downloading task_D_D ..." 8 | wget http://calvin.cs.uni-freiburg.de/dataset/task_D_D.zip 9 | unzip task_D_D.zip && rm task_D_D.zip 10 | echo "saved folder: task_D_D" 11 | elif [ "$1" = "ABC" ] 12 | then 13 | 14 | echo "Downloading task_ABC_D ..." 15 | wget http://calvin.cs.uni-freiburg.de/dataset/task_ABC_D.zip 16 | unzip task_ABC_D.zip && rm task_ABC_D.zip 17 | echo "saved folder: task_ABC_D" 18 | 19 | elif [ "$1" = "ABCD" ] 20 | then 21 | 22 | echo "Downloading task_ABCD_D ..." 23 | wget http://calvin.cs.uni-freiburg.de/dataset/task_ABCD_D.zip 24 | unzip task_ABCD_D.zip && rm task_ABCD_D.zip 25 | echo "saved folder: task_ABCD_D" 26 | 27 | elif [ "$1" = "debug" ] 28 | then 29 | 30 | echo "Downloading debug dataset ..." 31 | wget http://calvin.cs.uni-freiburg.de/dataset/calvin_debug_dataset.zip 32 | unzip calvin_debug_dataset.zip && rm calvin_debug_dataset.zip 33 | echo "saved folder: calvin_debug_dataset" 34 | 35 | 36 | else 37 | echo "Failed: Usage download_data.sh D | ABC | ABCD | debug" 38 | exit 1 39 | fi 40 | -------------------------------------------------------------------------------- /dataset/download_lang_embeddings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Download, Unzip, and Remove zip 3 | if [ "$1" = "D" ] 4 | then 5 | 6 | echo "Downloading Language Embeddings for task_D_D ..." 7 | cd task_D_D 8 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/D_D_lang_embs_train.zip 9 | unzip D_D_lang_embs_train.zip && rm D_D_lang_embs_train.zip 10 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/D_D_lang_embs_val.zip 11 | unzip D_D_lang_embs_val.zip && rm D_D_lang_embs_val.zip 12 | echo "finished!" 13 | elif [ "$1" = "ABC" ] 14 | then 15 | 16 | echo "Downloading Language Embeddings for task_ABC_D ..." 17 | cd task_ABC_D 18 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/ABC_D_lang_embs_train.zip 19 | unzip ABC_D_lang_embs_train.zip && rm ABC_D_lang_embs_train.zip 20 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/ABC_D_lang_embs_val.zip 21 | unzip ABC_D_lang_embs_val.zip && rm ABC_D_lang_embs_val.zip 22 | echo "finished!" 23 | 24 | elif [ "$1" = "ABCD" ] 25 | then 26 | 27 | echo "Downloading Language Embeddings for task_ABCD_D ..." 28 | cd task_ABCD_D 29 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/ABCD_D_lang_embs_train.zip 30 | unzip ABCD_D_lang_embs_train.zip && rm ABCD_D_lang_embs_train.zip 31 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/ABCD_D_lang_embs_val.zip 32 | unzip ABCD_D_lang_embs_val.zip && rm ABCD_D_lang_embs_val.zip 33 | echo "finished!" 
34 | 35 | else 36 | echo "Failed: Usage download_lang_embeddings.sh D | ABC | ABCD" 37 | exit 1 38 | fi 39 | -------------------------------------------------------------------------------- /hulc/__init__.py: -------------------------------------------------------------------------------- 1 | """Hierarchical Universal Language Conditioned Policies implementation in pytorch 2 | :copyright: 2022 by Oier Mees 3 | :license: MIT, see LICENSE for more details. 4 | """ 5 | 6 | __version__ = "0.0.1" 7 | __project__ = "HULC" 8 | __author__ = "Oier Mees" 9 | __license__ = "MIT" 10 | __email__ = "meeso@informatik.uni-freiburg.de" 11 | -------------------------------------------------------------------------------- /hulc/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/evaluation/__init__.py -------------------------------------------------------------------------------- /hulc/evaluation/evaluate_policy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | import sys 5 | 6 | # This is for using the locally installed repo clone when using slurm 7 | from calvin_agent.evaluation.evaluate_policy import evaluate_policy 8 | 9 | sys.path.insert(0, Path(__file__).absolute().parents[2].as_posix()) 10 | from calvin_agent.evaluation.utils import get_default_model_and_env 11 | from calvin_agent.utils.utils import get_all_checkpoints, get_checkpoints_for_epochs, get_last_checkpoint 12 | from pytorch_lightning import seed_everything 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def get_epoch(checkpoint): 18 | if "=" not in checkpoint.stem: 19 | return "0" 20 | return checkpoint.stem.split("=")[1] 21 | 22 | 23 | def main(): 24 | seed_everything(0, workers=True) # type:ignore 25 | parser = argparse.ArgumentParser(description="Evaluate a trained model on multistep sequences with language goals.") 26 | parser.add_argument("--dataset_path", type=str, help="Path to the dataset root directory.") 27 | 28 | # arguments for loading default model 29 | parser.add_argument( 30 | "--train_folder", type=str, help="If calvin_agent was used to train, specify path to the log dir." 31 | ) 32 | parser.add_argument( 33 | "--checkpoints", 34 | type=str, 35 | default=None, 36 | help="Comma separated list of epochs for which checkpoints will be loaded", 37 | ) 38 | parser.add_argument( 39 | "--checkpoint", 40 | type=str, 41 | default=None, 42 | help="Path of the checkpoint", 43 | ) 44 | parser.add_argument( 45 | "--last_k_checkpoints", 46 | type=int, 47 | help="Specify the number of checkpoints you want to evaluate (starting from last).
Only used for calvin_agent.", 48 | ) 49 | 50 | parser.add_argument("--debug", action="store_true", help="Print debug info and visualize environment.") 51 | 52 | parser.add_argument("--eval_log_dir", default=None, type=str, help="Where to log the evaluation results.") 53 | 54 | parser.add_argument("--device", default=0, type=int, help="CUDA device") 55 | args = parser.parse_args() 56 | 57 | assert "train_folder" in args 58 | 59 | checkpoints = [] 60 | if args.checkpoints is None and args.last_k_checkpoints is None and args.checkpoint is None: 61 | print("Evaluating model with last checkpoint.") 62 | checkpoints = [get_last_checkpoint(Path(args.train_folder))] 63 | elif args.checkpoints is not None: 64 | print(f"Evaluating model with checkpoints {args.checkpoints}.") 65 | checkpoints = get_checkpoints_for_epochs(Path(args.train_folder), args.checkpoints) 66 | elif args.checkpoints is None and args.last_k_checkpoints is not None: 67 | print(f"Evaluating model with last {args.last_k_checkpoints} checkpoints.") 68 | checkpoints = get_all_checkpoints(Path(args.train_folder))[-args.last_k_checkpoints :] 69 | elif args.checkpoint is not None: 70 | checkpoints = [Path(args.checkpoint)] 71 | 72 | env = None 73 | for checkpoint in checkpoints: 74 | epoch = get_epoch(checkpoint) 75 | model, env, _ = get_default_model_and_env( 76 | args.train_folder, 77 | args.dataset_path, 78 | checkpoint, 79 | env=env, 80 | device_id=args.device, 81 | ) 82 | evaluate_policy(model, env, epoch, eval_log_dir=args.eval_log_dir, debug=args.debug, create_plan_tsne=True) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /hulc/evaluation/rollouts_interactive.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | from calvin_agent.evaluation.utils import imshow_tensor 5 | from calvin_agent.utils.utils import get_last_checkpoint 6 | import cv2 7 | import hydra 8 | import numpy as np 9 | from omegaconf import DictConfig, OmegaConf 10 | from omegaconf.errors import MissingMandatoryValue 11 | from pytorch_lightning import seed_everything 12 | import torch 13 | 14 | from hulc.models.hulc import Hulc 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def get_checkpoint(cfg): 20 | try: 21 | checkpoint = cfg.load_checkpoint 22 | except MissingMandatoryValue: 23 | checkpoint = get_last_checkpoint(Path(cfg.train_folder)) 24 | return checkpoint 25 | 26 | 27 | def format_sftp_path(cfg): 28 | """ 29 | When using network mount from nautilus, format path 30 | """ 31 | if cfg.train_folder.startswith("sftp"): 32 | cfg.train_folder = "/run/user/9984/gvfs/sftp:host=" + cfg.train_folder[7:] 33 | 34 | 35 | @hydra.main(config_path="../../conf/inference", config_name="config_inference") 36 | def test_policy(input_cfg: DictConfig) -> None: 37 | """ 38 | Run inference on trained policy. 39 | Arguments: 40 | train_folder (str): path of trained model. 41 | load_checkpoint (str): optional model checkpoint. If not specified, the last checkpoint is taken by default. 42 | +datamodule.root_data_dir (str): /path/dataset when running inference on another machine than were it was trained 43 | visualize (bool): wether to visualize the policy rollouts (default True). 
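        Example (hypothetical paths, for illustration only):
            python hulc/evaluation/rollouts_interactive.py train_folder=/path/to/training/logdir +datamodule.root_data_dir=/path/to/calvin/dataset/task_D_D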
44 | """ 45 | # when mounting remote folder with sftp, format path 46 | format_sftp_path(input_cfg) 47 | # load config used during training 48 | train_cfg_path = Path(input_cfg.train_folder) / ".hydra/config.yaml" 49 | train_cfg = OmegaConf.load(train_cfg_path) 50 | 51 | # merge configs to keep current cmd line overrides 52 | cfg = OmegaConf.merge(train_cfg, input_cfg) 53 | seed_everything(cfg.seed) 54 | 55 | # since we don't use the trainer during inference, manually set up data_module 56 | data_module = hydra.utils.instantiate(cfg.datamodule, num_workers=4) 57 | data_module.prepare_data() 58 | data_module.setup() 59 | dataloader = data_module.val_dataloader() 60 | dataset = dataloader.dataset.datasets["vis"] 61 | env = hydra.utils.instantiate(cfg.callbacks.rollout.env_cfg, dataset, torch.device("cuda:0"), show_gui=False) 62 | 63 | tasks = hydra.utils.instantiate(cfg.callbacks.rollout.tasks) 64 | checkpoint = get_checkpoint(cfg) 65 | logger.info("Loading model from checkpoint.") 66 | model = Hulc.load_from_checkpoint(checkpoint) 67 | model.freeze() 68 | # model.action_decoder._setup_action_bounds(cfg.datamodule.root_data_dir, None, None) 69 | model = model.cuda(0) 70 | logger.info("Successfully loaded model.") 71 | 72 | ep_start_end_ids = np.sort(np.load(dataset.abs_datasets_dir / "ep_start_end_ids.npy"), axis=0) 73 | 74 | for s, e in ep_start_end_ids: 75 | i = start_i = s 76 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz" 77 | data = np.load(file) 78 | obs = env.reset(scene_obs=data["scene_obs"], robot_obs=data["robot_obs"]) 79 | start_info = env.get_info() 80 | current_img_obs = start_img_obs = obs["rgb_obs"] 81 | start_state_obs = obs["state_obs"] 82 | goal_imgs = obs["rgb_obs"] 83 | goal_state = obs["state_obs"] 84 | scene_obs = data["scene_obs"] 85 | robot_obs = data["robot_obs"] 86 | while 1: 87 | imshow_tensor("current_img", current_img_obs[0], wait=1) 88 | imshow_tensor("start", start_img_obs[0], wait=1) 89 | imshow_tensor("goal", goal_imgs[0], wait=1) 90 | cv2.imshow("keylistener", np.zeros((300, 300))) 91 | k = cv2.waitKey(0) % 256 92 | if k == ord("s"): 93 | start_info = env.get_info() 94 | start_img_obs = obs["rgb_obs"] 95 | start_state_obs = obs["state_obs"] 96 | scene_obs = data["scene_obs"] 97 | robot_obs = data["robot_obs"] 98 | start_i = i 99 | elif k == ord("w"): 100 | end_info = env.get_info() 101 | print(tasks.get_task_info(start_info, end_info)) 102 | goal_imgs = obs["rgb_obs"] 103 | goal_state = obs["state_obs"] 104 | print(f"steps: {i - start_i}") 105 | elif k == ord("r"): 106 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz" 107 | data = np.load(file) 108 | obs = env.reset(scene_obs=data["scene_obs"]) 109 | current_img_obs = obs["rgb_obs"] 110 | elif k == ord("a"): 111 | i -= 1 112 | i = np.clip(i, s, e) 113 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz" 114 | data = np.load(file) 115 | obs = env.reset(scene_obs=data["scene_obs"], robot_obs=data["robot_obs"]) 116 | current_img_obs = obs["rgb_obs"] 117 | 118 | elif k == ord("d"): 119 | i += 1 120 | i = np.clip(i, s, e) 121 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz" 122 | data = np.load(file) 123 | obs = env.reset(scene_obs=data["scene_obs"], robot_obs=data["robot_obs"]) 124 | current_img_obs = obs["rgb_obs"] 125 | elif k == ord("q"): 126 | i -= 100 127 | i = np.clip(i, s, e) 128 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz" 129 | data = np.load(file) 130 | obs = env.reset(scene_obs=data["scene_obs"], robot_obs=data["robot_obs"]) 131 | current_img_obs = 
obs["rgb_obs"] 132 | 133 | elif k == ord("e"): 134 | i += 100 135 | i = np.clip(i, s, e) 136 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz" 137 | data = np.load(file) 138 | obs = env.reset(scene_obs=data["scene_obs"], robot_obs=data["robot_obs"]) 139 | current_img_obs = obs["rgb_obs"] 140 | 141 | elif k == ord("f"): 142 | env.reset(scene_obs=scene_obs, robot_obs=robot_obs) 143 | rollout(model, env, tasks, cfg, start_info, start_img_obs, start_state_obs, goal_imgs, goal_state) 144 | obs = env.reset(scene_obs=scene_obs, robot_obs=robot_obs) 145 | current_img_obs = obs["rgb_obs"] 146 | i = start_i 147 | elif k == ord("n"): # ESC 148 | break 149 | 150 | 151 | def rollout(model, env, tasks, cfg, start_info, current_img_obs, current_state_obs, goal_imgs, goal_state): 152 | # goal image is last step of the episode 153 | # goal_imgs = [goal_img.unsqueeze(0).cuda() for goal_img in goal_imgs] 154 | goal_imgs = goal_imgs[0].contiguous() 155 | for step in range(cfg.ep_len): 156 | # replan every replan_freq steps (default 30 i.e every second) 157 | if step % cfg.replan_freq == 0: 158 | plan, latent_goal = model.get_pp_plan_vision( 159 | current_img_obs, goal_imgs, current_state_obs, goal_state 160 | ) # type: ignore 161 | imshow_tensor("current_img", current_img_obs[0], wait=1) 162 | 163 | # use plan to predict actions with current observations 164 | action = model.predict_with_plan(current_img_obs, current_state_obs, latent_goal, plan) 165 | obs, _, _, current_info = env.step(action) 166 | # check if current step solves a task 167 | current_task_info = tasks.get_task_info(start_info, current_info) 168 | if len(current_task_info) > 0: 169 | print(current_task_info) 170 | # update current observation 171 | current_img_obs = obs["rgb_obs"] 172 | current_state_obs = obs["state_obs"] 173 | 174 | 175 | if __name__ == "__main__": 176 | test_policy() 177 | -------------------------------------------------------------------------------- /hulc/evaluation/run_multiple.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import multiprocessing 3 | import os 4 | from pathlib import Path 5 | import subprocess 6 | 7 | from calvin_agent.utils.utils import get_all_checkpoints 8 | import numpy as np 9 | 10 | 11 | def get_log_dir(log_dir): 12 | log_dir = Path(log_dir) 13 | os.makedirs(log_dir, exist_ok=True) 14 | return log_dir 15 | 16 | 17 | def intervals(parts, duration): 18 | part_duration = duration / parts 19 | return [str(int(i * part_duration)) + "-" + str(int(((i + 1) * part_duration) - 1)) for i in range(parts)] 20 | 21 | 22 | def main(): 23 | """ 24 | This script calls the evaluate.sh script of the specified training_dir 8 times with different checkpoints 25 | """ 26 | parser = argparse.ArgumentParser(description="Evaluate a trained model on multistep sequences with language goals.") 27 | parser.add_argument("--dataset_path", type=str, help="Path to the dataset root directory.") 28 | 29 | parser.add_argument( 30 | "--train_folder", type=str, help="If calvin_agent was used to train, specify path to the log dir." 31 | ) 32 | parser.add_argument("--max_epoch", type=int, default=30, help="Evaluate until which epoch.") 33 | parser.add_argument( 34 | "--eval_log_dir", type=str, help="If calvin_agent was used to train, specify path to the log dir." 
35 | ) 36 | 37 | args = parser.parse_args() 38 | eval_log_dir = get_log_dir(args.eval_log_dir) 39 | 40 | eval_script = (Path(__file__).parent / "evaluate_policy.py").as_posix() 41 | training_dir = Path(args.train_folder) 42 | checkpoints = get_all_checkpoints(training_dir) 43 | epochs = [str(e) for chk in checkpoints if (e := int(chk.stem.split("=")[1])) <= args.max_epoch] 44 | split_epochs = np.array_split(epochs, 8) 45 | epoch_args = [",".join(arr) for arr in split_epochs] 46 | max_cpu_count = multiprocessing.cpu_count() 47 | local_cpus = intervals(8, max_cpu_count) 48 | for i, epoch_arg in enumerate(epoch_args): 49 | cmd = [ 50 | "taskset", 51 | "--cpu-list", 52 | local_cpus[i], 53 | "python", 54 | eval_script, 55 | "--checkpoints", 56 | epoch_arg, 57 | "--dataset_path", 58 | args.dataset_path, 59 | "--train_folder", 60 | args.train_folder, 61 | "--eval_log_dir", 62 | args.eval_log_dir, 63 | "--device", 64 | str(i), 65 | ] 66 | std_out = eval_log_dir / f"stdout_{i}.out" 67 | std_err = eval_log_dir / f"stderr_{i}.err" 68 | with open(std_out, "wb") as out, open(std_err, "wb") as err: 69 | subprocess.Popen(cmd, stdout=out, stderr=err, preexec_fn=os.setpgrp) 70 | 71 | 72 | if __name__ == "__main__": 73 | main() 74 | -------------------------------------------------------------------------------- /hulc/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/__init__.py -------------------------------------------------------------------------------- /hulc/models/auxiliary_loss_networks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/auxiliary_loss_networks/__init__.py -------------------------------------------------------------------------------- /hulc/models/auxiliary_loss_networks/bc_z_lang_decoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class BCZLangDecoder(nn.Module): 6 | def __init__(self, in_features: int, lang_dim: int): 7 | super().__init__() 8 | # include proprio info??? 
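        # With the default configs shown above (conf/model/bc_z_lang_decoder/default.yaml),
        # in_features resolves to the plan-recognition fc_hidden_size (4096) and lang_dim to the
        # language-goal input size (384), so the MLP below regresses the sentence embedding from
        # the visual plan features; the exact sizes depend on the chosen configuration.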
9 | self.mlp = nn.Sequential( 10 | nn.Linear(in_features=in_features, out_features=512), 11 | nn.ReLU(), 12 | nn.Linear(in_features=512, out_features=lang_dim), 13 | ) 14 | 15 | def forward(self, x: torch.Tensor) -> torch.Tensor: 16 | x = self.mlp(x) 17 | return x 18 | -------------------------------------------------------------------------------- /hulc/models/auxiliary_loss_networks/mia_lang_discriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class MIALangDiscriminator(nn.Module): 6 | def __init__(self, in_features: int, lang_dim: int, dropout_p: float): 7 | super().__init__() 8 | self.mlp = nn.Sequential( 9 | nn.Linear(in_features=in_features + lang_dim, out_features=512), 10 | nn.ReLU(), 11 | nn.Dropout(dropout_p), 12 | nn.Linear(in_features=512, out_features=1), 13 | ) 14 | 15 | def forward(self, vis_emb: torch.Tensor, lang_emb: torch.Tensor) -> torch.Tensor: 16 | x = torch.cat([vis_emb, lang_emb], dim=-1) 17 | x = self.mlp(x) 18 | return x 19 | -------------------------------------------------------------------------------- /hulc/models/auxiliary_loss_networks/proj_vis_lang.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class ProjVisLang(nn.Module): 8 | def __init__(self, im_dim: int, lang_dim: int, output_dim: int, proj_lang: bool = True): 9 | super().__init__() 10 | self.mlp_im = nn.Sequential( 11 | nn.Linear(in_features=im_dim, out_features=128), 12 | nn.ReLU(), 13 | nn.Linear(in_features=128, out_features=output_dim), 14 | ) 15 | self.mlp_lang = None 16 | if proj_lang: 17 | self.mlp_lang = nn.Sequential( 18 | nn.Linear(in_features=lang_dim, out_features=128), 19 | nn.ReLU(), 20 | nn.Linear(in_features=128, out_features=output_dim), 21 | ) 22 | 23 | def forward(self, vis_emb: torch.Tensor, lang_emb: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 24 | vis_emb = self.mlp_im(vis_emb) 25 | if self.mlp_lang is not None: 26 | lang_emb = self.mlp_lang(lang_emb) 27 | return vis_emb, lang_emb 28 | -------------------------------------------------------------------------------- /hulc/models/auxiliary_loss_networks/state_decoder.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class StateDecoder(nn.Module): 8 | def __init__(self, visual_features: int, n_state_obs: int): 9 | super().__init__() 10 | self.mlp = nn.Sequential( 11 | nn.Linear(in_features=visual_features, out_features=40), 12 | nn.ReLU(), 13 | nn.Linear(in_features=40, out_features=40), 14 | nn.ReLU(), 15 | nn.Linear(in_features=40, out_features=n_state_obs), 16 | ) 17 | 18 | def forward(self, x: torch.Tensor) -> torch.Tensor: 19 | x = self.mlp(x) 20 | return x 21 | -------------------------------------------------------------------------------- /hulc/models/decoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/decoders/__init__.py -------------------------------------------------------------------------------- /hulc/models/decoders/action_decoder.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | 
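# Shared abstract interface for the action decoders in this package: the concrete
# DeterministicDecoder and LogisticDecoderRNN below implement act / loss / loss_and_act,
# and may keep an RNN hidden state that is reset via clear_hidden_state.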
class ActionDecoder(nn.Module): 8 | def act( 9 | self, 10 | latent_plan: torch.Tensor, 11 | perceptual_emb: torch.Tensor, 12 | latent_goal: torch.Tensor, 13 | robot_obs: Optional[torch.Tensor] = None, 14 | ) -> torch.Tensor: 15 | raise NotImplementedError 16 | 17 | def loss( 18 | self, 19 | latent_plan: torch.Tensor, 20 | perceptual_emb: torch.Tensor, 21 | latent_goal: torch.Tensor, 22 | actions: torch.Tensor, 23 | robot_obs: Optional[torch.Tensor] = None, 24 | ) -> torch.Tensor: 25 | raise NotImplementedError 26 | 27 | def loss_and_act( 28 | self, 29 | latent_plan: torch.Tensor, 30 | perceptual_emb: torch.Tensor, 31 | latent_goal: torch.Tensor, 32 | actions: torch.Tensor, 33 | robot_obs: Optional[torch.Tensor] = None, 34 | ) -> Tuple[torch.Tensor, torch.Tensor]: 35 | raise NotImplementedError 36 | 37 | def _sample(self, *args, **kwargs): 38 | raise NotImplementedError 39 | 40 | def forward( 41 | self, latent_plan: torch.Tensor, perceptual_emb: torch.Tensor, latent_goal: torch.Tensor 42 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 43 | raise NotImplementedError 44 | 45 | def clear_hidden_state(self) -> None: 46 | pass 47 | -------------------------------------------------------------------------------- /hulc/models/decoders/deterministic_decoder.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Optional, Tuple 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from hulc.models.decoders.action_decoder import ActionDecoder 8 | from hulc.models.decoders.utils.gripper_control import tcp_to_world_frame, world_to_tcp_frame 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class DeterministicDecoder(ActionDecoder): 14 | def __init__( 15 | self, 16 | perceptual_features: int, 17 | latent_goal_features: int, 18 | plan_features: int, 19 | hidden_size: int, 20 | out_features: int, 21 | policy_rnn_dropout_p: float, 22 | criterion: str, 23 | num_layers: int, 24 | rnn_model: str, 25 | perceptual_emb_slice: tuple, 26 | gripper_control: bool, 27 | ): 28 | super(DeterministicDecoder, self).__init__() 29 | self.plan_features = plan_features 30 | self.gripper_control = gripper_control 31 | self.out_features = out_features 32 | in_features = (perceptual_emb_slice[1] - perceptual_emb_slice[0]) + latent_goal_features + plan_features 33 | self.rnn = eval(rnn_model) 34 | self.rnn = self.rnn(in_features, hidden_size, num_layers, policy_rnn_dropout_p) 35 | self.actions = nn.Sequential(nn.Linear(hidden_size, out_features), nn.Tanh()) 36 | self.criterion = getattr(nn, criterion)() 37 | self.perceptual_emb_slice = perceptual_emb_slice 38 | self.hidden_state = None 39 | 40 | def clear_hidden_state(self) -> None: 41 | self.hidden_state = None 42 | 43 | def forward( # type: ignore 44 | self, 45 | latent_plan: torch.Tensor, 46 | perceptual_emb: torch.Tensor, 47 | latent_goal: torch.Tensor, 48 | h_0: Optional[torch.Tensor] = None, 49 | ) -> Tuple[torch.Tensor, torch.Tensor]: 50 | perceptual_emb = perceptual_emb[..., slice(*self.perceptual_emb_slice)] 51 | batch_size, seq_len = perceptual_emb.shape[0], perceptual_emb.shape[1] 52 | latent_plan = latent_plan.unsqueeze(1).expand(-1, seq_len, -1) if latent_plan.nelement() > 0 else latent_plan 53 | latent_goal = latent_goal.unsqueeze(1).expand(-1, seq_len, -1) 54 | x = torch.cat([latent_plan, perceptual_emb, latent_goal], dim=-1) # b, s, (plan + visuo-propio + goal) 55 | if not isinstance(self.rnn, nn.Sequential) and isinstance(self.rnn, nn.RNNBase): 56 | x, h_n = self.rnn(x, 
h_0) 57 | else: 58 | x = self.rnn(x) 59 | h_n = None 60 | actions = self.actions(x) 61 | return actions, h_n 62 | 63 | def loss_and_act( 64 | self, 65 | latent_plan: torch.Tensor, 66 | perceptual_emb: torch.Tensor, 67 | latent_goal: torch.Tensor, 68 | actions: torch.Tensor, 69 | robot_obs: Optional[torch.Tensor] = None, 70 | ) -> Tuple[torch.Tensor, torch.Tensor]: 71 | pred_actions, _ = self(latent_plan, perceptual_emb, latent_goal) 72 | # loss 73 | if self.gripper_control: 74 | actions_tcp = world_to_tcp_frame(actions, robot_obs) 75 | loss = self.criterion(pred_actions, actions_tcp) 76 | pred_actions_world = tcp_to_world_frame(pred_actions, robot_obs) 77 | return loss, pred_actions_world 78 | else: 79 | loss = self.criterion(pred_actions, actions) 80 | return loss, pred_actions 81 | 82 | def loss( 83 | self, 84 | latent_plan: torch.Tensor, 85 | perceptual_emb: torch.Tensor, 86 | latent_goal: torch.Tensor, 87 | actions: torch.Tensor, 88 | robot_obs: Optional[torch.Tensor] = None, 89 | ) -> torch.Tensor: 90 | pred_actions, _ = self(latent_plan, perceptual_emb, latent_goal) 91 | if self.gripper_control: 92 | actions_tcp = world_to_tcp_frame(actions, robot_obs) 93 | self.criterion(pred_actions, actions_tcp) 94 | return self.criterion(pred_actions, actions) 95 | 96 | def act( 97 | self, 98 | latent_plan: torch.Tensor, 99 | perceptual_emb: torch.Tensor, 100 | latent_goal: torch.Tensor, 101 | robot_obs: Optional[torch.Tensor] = None, 102 | ) -> torch.Tensor: 103 | pred_actions, self.hidden_state = self(latent_plan, perceptual_emb, latent_goal, self.hidden_state) 104 | if self.gripper_control: 105 | pred_actions_world = tcp_to_world_frame(pred_actions, robot_obs) 106 | return pred_actions_world 107 | else: 108 | return pred_actions 109 | -------------------------------------------------------------------------------- /hulc/models/decoders/logistic_decoder_rnn.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from typing import List, Optional, Tuple, Union 4 | 5 | import numpy as np 6 | from omegaconf import ListConfig, OmegaConf 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | import hulc 12 | from hulc.models.decoders.action_decoder import ActionDecoder 13 | from hulc.models.decoders.utils.gripper_control import tcp_to_world_frame, world_to_tcp_frame 14 | from hulc.models.decoders.utils.rnn import gru_decoder, lstm_decoder, mlp_decoder, rnn_decoder # needed for line 60 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def log_sum_exp(x): 20 | """numerically stable log_sum_exp implementation that prevents overflow""" 21 | axis = len(x.size()) - 1 22 | m, _ = torch.max(x, dim=axis) 23 | m2, _ = torch.max(x, dim=axis, keepdim=True) 24 | return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis)) 25 | 26 | 27 | class LogisticDecoderRNN(ActionDecoder): 28 | def __init__( 29 | self, 30 | perceptual_features: int, 31 | latent_goal_features: int, 32 | plan_features: int, 33 | n_mixtures: int, 34 | hidden_size: int, 35 | out_features: int, 36 | log_scale_min: float, 37 | act_max_bound: Union[List[float], ListConfig], 38 | act_min_bound: Union[List[float], ListConfig], 39 | dataset_dir: str, 40 | load_action_bounds: bool, 41 | num_classes: int, 42 | gripper_alpha: float, 43 | policy_rnn_dropout_p: float, 44 | num_layers: int, 45 | rnn_model: str, 46 | gripper_control: bool, 47 | discrete_gripper: bool, 48 | perceptual_emb_slice: Optional[tuple] = None, 49 | ): 50 | 
super(LogisticDecoderRNN, self).__init__() 51 | self.n_dist = n_mixtures 52 | self.gripper_control = gripper_control 53 | self.discrete_gripper = discrete_gripper 54 | self.log_scale_min = log_scale_min 55 | self.num_classes = num_classes 56 | self.plan_features = plan_features 57 | if perceptual_emb_slice is not None: 58 | in_features = (perceptual_emb_slice[1] - perceptual_emb_slice[0]) + latent_goal_features + plan_features 59 | else: 60 | in_features = perceptual_features + latent_goal_features + plan_features 61 | self.out_features = out_features - 1 if discrete_gripper else out_features # for discrete gripper act 62 | self.gripper_alpha = gripper_alpha 63 | self.rnn = eval(rnn_model) 64 | self.rnn = self.rnn(in_features, hidden_size, num_layers, policy_rnn_dropout_p) 65 | self.mean_fc = nn.Linear(hidden_size, self.out_features * self.n_dist) 66 | self.log_scale_fc = nn.Linear(hidden_size, self.out_features * self.n_dist) 67 | self.prob_fc = nn.Linear(hidden_size, self.out_features * self.n_dist) 68 | self.register_buffer("one_hot_embedding_eye", torch.eye(self.n_dist)) 69 | self.register_buffer("ones", torch.ones(1, 1, self.n_dist)) 70 | self._setup_action_bounds(dataset_dir, act_max_bound, act_min_bound, load_action_bounds) 71 | # hack for mypy 72 | self.one_hot_embedding_eye: torch.Tensor = self.one_hot_embedding_eye 73 | self.action_max_bound: torch.Tensor = self.action_max_bound 74 | self.action_min_bound: torch.Tensor = self.action_min_bound 75 | if self.discrete_gripper: 76 | self.gripper_bounds: torch.Tensor = self.gripper_bounds 77 | self.gripper_fc = nn.Linear(hidden_size, 2) 78 | self.criterion = nn.CrossEntropyLoss() 79 | self.perceptual_emb_slice = perceptual_emb_slice 80 | self.hidden_state = None 81 | 82 | def clear_hidden_state(self) -> None: 83 | self.hidden_state = None 84 | 85 | def loss_and_act( # type: ignore 86 | self, 87 | latent_plan: torch.Tensor, 88 | perceptual_emb: torch.Tensor, 89 | latent_goal: torch.Tensor, 90 | actions: torch.Tensor, 91 | robot_obs: torch.Tensor, 92 | ) -> Tuple[torch.Tensor, torch.Tensor]: 93 | logit_probs, log_scales, means, gripper_act, _ = self(latent_plan, perceptual_emb, latent_goal) 94 | pred_actions = self._sample(logit_probs, log_scales, means, gripper_act) 95 | if self.gripper_control: 96 | actions_tcp = world_to_tcp_frame(actions, robot_obs) 97 | loss = self._loss(logit_probs, log_scales, means, gripper_act, actions_tcp) 98 | pred_actions_world = tcp_to_world_frame(pred_actions, robot_obs) 99 | return loss, pred_actions_world 100 | else: 101 | loss = self._loss(logit_probs, log_scales, means, gripper_act, actions) 102 | return loss, pred_actions 103 | 104 | def act( # type: ignore 105 | self, 106 | latent_plan: torch.Tensor, 107 | perceptual_emb: torch.Tensor, 108 | latent_goal: torch.Tensor, 109 | robot_obs: torch.Tensor, 110 | ) -> torch.Tensor: 111 | logit_probs, log_scales, means, gripper_act, self.hidden_state = self( 112 | latent_plan, perceptual_emb, latent_goal, self.hidden_state 113 | ) 114 | pred_actions = self._sample(logit_probs, log_scales, means, gripper_act) 115 | if self.gripper_control: 116 | pred_actions_world = tcp_to_world_frame(pred_actions, robot_obs) 117 | return pred_actions_world 118 | else: 119 | return pred_actions 120 | 121 | def loss( # type: ignore 122 | self, 123 | latent_plan: torch.Tensor, 124 | perceptual_emb: torch.Tensor, 125 | latent_goal: torch.Tensor, 126 | actions: torch.Tensor, 127 | robot_obs: torch.Tensor, 128 | ) -> torch.Tensor: # type: ignore 129 | logit_probs, log_scales, means, 
gripper_act, _ = self(latent_plan, perceptual_emb, latent_goal) 130 | if self.gripper_control: 131 | actions_tcp = world_to_tcp_frame(actions, robot_obs) 132 | return self._loss(logit_probs, log_scales, means, gripper_act, actions_tcp) 133 | else: 134 | return self._loss(logit_probs, log_scales, means, gripper_act, actions) 135 | 136 | def _loss( 137 | self, 138 | logit_probs: torch.Tensor, 139 | log_scales: torch.Tensor, 140 | means: torch.Tensor, 141 | gripper_act: torch.Tensor, 142 | actions: torch.Tensor, 143 | ) -> torch.Tensor: 144 | if self.discrete_gripper: 145 | logistics_loss = self._logistic_loss(logit_probs, log_scales, means, actions[:, :, :-1]) 146 | gripper_gt = actions[:, :, -1].clone() 147 | # @fixme: hack because discrete actions are now -1 and 1, but we need 0, 1 for crossentropy loss 148 | m = gripper_gt == -1 149 | gripper_gt[m] = 0 150 | gripper_act_loss = self.criterion(gripper_act.view(-1, 2), gripper_gt.view(-1).long()) 151 | total_loss = logistics_loss + self.gripper_alpha * gripper_act_loss 152 | return total_loss 153 | else: 154 | logistics_loss = self._logistic_loss(logit_probs, log_scales, means, actions) 155 | return logistics_loss 156 | 157 | def _setup_action_bounds(self, dataset_dir, act_max_bound, act_min_bound, load_action_bounds): 158 | if load_action_bounds: 159 | try: 160 | statistics_path = Path(hulc.__file__).parent / dataset_dir / "training/statistics.yaml" 161 | statistics = OmegaConf.load(statistics_path) 162 | act_max_bound = statistics.act_max_bound 163 | act_min_bound = statistics.act_min_bound 164 | logger.info(f"Loaded action bounds from {statistics_path}") 165 | except FileNotFoundError: 166 | logger.info( 167 | f"Could not load statistics.yaml in {statistics_path}, taking action bounds defined in hydra conf" 168 | ) 169 | if self.discrete_gripper: 170 | self.register_buffer("gripper_bounds", torch.Tensor([act_min_bound[-1], act_max_bound[-1]])) 171 | act_max_bound = act_max_bound[:-1] # for discrete grasp 172 | act_min_bound = act_min_bound[:-1] 173 | action_max_bound = torch.Tensor(act_max_bound).float() 174 | action_min_bound = torch.Tensor(act_min_bound).float() 175 | assert action_max_bound.shape[0] == self.out_features 176 | assert action_min_bound.shape[0] == self.out_features 177 | action_max_bound = action_max_bound.unsqueeze(0).unsqueeze(0) # [1, 1, action_space] 178 | action_min_bound = action_min_bound.unsqueeze(0).unsqueeze(0) # [1, 1, action_space] 179 | action_max_bound = action_max_bound.unsqueeze(-1) * self.ones # broadcast to [1, 1, action_space, N_DIST] 180 | action_min_bound = action_min_bound.unsqueeze(-1) * self.ones # broadcast to [1, 1, action_space, N_DIST] 181 | self.register_buffer("action_max_bound", action_max_bound) 182 | self.register_buffer("action_min_bound", action_min_bound) 183 | 184 | def _logistic_loss( 185 | self, 186 | logit_probs: torch.Tensor, 187 | log_scales: torch.Tensor, 188 | means: torch.Tensor, 189 | actions: torch.Tensor, 190 | ) -> torch.Tensor: 191 | # Appropriate scale 192 | log_scales = torch.clamp(log_scales, min=self.log_scale_min) 193 | # Broadcast actions (B, A, N_DIST) 194 | actions = actions.unsqueeze(-1) * self.ones 195 | # Approximation of CDF derivative (PDF) 196 | centered_actions = actions - means 197 | inv_stdv = torch.exp(-log_scales) 198 | assert torch.is_tensor(self.action_max_bound) 199 | assert torch.is_tensor(self.action_min_bound) 200 | act_range = (self.action_max_bound - self.action_min_bound) / 2.0 201 | plus_in = inv_stdv * (centered_actions + act_range / 
(self.num_classes - 1)) 202 | cdf_plus = torch.sigmoid(plus_in) 203 | min_in = inv_stdv * (centered_actions - act_range / (self.num_classes - 1)) 204 | cdf_min = torch.sigmoid(min_in) 205 | 206 | # Corner Cases 207 | log_cdf_plus = plus_in - F.softplus(plus_in) # log probability for edge case of 0 (before scaling) 208 | log_one_minus_cdf_min = -F.softplus(min_in) # log probability for edge case of 255 (before scaling) 209 | # Log probability in the center of the bin 210 | mid_in = inv_stdv * centered_actions 211 | log_pdf_mid = mid_in - log_scales - 2.0 * F.softplus(mid_in) 212 | # Probability for all other cases 213 | cdf_delta = cdf_plus - cdf_min 214 | 215 | # Log probability 216 | log_probs = torch.where( 217 | actions < self.action_min_bound + 1e-3, 218 | log_cdf_plus, 219 | torch.where( 220 | actions > self.action_max_bound - 1e-3, 221 | log_one_minus_cdf_min, 222 | torch.where( 223 | cdf_delta > 1e-5, 224 | torch.log(torch.clamp(cdf_delta, min=1e-12)), 225 | log_pdf_mid - np.log((self.num_classes - 1) / 2), 226 | ), 227 | ), 228 | ) 229 | log_probs = log_probs + F.log_softmax(logit_probs, dim=-1) 230 | loss = -torch.sum(log_sum_exp(log_probs), dim=-1).mean() 231 | return loss 232 | 233 | # Sampling from logistic distribution 234 | def _sample( # type: ignore 235 | self, logit_probs: torch.Tensor, log_scales: torch.Tensor, means: torch.Tensor, gripper_act: torch.Tensor 236 | ) -> torch.Tensor: # type: ignore 237 | # Selecting Logistic distribution (Gumbel Sample) 238 | r1, r2 = 1e-5, 1.0 - 1e-5 239 | temp = (r1 - r2) * torch.rand(means.shape, device=means.device) + r2 240 | temp = logit_probs - torch.log(-torch.log(temp)) 241 | argmax = torch.argmax(temp, -1) 242 | # TODO: find out why mypy complains about type 243 | dist = self.one_hot_embedding_eye[argmax] 244 | 245 | # Select scales and means 246 | log_scales = (dist * log_scales).sum(dim=-1) 247 | means = (dist * means).sum(dim=-1) 248 | 249 | # Inversion sampling for logistic mixture sampling 250 | scales = torch.exp(log_scales) # Make positive 251 | u = (r1 - r2) * torch.rand(means.shape, device=means.device) + r2 252 | actions = means + scales * (torch.log(u) - torch.log(1.0 - u)) 253 | if self.discrete_gripper: 254 | gripper_cmd = self.gripper_bounds[gripper_act.argmax(dim=-1)] 255 | full_action = torch.cat([actions, gripper_cmd.unsqueeze(-1)], 2) 256 | return full_action 257 | else: 258 | return actions 259 | 260 | def forward( # type: ignore 261 | self, 262 | latent_plan: torch.Tensor, 263 | perceptual_emb: torch.Tensor, 264 | latent_goal: torch.Tensor, 265 | h_0: Optional[torch.Tensor] = None, 266 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 267 | if self.perceptual_emb_slice is not None: 268 | perceptual_emb = perceptual_emb[..., slice(*self.perceptual_emb_slice)] 269 | batch_size, seq_len = perceptual_emb.shape[0], perceptual_emb.shape[1] 270 | latent_plan = latent_plan.unsqueeze(1).expand(-1, seq_len, -1) 271 | latent_goal = latent_goal.unsqueeze(1).expand(-1, seq_len, -1) 272 | x = torch.cat([latent_plan, perceptual_emb, latent_goal], dim=-1) # b, s, (plan + visuo-propio + goal) 273 | if not isinstance(self.rnn, nn.Sequential) and isinstance(self.rnn, nn.RNNBase): 274 | x, h_n = self.rnn(x, h_0) 275 | else: 276 | x = self.rnn(x) 277 | h_n = None 278 | probs = self.prob_fc(x) 279 | means = self.mean_fc(x) 280 | log_scales = self.log_scale_fc(x) 281 | log_scales = torch.clamp(log_scales, min=self.log_scale_min) 282 | gripper_act = self.gripper_fc(x) if self.discrete_gripper else 
None 283 | # Appropriate dimensions 284 | logit_probs = probs.view(batch_size, seq_len, self.out_features, self.n_dist) 285 | means = means.view(batch_size, seq_len, self.out_features, self.n_dist) 286 | log_scales = log_scales.view(batch_size, seq_len, self.out_features, self.n_dist) 287 | return logit_probs, log_scales, means, gripper_act, h_n 288 | -------------------------------------------------------------------------------- /hulc/models/decoders/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/decoders/utils/__init__.py -------------------------------------------------------------------------------- /hulc/models/decoders/utils/gripper_control.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | from hulc.models.decoders.utils.pytorch3d_transforms import ( 5 | euler_angles_to_matrix, 6 | matrix_to_euler_angles, 7 | matrix_to_quaternion, 8 | quaternion_to_matrix, 9 | ) 10 | import torch 11 | from torch.cuda.amp import autocast 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def world_to_tcp_frame(action, robot_obs): 17 | with autocast(dtype=torch.float32): 18 | b, s, _ = action.shape 19 | world_T_tcp = euler_angles_to_matrix(robot_obs[..., 3:6], convention="XYZ").float().view(-1, 3, 3) 20 | tcp_T_world = torch.inverse(world_T_tcp) 21 | pos_w_rel = action[..., :3].view(-1, 3, 1) 22 | pos_tcp_rel = tcp_T_world @ pos_w_rel 23 | # downscaling is necessary here to get pseudo infinitesimal rotation 24 | orn_w_rel = action[..., 3:6] * 0.01 25 | world_T_tcp_new = ( 26 | euler_angles_to_matrix(robot_obs[..., 3:6] + orn_w_rel, convention="XYZ").float().view(-1, 3, 3) 27 | ) 28 | tcp_new_T_tcp_old = torch.inverse(world_T_tcp_new) @ world_T_tcp 29 | orn_tcp_rel = matrix_to_euler_angles(tcp_new_T_tcp_old, convention="XYZ").float() 30 | orn_tcp_rel = torch.where(orn_tcp_rel < -np.pi, orn_tcp_rel + 2 * np.pi, orn_tcp_rel) 31 | orn_tcp_rel = torch.where(orn_tcp_rel > np.pi, orn_tcp_rel - 2 * np.pi, orn_tcp_rel) 32 | # upscaling again 33 | orn_tcp_rel *= 100 34 | action_tcp = torch.cat([pos_tcp_rel.view(b, s, -1), orn_tcp_rel.view(b, s, -1), action[..., -1:]], dim=-1) 35 | assert not torch.any(action_tcp.isnan()) 36 | return action_tcp 37 | 38 | 39 | def tcp_to_world_frame(action, robot_obs): 40 | with autocast(dtype=torch.float32): 41 | b, s, _ = action.shape 42 | world_T_tcp = euler_angles_to_matrix(robot_obs[..., 3:6], convention="XYZ").float().view(-1, 3, 3) 43 | pos_tcp_rel = action[..., :3].view(-1, 3, 1) 44 | pos_w_rel = world_T_tcp @ pos_tcp_rel 45 | # downscaling is necessary here to get pseudo infinitesimal rotation 46 | orn_tcp_rel = action[..., 3:6] * 0.01 47 | tcp_new_T_tcp_old = euler_angles_to_matrix(orn_tcp_rel, convention="XYZ").float().view(-1, 3, 3) 48 | world_T_tcp_new = world_T_tcp @ torch.inverse(tcp_new_T_tcp_old) 49 | 50 | orn_w_new = matrix_to_euler_angles(world_T_tcp_new, convention="XYZ").float() 51 | if torch.any(orn_w_new.isnan()): 52 | logger.warning("NaN value in euler angles.") 53 | orn_w_new = matrix_to_euler_angles( 54 | quaternion_to_matrix(matrix_to_quaternion(world_T_tcp_new)), convention="XYZ" 55 | ).float() 56 | orn_w_rel = orn_w_new - robot_obs[..., 3:6].view(-1, 3) 57 | orn_w_rel = torch.where(orn_w_rel < -np.pi, orn_w_rel + 2 * np.pi, orn_w_rel) 58 | orn_w_rel = torch.where(orn_w_rel > np.pi, orn_w_rel - 2 * 
np.pi, orn_w_rel) 59 | # upscaling again 60 | orn_w_rel *= 100 61 | action_w = torch.cat([pos_w_rel.view(b, s, -1), orn_w_rel.view(b, s, -1), action[..., -1:]], dim=-1) 62 | assert not torch.any(action_w.isnan()) 63 | return action_w 64 | -------------------------------------------------------------------------------- /hulc/models/decoders/utils/rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def rnn_decoder(in_features: int, hidden_size: int, num_layers: int, policy_rnn_dropout_p: float) -> torch.nn.Module: 6 | return nn.RNN( 7 | input_size=in_features, 8 | hidden_size=hidden_size, 9 | num_layers=num_layers, 10 | nonlinearity="relu", 11 | bidirectional=False, 12 | batch_first=True, 13 | dropout=policy_rnn_dropout_p, 14 | ) 15 | 16 | 17 | def lstm_decoder(in_features: int, hidden_size: int, num_layers: int, policy_rnn_dropout_p: float) -> torch.nn.Module: 18 | return nn.LSTM( 19 | input_size=in_features, 20 | hidden_size=hidden_size, 21 | num_layers=num_layers, 22 | bidirectional=False, 23 | batch_first=True, 24 | dropout=policy_rnn_dropout_p, 25 | ) 26 | 27 | 28 | def gru_decoder(in_features: int, hidden_size: int, num_layers: int, policy_rnn_dropout_p: float) -> torch.nn.Module: 29 | return nn.GRU( 30 | input_size=in_features, 31 | hidden_size=hidden_size, 32 | num_layers=num_layers, 33 | bidirectional=False, 34 | batch_first=True, 35 | dropout=policy_rnn_dropout_p, 36 | ) 37 | 38 | 39 | def mlp_decoder(in_features: int, hidden_size: int, num_layers: int, policy_rnn_dropout_p: float) -> torch.nn.Module: 40 | return nn.Sequential( 41 | nn.Linear(in_features=in_features, out_features=hidden_size), 42 | nn.ReLU(), 43 | nn.Linear(in_features=hidden_size, out_features=hidden_size), 44 | nn.ReLU(), 45 | nn.Linear(in_features=hidden_size, out_features=hidden_size), 46 | ) 47 | -------------------------------------------------------------------------------- /hulc/models/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/encoders/__init__.py -------------------------------------------------------------------------------- /hulc/models/encoders/clip_lang_encoder.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from hulc.models.perceptual_encoders.clip import build_model, load_clip, tokenize 7 | 8 | 9 | class LangClip(nn.Module): 10 | def __init__(self, freeze_backbone: bool = True, model_name: str = "RN50"): 11 | super(LangClip, self).__init__() 12 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 13 | # Load CLIP model 14 | print(f"loading language CLIP model with backbone: {model_name}") 15 | self._load_clip(model_name) 16 | if freeze_backbone: 17 | for param in self.clip_rn50.parameters(): 18 | param.requires_grad = False 19 | 20 | def _load_clip(self, model_name: str) -> None: 21 | model, _ = load_clip(model_name, device=self.device) 22 | self.clip_rn50 = build_model(model.state_dict()).to(self.device) 23 | 24 | def forward(self, x: List) -> torch.Tensor: 25 | with torch.no_grad(): 26 | tokens = tokenize(x).to(self.device) 27 | emb = self.clip_rn50.encode_text(tokens) 28 | return torch.unsqueeze(emb, 1) 29 | -------------------------------------------------------------------------------- 
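Editor's note (added): the decoder factories in rnn.py above (rnn_decoder, lstm_decoder, gru_decoder, mlp_decoder) are what the `rnn_model` config string resolves to via `eval(rnn_model)` inside DeterministicDecoder and LogisticDecoderRNN, and the recurrent variants all expect batch-first input of shape (batch, seq_len, plan + perceptual + goal features). A minimal sketch of that wiring, with illustrative sizes that are not taken from the repo's configs:

import torch
from hulc.models.decoders.utils.rnn import lstm_decoder  # also available: rnn_decoder, gru_decoder, mlp_decoder

rnn_model = "lstm_decoder"  # the string a Hydra config would pass to the decoder
rnn = eval(rnn_model)(in_features=96, hidden_size=256, num_layers=2, policy_rnn_dropout_p=0.0)
x = torch.randn(4, 32, 96)  # (batch, seq_len, in_features)
out, (h_n, c_n) = rnn(x)    # out: (4, 32, 256) because batch_first=True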
/hulc/models/encoders/goal_encoders.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class VisualGoalEncoder(nn.Module): 9 | def __init__( 10 | self, 11 | hidden_size: int, 12 | latent_goal_features: int, 13 | in_features: int, 14 | l2_normalize_goal_embeddings: bool, 15 | activation_function: str, 16 | ): 17 | super().__init__() 18 | self.l2_normalize_output = l2_normalize_goal_embeddings 19 | self.act_fn = getattr(nn, activation_function)() 20 | self.mlp = nn.Sequential( 21 | nn.Linear(in_features=in_features, out_features=hidden_size), 22 | # nn.BatchNorm1d(hidden_size), 23 | self.act_fn, 24 | nn.Linear(in_features=hidden_size, out_features=hidden_size), 25 | # nn.BatchNorm1d(hidden_size), 26 | self.act_fn, 27 | nn.Linear(in_features=hidden_size, out_features=latent_goal_features), 28 | ) 29 | self.ln = nn.LayerNorm(latent_goal_features) 30 | 31 | def forward(self, x: torch.Tensor) -> torch.Tensor: 32 | x = self.mlp(x) 33 | if self.l2_normalize_output: 34 | x = F.normalize(x, p=2, dim=1) 35 | x = self.ln(x) 36 | return x 37 | 38 | 39 | class LanguageGoalEncoder(nn.Module): 40 | def __init__( 41 | self, 42 | in_features: int, 43 | hidden_size: int, 44 | latent_goal_features: int, 45 | l2_normalize_goal_embeddings: bool, 46 | word_dropout_p: float, 47 | activation_function: str, 48 | ): 49 | super().__init__() 50 | self.l2_normalize_output = l2_normalize_goal_embeddings 51 | self.act_fn = getattr(nn, activation_function)() 52 | self.mlp = nn.Sequential( 53 | nn.Dropout(word_dropout_p), 54 | nn.Linear(in_features=in_features, out_features=hidden_size), 55 | # nn.BatchNorm1d(hidden_size), 56 | self.act_fn, 57 | nn.Linear(in_features=hidden_size, out_features=hidden_size), 58 | # nn.BatchNorm1d(hidden_size), 59 | self.act_fn, 60 | nn.Linear(in_features=hidden_size, out_features=latent_goal_features), 61 | ) 62 | self.ln = nn.LayerNorm(latent_goal_features) 63 | 64 | def forward(self, x: torch.Tensor) -> torch.Tensor: 65 | x = self.mlp(x) 66 | if self.l2_normalize_output: 67 | x = F.normalize(x, p=2, dim=1) 68 | x = self.ln(x) 69 | return x 70 | -------------------------------------------------------------------------------- /hulc/models/encoders/lang_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class LanguageEncoder(nn.Module): 6 | def __init__( 7 | self, 8 | language_features: int, 9 | hidden_size: int, 10 | out_features: int, 11 | word_dropout_p: float, 12 | activation_function: str, 13 | ): 14 | super().__init__() 15 | self.act_fn = getattr(nn, activation_function)() 16 | self.mlp = nn.Sequential( 17 | nn.Dropout(word_dropout_p), 18 | nn.Linear(in_features=language_features, out_features=hidden_size), 19 | # nn.BatchNorm1d(hidden_size), 20 | self.act_fn, 21 | nn.Linear(in_features=hidden_size, out_features=hidden_size), 22 | # nn.BatchNorm1d(hidden_size), 23 | self.act_fn, 24 | nn.Linear(in_features=hidden_size, out_features=out_features), 25 | ) 26 | 27 | def forward(self, x: torch.Tensor) -> torch.Tensor: 28 | x = self.mlp(x) 29 | return x 30 | -------------------------------------------------------------------------------- /hulc/models/encoders/language_network.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from sentence_transformers import 
SentenceTransformer 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | class SBert(nn.Module): 9 | def __init__(self, nlp_model: str): 10 | # choose model from https://www.sbert.net/docs/pretrained_models.html 11 | super().__init__() 12 | assert isinstance(nlp_model, str) 13 | self.model = SentenceTransformer(nlp_model) 14 | 15 | def forward(self, x: List) -> torch.Tensor: 16 | emb = self.model.encode(x, convert_to_tensor=True) 17 | return torch.unsqueeze(emb, 1) 18 | -------------------------------------------------------------------------------- /hulc/models/gcbc.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict 3 | 4 | import torch 5 | 6 | from hulc.models.hulc import Hulc 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class GCBC(Hulc): 12 | """ 13 | Goal-conditioned behavior cloning. 14 | """ 15 | 16 | @staticmethod 17 | def setup_input_sizes( 18 | perceptual_encoder, 19 | plan_proposal, 20 | plan_recognition, 21 | visual_goal, 22 | action_decoder, 23 | distribution, 24 | ): 25 | """ 26 | Configure the input feature sizes of the respective parts of the network. 27 | 28 | Args: 29 | perceptual_encoder: DictConfig for perceptual encoder. 30 | plan_proposal: DictConfig for plan proposal network. 31 | plan_recognition: DictConfig for plan recognition network. 32 | visual_goal: DictConfig for visual goal encoder. 33 | action_decoder: DictConfig for action decoder network. 34 | distribution: DictConfig for plan distribution (continuous or discrete). 35 | """ 36 | plan_proposal.perceptual_features = perceptual_encoder.latent_size 37 | plan_recognition.in_features = perceptual_encoder.latent_size 38 | visual_goal.in_features = perceptual_encoder.latent_size 39 | action_decoder.perceptual_features = perceptual_encoder.latent_size 40 | 41 | if distribution.dist == "discrete": 42 | plan_proposal.plan_features = distribution.class_size * distribution.category_size 43 | plan_recognition.plan_features = distribution.class_size * distribution.category_size 44 | action_decoder.plan_features = 0 45 | elif distribution.dist == "continuous": 46 | plan_proposal.plan_features = distribution.plan_features 47 | plan_recognition.plan_features = distribution.plan_features 48 | action_decoder.plan_features = 0 49 | 50 | def training_step(self, batch: Dict[str, Dict], batch_idx: int) -> torch.Tensor: # type: ignore 51 | """ 52 | Compute and return the training loss. 53 | 54 | Args: 55 | batch (dict): 56 | - 'vis' (dict): 57 | - 'rgb_obs' (dict): 58 | - 'rgb_static' (Tensor): RGB camera image of static camera 59 | - ... 60 | - 'depth_obs' (dict): 61 | - 'depth_static' (Tensor): Depth camera image of depth camera 62 | - ... 63 | - 'robot_obs' (Tensor): Proprioceptive state observation. 64 | - 'actions' (Tensor): Ground truth actions. 65 | - 'state_info' (dict): 66 | - 'robot_obs' (Tensor): Unnormalized robot states. 67 | - 'scene_obs' (Tensor): Unnormalized scene states. 68 | - 'idx' (LongTensor): Episode indices. 69 | - 'lang' (dict): 70 | Like 'vis' but with additional keys: 71 | - 'language' (Tensor): Embedded Language labels. 72 | - 'use_for_aux_lang_loss' (BoolTensor): Mask of which sequences in the batch to consider for 73 | auxiliary loss. 74 | batch_idx (int): Integer displaying index of this batch. 
75 | 76 | 77 | Returns: 78 | loss tensor 79 | """ 80 | action_loss, proprio_loss, lang_pred_loss, lang_contrastive_loss, lang_clip_loss, total_loss = ( 81 | torch.tensor(0.0).to(self.device), 82 | torch.tensor(0.0).to(self.device), 83 | torch.tensor(0.0).to(self.device), 84 | torch.tensor(0.0).to(self.device), 85 | torch.tensor(0.0).to(self.device), 86 | torch.tensor(0.0).to(self.device), 87 | ) 88 | 89 | batch_size: Dict[str, int] = {} 90 | total_bs = 0 91 | for self.modality_scope, dataset_batch in batch.items(): 92 | perceptual_emb = self.perceptual_encoder( 93 | dataset_batch["rgb_obs"], dataset_batch["depth_obs"], dataset_batch["robot_obs"] 94 | ) 95 | if self.state_recons: 96 | proprio_loss += self.perceptual_encoder.state_reconstruction_loss() 97 | if "lang" in self.modality_scope: 98 | latent_goal = self.language_goal(dataset_batch["lang"]) 99 | else: 100 | latent_goal = self.visual_goal(perceptual_emb[:, -1]) 101 | 102 | robot_obs = dataset_batch["state_info"]["robot_obs"] 103 | actions = dataset_batch["actions"] 104 | empty_plan = torch.empty((dataset_batch["actions"].shape[0]), 0).to(self.device) 105 | act_loss = self.action_decoder.loss(empty_plan, perceptual_emb, latent_goal, actions, robot_obs) 106 | _, seq_feat = self.plan_recognition(perceptual_emb) 107 | 108 | if "lang" in self.modality_scope: 109 | if not torch.any(dataset_batch["use_for_aux_lang_loss"]): 110 | batch_size["aux_lang"] = 1 111 | else: 112 | batch_size["aux_lang"] = torch.sum(dataset_batch["use_for_aux_lang_loss"]).detach() # type:ignore 113 | if self.use_bc_z_auxiliary_loss: 114 | lang_pred_loss += self.bc_z_auxiliary_loss( 115 | seq_feat, dataset_batch["lang"], dataset_batch["use_for_aux_lang_loss"] 116 | ) 117 | if self.use_clip_auxiliary_loss: 118 | lang_clip_loss += self.clip_auxiliary_loss( 119 | seq_feat, latent_goal, dataset_batch["use_for_aux_lang_loss"] 120 | ) 121 | if self.use_mia_auxiliary_loss: 122 | lang_contrastive_loss += self.mia_auxiliary_loss( 123 | seq_feat, latent_goal, dataset_batch["use_for_aux_lang_loss"] 124 | ) 125 | action_loss += act_loss 126 | total_loss += act_loss 127 | batch_size[self.modality_scope] = dataset_batch["actions"].shape[0] 128 | total_bs += dataset_batch["actions"].shape[0] 129 | 130 | self.log( 131 | f"train/action_loss_{self.modality_scope}", 132 | act_loss, 133 | on_step=False, 134 | on_epoch=True, 135 | batch_size=batch_size[self.modality_scope], 136 | ) 137 | total_loss = total_loss / len(batch) # divide accumulated gradients by number of datasets 138 | action_loss = action_loss / len(batch) 139 | if self.state_recons: 140 | proprio_loss = proprio_loss / len(batch) 141 | total_loss = total_loss + self.st_recon_beta * proprio_loss 142 | self.log( 143 | "train/pred_proprio", 144 | self.st_recon_beta * proprio_loss, 145 | on_step=False, 146 | on_epoch=True, 147 | batch_size=total_bs, 148 | ) 149 | if self.use_bc_z_auxiliary_loss: 150 | total_loss = total_loss + self.bc_z_auxiliary_loss_beta * lang_pred_loss 151 | self.log( 152 | "train/pred_lang", 153 | self.bc_z_auxiliary_loss_beta * lang_pred_loss, 154 | on_step=False, 155 | on_epoch=True, 156 | batch_size=batch_size["aux_lang"], 157 | sync_dist=True, 158 | ) 159 | if self.use_mia_auxiliary_loss: 160 | total_loss = total_loss + self.mia_auxiliary_loss_beta * lang_contrastive_loss 161 | self.log( 162 | "train/lang_contrastive", 163 | self.mia_auxiliary_loss_beta * lang_contrastive_loss, 164 | on_step=False, 165 | on_epoch=True, 166 | batch_size=batch_size["aux_lang"], 167 | sync_dist=True, 168 | ) 169 | if 
self.use_clip_auxiliary_loss: 170 | total_loss = total_loss + self.clip_auxiliary_loss_beta * lang_clip_loss 171 | self.log( 172 | "train/lang_clip_loss", 173 | self.clip_auxiliary_loss_beta * lang_clip_loss, 174 | on_step=False, 175 | on_epoch=True, 176 | batch_size=batch_size["aux_lang"], 177 | sync_dist=True, 178 | ) 179 | self.log("train/action_loss", action_loss, on_step=False, on_epoch=True, batch_size=total_bs) 180 | self.log("train/total_loss", total_loss, on_step=False, on_epoch=True, batch_size=total_bs) 181 | return total_loss 182 | 183 | def validation_step(self, batch: Dict[str, Dict], batch_idx: int) -> Dict[str, torch.Tensor]: # type: ignore 184 | """ 185 | Compute and log the validation losses and additional metrics. 186 | 187 | Args: 188 | batch (dict): 189 | - 'vis' (dict): 190 | - 'rgb_obs' (dict): 191 | - 'rgb_static' (Tensor): RGB camera image of static camera 192 | - ... 193 | - 'depth_obs' (dict): 194 | - 'depth_static' (Tensor): Depth camera image of depth camera 195 | - ... 196 | - 'robot_obs' (Tensor): Proprioceptive state observation. 197 | - 'actions' (Tensor): Ground truth actions. 198 | - 'state_info' (dict): 199 | - 'robot_obs' (Tensor): Unnormalized robot states. 200 | - 'scene_obs' (Tensor): Unnormalized scene states. 201 | - 'idx' (LongTensor): Episode indices. 202 | - 'lang' (dict): 203 | Like 'vis' but with additional keys: 204 | - 'language' (Tensor): Embedded Language labels. 205 | - 'use_for_aux_lang_loss' (BoolTensor): Mask of which sequences in the batch to consider for 206 | auxiliary loss. 207 | batch_idx (int): Integer displaying index of this batch. 208 | 209 | Returns: 210 | Dictionary containing the sampled plans of plan recognition and plan proposal networks, as well as the 211 | episode indices. 212 | """ 213 | output = {} 214 | val_total_act_loss = torch.tensor(0.0).to(self.device) 215 | for self.modality_scope, dataset_batch in batch.items(): 216 | perceptual_emb = self.perceptual_encoder( 217 | dataset_batch["rgb_obs"], dataset_batch["depth_obs"], dataset_batch["robot_obs"] 218 | ) 219 | if self.state_recons: 220 | state_recon_loss = self.perceptual_encoder.state_reconstruction_loss() 221 | self.log(f"val/proprio_loss_{self.modality_scope}", state_recon_loss, sync_dist=True) 222 | if "lang" in self.modality_scope: 223 | latent_goal = self.language_goal(dataset_batch["lang"]) 224 | else: 225 | latent_goal = self.visual_goal(perceptual_emb[:, -1]) 226 | 227 | robot_obs = dataset_batch["state_info"]["robot_obs"] 228 | actions = dataset_batch["actions"] 229 | empty_plan = torch.empty((dataset_batch["actions"].shape[0]), 0).to(self.device) 230 | action_loss, sample_act = self.action_decoder.loss_and_act( # type: ignore 231 | empty_plan, perceptual_emb, latent_goal, actions, robot_obs 232 | ) 233 | mae = torch.nn.functional.l1_loss( 234 | sample_act[..., :-1], actions[..., :-1], reduction="none" 235 | ) # (batch, seq, 6) 236 | mae = torch.mean(mae, 1) # (batch, 6) 237 | # gripper action 238 | gripper_discrete = sample_act[..., -1] 239 | gt_gripper_act = actions[..., -1] 240 | m = gripper_discrete > 0 241 | gripper_discrete[m] = 1 242 | gripper_discrete[~m] = -1 243 | gripper_sr = torch.mean((gt_gripper_act == gripper_discrete).float()) 244 | _, seq_feat = self.plan_recognition(perceptual_emb) 245 | 246 | if "lang" in self.modality_scope: 247 | if self.use_bc_z_auxiliary_loss: 248 | val_pred_lang_loss = self.bc_z_auxiliary_loss( 249 | seq_feat, dataset_batch["lang"], dataset_batch["use_for_aux_lang_loss"] 250 | ) 251 | 
self.log("val/lang_pred_loss", val_pred_lang_loss, sync_dist=True) 252 | if self.use_clip_auxiliary_loss: 253 | val_pred_clip_loss = self.clip_auxiliary_loss( 254 | seq_feat, latent_goal, dataset_batch["use_for_aux_lang_loss"] 255 | ) 256 | self.log("val/val_pred_clip_loss", val_pred_clip_loss, sync_dist=True) 257 | self.clip_groundtruth(seq_feat, dataset_batch["idx"], dataset_batch["use_for_aux_lang_loss"]) 258 | if self.use_mia_auxiliary_loss: 259 | val_pred_contrastive_loss = self.mia_auxiliary_loss( 260 | seq_feat, latent_goal, dataset_batch["use_for_aux_lang_loss"] 261 | ) 262 | self.log("val/lang_contrastive_loss", val_pred_contrastive_loss, sync_dist=True) 263 | val_total_act_loss += action_loss 264 | mae_mean = mae.mean() 265 | pos_mae = mae[..., :3].mean() 266 | orn_mae = mae[..., 3:6].mean() 267 | self.log(f"val_total_mae/{self.modality_scope}_total_mae", mae_mean, sync_dist=True) 268 | self.log(f"val_pos_mae/{self.modality_scope}_pos_mae", pos_mae, sync_dist=True) 269 | self.log(f"val_orn_mae/{self.modality_scope}_orn_mae", orn_mae, sync_dist=True) 270 | self.log(f"val_act/{self.modality_scope}_act_loss", action_loss, sync_dist=True) 271 | self.log(f"val_grip/{self.modality_scope}_grip_sr", gripper_sr, sync_dist=True) 272 | self.log( 273 | "val_act/action_loss", 274 | val_total_act_loss / len(self.trainer.datamodule.modalities), # type:ignore 275 | sync_dist=True, 276 | ) 277 | output[f"idx_{self.modality_scope}"] = dataset_batch["idx"] 278 | 279 | return output 280 | 281 | def reset(self): 282 | """ 283 | Call this at the beginning of a new rollout when doing inference. 284 | """ 285 | self.latent_goal = None 286 | 287 | def step(self, obs, goal): 288 | """ 289 | Do one step of inference with the model. 290 | 291 | Args: 292 | obs (dict): Observation from environment. 293 | goal (dict): Goal as visual observation or embedded language instruction. 294 | 295 | Returns: 296 | Predicted action. 
297 | """ 298 | with torch.no_grad(): 299 | if self.latent_goal is None: 300 | if isinstance(goal, str): 301 | embedded_lang = torch.from_numpy(self.lang_embeddings[goal]).to(self.device).squeeze(0).float() 302 | self.latent_goal = self.language_goal(embedded_lang) 303 | else: 304 | imgs = { 305 | k: torch.cat([v, goal["rgb_obs"][k]], dim=1) for k, v in obs["rgb_obs"].items() 306 | } # (1, 2, C, H, W) 307 | depth_imgs = {k: torch.cat([v, goal["depth_obs"][k]], dim=1) for k, v in obs["depth_obs"].items()} 308 | state = torch.cat([obs["robot_obs"], goal["robot_obs"]], dim=1) 309 | perceptual_emb = self.perceptual_encoder(imgs, depth_imgs, state) 310 | self.latent_goal = self.visual_goal(perceptual_emb[:, -1]) 311 | 312 | perceptual_emb = self.perceptual_encoder(obs["rgb_obs"], obs["depth_obs"], obs["robot_obs"]) 313 | empty_plan = torch.empty(1, 0).to(self.device) 314 | action = self.action_decoder.act( 315 | empty_plan, perceptual_emb, self.latent_goal, obs["robot_obs_raw"] 316 | ) # type: ignore 317 | return action 318 | -------------------------------------------------------------------------------- /hulc/models/perceptual_encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/perceptual_encoders/__init__.py -------------------------------------------------------------------------------- /hulc/models/perceptual_encoders/concat_encoders.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | import hydra 4 | from omegaconf import DictConfig 5 | import torch 6 | import torch.nn as nn 7 | from torch.nn.functional import mse_loss 8 | 9 | 10 | class ConcatEncoders(nn.Module): 11 | def __init__( 12 | self, 13 | rgb_static: DictConfig, 14 | proprio: DictConfig, 15 | device: torch.device, 16 | depth_static: Optional[DictConfig] = None, 17 | rgb_gripper: Optional[DictConfig] = None, 18 | depth_gripper: Optional[DictConfig] = None, 19 | tactile: Optional[DictConfig] = None, 20 | state_decoder: Optional[DictConfig] = None, 21 | ): 22 | super().__init__() 23 | self._latent_size = rgb_static.visual_features 24 | if rgb_gripper: 25 | self._latent_size += rgb_gripper.visual_features 26 | if depth_static: 27 | self._latent_size += depth_static.visual_features 28 | if depth_gripper: 29 | self._latent_size += depth_gripper.visual_features 30 | if tactile: 31 | self._latent_size += tactile.visual_features 32 | visual_features = self._latent_size 33 | # super ugly, fix this clip ddp thing in a better way 34 | if "clip" in rgb_static["_target_"]: 35 | self.rgb_static_encoder = hydra.utils.instantiate(rgb_static, device=device) 36 | else: 37 | self.rgb_static_encoder = hydra.utils.instantiate(rgb_static) 38 | self.depth_static_encoder = hydra.utils.instantiate(depth_static) if depth_static else None 39 | self.rgb_gripper_encoder = hydra.utils.instantiate(rgb_gripper) if rgb_gripper else None 40 | self.depth_gripper_encoder = hydra.utils.instantiate(depth_gripper) if depth_gripper else None 41 | self.tactile_encoder = hydra.utils.instantiate(tactile) 42 | self.proprio_encoder = hydra.utils.instantiate(proprio) 43 | if self.proprio_encoder: 44 | self._latent_size += self.proprio_encoder.out_features 45 | 46 | self.state_decoder = None 47 | if state_decoder: 48 | state_decoder.visual_features = visual_features 49 | state_decoder.n_state_obs = self.proprio_encoder.out_features 50 | 
self.state_decoder = hydra.utils.instantiate(state_decoder) 51 | 52 | self.current_visual_embedding = None 53 | self.current_state_obs = None 54 | 55 | @property 56 | def latent_size(self): 57 | return self._latent_size 58 | 59 | def forward( 60 | self, imgs: Dict[str, torch.Tensor], depth_imgs: Dict[str, torch.Tensor], state_obs: torch.Tensor 61 | ) -> torch.Tensor: 62 | rgb_static = imgs["rgb_static"] 63 | rgb_gripper = imgs["rgb_gripper"] if "rgb_gripper" in imgs else None 64 | rgb_tactile = imgs["rgb_tactile"] if "rgb_tactile" in imgs else None 65 | depth_static = depth_imgs["depth_static"] if "depth_static" in depth_imgs else None 66 | depth_gripper = depth_imgs["depth_gripper"] if "depth_gripper" in depth_imgs else None 67 | 68 | b, s, c, h, w = rgb_static.shape 69 | rgb_static = rgb_static.reshape(-1, c, h, w) # (batch_size * sequence_length, 3, 200, 200) 70 | # ------------ Vision Network ------------ # 71 | encoded_imgs = self.rgb_static_encoder(rgb_static) # (batch*seq_len, 64) 72 | encoded_imgs = encoded_imgs.reshape(b, s, -1) # (batch, seq, 64) 73 | 74 | if depth_static is not None: 75 | depth_static = torch.unsqueeze(depth_static, 2) 76 | depth_static = depth_static.reshape(-1, 1, h, w) # (batch_size * sequence_length, 3, 200, 200) 77 | encoded_depth_static = self.depth_static_encoder(depth_static) # (batch*seq_len, 64) 78 | encoded_depth_static = encoded_depth_static.reshape(b, s, -1) # (batch, seq, 64) 79 | encoded_imgs = torch.cat([encoded_imgs, encoded_depth_static], dim=-1) 80 | 81 | if rgb_gripper is not None: 82 | b, s, c, h, w = rgb_gripper.shape 83 | rgb_gripper = rgb_gripper.reshape(-1, c, h, w) # (batch_size * sequence_length, 3, 84, 84) 84 | encoded_imgs_gripper = self.rgb_gripper_encoder(rgb_gripper) # (batch*seq_len, 64) 85 | encoded_imgs_gripper = encoded_imgs_gripper.reshape(b, s, -1) # (batch, seq, 64) 86 | encoded_imgs = torch.cat([encoded_imgs, encoded_imgs_gripper], dim=-1) 87 | if depth_gripper is not None: 88 | depth_gripper = torch.unsqueeze(depth_gripper, 2) 89 | depth_gripper = depth_gripper.reshape(-1, 1, h, w) # (batch_size * sequence_length, 1, 84, 84) 90 | encoded_depth_gripper = self.depth_gripper_encoder(depth_gripper) 91 | encoded_depth_gripper = encoded_depth_gripper.reshape(b, s, -1) # (batch, seq, 64) 92 | encoded_imgs = torch.cat([encoded_imgs, encoded_depth_gripper], dim=-1) 93 | 94 | if rgb_tactile is not None: 95 | b, s, c, h, w = rgb_tactile.shape 96 | rgb_tactile = rgb_tactile.reshape(-1, c, h, w) # (batch_size * sequence_length, 3, 84, 84) 97 | encoded_tactile = self.tactile_encoder(rgb_tactile) 98 | encoded_tactile = encoded_tactile.reshape(b, s, -1) 99 | encoded_imgs = torch.cat([encoded_imgs, encoded_tactile], dim=-1) 100 | 101 | self.current_visual_embedding = encoded_imgs 102 | self.current_state_obs = state_obs # type: ignore 103 | if self.proprio_encoder: 104 | state_obs_out = self.proprio_encoder(state_obs) 105 | perceptual_emb = torch.cat([encoded_imgs, state_obs_out], dim=-1) 106 | else: 107 | perceptual_emb = encoded_imgs 108 | 109 | return perceptual_emb 110 | 111 | def state_reconstruction_loss(self): 112 | assert self.state_decoder is not None 113 | proprio_pred = self.state_decoder(self.current_visual_embedding) 114 | return mse_loss(self.current_state_obs, proprio_pred) 115 | -------------------------------------------------------------------------------- /hulc/models/perceptual_encoders/proprio_encoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from 
torch import nn 3 | 4 | 5 | class IdentityEncoder(nn.Module): 6 | def __init__(self, proprioception_dims): 7 | super(IdentityEncoder, self).__init__() 8 | # remove a dimension if we convert robot orientation quaternion to euler angles 9 | self.n_state_obs = int(np.sum(np.diff([list(x) for x in [list(y) for y in proprioception_dims.keep_indices]]))) 10 | self.identity = nn.Identity() 11 | 12 | @property 13 | def out_features(self): 14 | return self.n_state_obs 15 | 16 | def forward(self, x): 17 | return self.identity(x) 18 | -------------------------------------------------------------------------------- /hulc/models/perceptual_encoders/tactile_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torchvision.models as models 5 | 6 | 7 | class TactileEncoder(nn.Module): 8 | def __init__(self, visual_features: int, freeze_tactile_backbone: bool = True): 9 | super(TactileEncoder, self).__init__() 10 | # Load pre-trained resnet-18 11 | net = models.resnet18(pretrained=True) 12 | # Remove the last fc layer, and rebuild 13 | modules = list(net.children())[:-1] 14 | self.net = nn.Sequential(*modules) 15 | if freeze_tactile_backbone: 16 | for param in self.net.parameters(): 17 | param.requires_grad = False 18 | self.fc1 = nn.Linear(1024, 512) 19 | self.fc2 = nn.Linear(512, visual_features) 20 | 21 | def forward(self, x: torch.Tensor) -> torch.Tensor: 22 | x_l = self.net(x[:, :3, :, :]).squeeze() 23 | x_r = self.net(x[:, 3:, :, :]).squeeze() 24 | x = torch.cat((x_l, x_r), dim=-1) 25 | # Add fc layer for final prediction 26 | output = F.relu(self.fc1(x)) # batch, 512 27 | output = self.fc2(output) # batch, 64 28 | return output 29 | -------------------------------------------------------------------------------- /hulc/models/perceptual_encoders/vision_clip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from hulc.models.perceptual_encoders.clip import load_clip 6 | 7 | 8 | class VisionClip(nn.Module): 9 | def __init__( 10 | self, device: torch.device, visual_features: int, freeze_backbone: bool = True, model_name: str = "RN50" 11 | ): 12 | super(VisionClip, self).__init__() 13 | # Load CLIP model 14 | print(f"loading vision CLIP model with backbone: {model_name}") 15 | self.clip_model, _ = load_clip(model_name, device=device) 16 | if freeze_backbone: 17 | for param in self.clip_model.parameters(): 18 | param.requires_grad = False 19 | if "RN50" in model_name: 20 | self.fc1 = nn.Linear(1024, 512) 21 | self.fc2 = nn.Linear(512, visual_features) 22 | elif "ViT-B/32" in model_name: 23 | self.fc1 = nn.Linear(512, 256) 24 | self.fc2 = nn.Linear(256, visual_features) 25 | 26 | def forward(self, x: torch.Tensor) -> torch.Tensor: 27 | x = self.clip_model.encode_image(x) # type:ignore 28 | output = F.relu(self.fc1(x)) # batch, 512 29 | output = self.fc2(output) # batch, 64 30 | return output 31 | -------------------------------------------------------------------------------- /hulc/models/perceptual_encoders/vision_network.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from typing import Optional, Tuple 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn.parameter import Parameter 9 | 10 | 11 | class VisionNetwork(nn.Module): 12 | # 
reference: https://arxiv.org/pdf/2005.07648.pdf 13 | def __init__( 14 | self, 15 | input_width: int, 16 | input_height: int, 17 | activation_function: str, 18 | dropout_vis_fc: float, 19 | l2_normalize_output: bool, 20 | visual_features: int, 21 | num_c: int, 22 | use_sinusoid: bool, 23 | spatial_softmax_temp: float, 24 | ): 25 | super(VisionNetwork, self).__init__() 26 | self.l2_normalize_output = l2_normalize_output 27 | self.act_fn = getattr(nn, activation_function)() 28 | # w,h,kernel_size,padding,stride 29 | w, h = self.calc_out_size(input_width, input_height, 8, 0, 4) 30 | w, h = self.calc_out_size(w, h, 4, 0, 2) 31 | w, h = self.calc_out_size(w, h, 3, 0, 1) 32 | self.use_sinusoid = use_sinusoid 33 | temp = None if not isinstance(spatial_softmax_temp, float) else spatial_softmax_temp 34 | self.spatial_softmax = SpatialSoftmax(num_rows=w, num_cols=h, temperature=temp) # shape: [N, 128] 35 | # model 36 | self.conv_model = nn.Sequential( 37 | # input shape: [N, 3, 200, 200] 38 | nn.Conv2d(in_channels=num_c, out_channels=32, kernel_size=8, stride=4), # shape: [N, 32, 49, 49] 39 | # nn.BatchNorm2d(32), 40 | self.act_fn, 41 | nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2), # shape: [N, 64, 23, 23] 42 | # nn.BatchNorm2d(64), 43 | self.act_fn, 44 | nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1), # shape: [N, 64, 21, 21] 45 | # nn.BatchNorm2d(64), 46 | self.act_fn, 47 | ) 48 | k = 3 if self.use_sinusoid else 1 49 | self.fc1 = nn.Sequential( 50 | nn.Linear(in_features=128 * k, out_features=512), self.act_fn, nn.Dropout(dropout_vis_fc) 51 | ) # shape: [N, 512] 52 | self.fc2 = nn.Linear(in_features=512, out_features=visual_features) # shape: [N, 64] 53 | self.ln = nn.LayerNorm(visual_features) 54 | 55 | def forward(self, x: torch.Tensor) -> torch.Tensor: 56 | x = self.conv_model(x) 57 | x = self.spatial_softmax(x) 58 | if self.use_sinusoid: 59 | x = torch.cat([x, torch.sin(x), torch.cos(x)], 1) 60 | x = self.fc1(x) 61 | x = self.fc2(x) 62 | if self.l2_normalize_output: 63 | x = F.normalize(x, p=2, dim=1) 64 | x = self.ln(x) 65 | return x # shape: [N, 64] 66 | 67 | @staticmethod 68 | def calc_out_size(w: int, h: int, kernel_size: int, padding: int, stride: int) -> Tuple[int, int]: 69 | width = (w - kernel_size + 2 * padding) // stride + 1 70 | height = (h - kernel_size + 2 * padding) // stride + 1 71 | return width, height 72 | 73 | 74 | class SpatialSoftmax(nn.Module): 75 | def __init__(self, num_rows: int, num_cols: int, temperature: Optional[float] = None): 76 | """ 77 | Computes the spatial softmax of a convolutional feature map. 78 | Read more here: 79 | "Learning visual feature spaces for robotic manipulation with 80 | deep spatial autoencoders." Finn et al., http://arxiv.org/abs/1509.06113. 81 | :param num_rows: size related to original image width 82 | :param num_cols: size related to original image height 83 | :param temperature: Softmax temperature (optional). If None, a learnable temperature is created. 
84 | """ 85 | super(SpatialSoftmax, self).__init__() 86 | self.num_rows = num_rows 87 | self.num_cols = num_cols 88 | grid_x, grid_y = torch.meshgrid( 89 | torch.linspace(-1.0, 1.0, num_cols), torch.linspace(-1.0, 1.0, num_rows), indexing="ij" 90 | ) 91 | x_map = grid_x.reshape(-1) 92 | y_map = grid_y.reshape(-1) 93 | self.register_buffer("x_map", x_map) 94 | self.register_buffer("y_map", y_map) 95 | if temperature: 96 | self.register_buffer("temperature", torch.ones(1) * temperature) 97 | else: 98 | self.temperature = Parameter(torch.ones(1)) 99 | 100 | def forward(self, x: torch.Tensor) -> torch.Tensor: 101 | n, c, h, w = x.shape 102 | x = x.contiguous().view(-1, h * w) # batch, C, W*H 103 | softmax_attention = F.softmax(x / self.temperature, dim=1) # batch, C, W*H 104 | expected_x = torch.sum(self.x_map * softmax_attention, dim=1, keepdim=True) 105 | expected_y = torch.sum(self.y_map * softmax_attention, dim=1, keepdim=True) 106 | expected_xy = torch.cat((expected_x, expected_y), 1) 107 | self.coords = expected_xy.view(-1, c * 2) 108 | return self.coords # batch, C*2 109 | -------------------------------------------------------------------------------- /hulc/models/perceptual_encoders/vision_network_gripper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from typing import Tuple 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | 10 | def nature_cnn(act_fn, num_c): 11 | return nn.Sequential( 12 | nn.Conv2d(num_c, 32, 8, stride=4), 13 | act_fn, 14 | nn.Conv2d(32, 64, 4, stride=2), 15 | act_fn, 16 | nn.Conv2d(64, 64, 3, stride=1), 17 | act_fn, 18 | nn.Flatten(start_dim=1), 19 | nn.Linear(64 * 7 * 7, 128), 20 | act_fn, 21 | ) 22 | 23 | 24 | class VisionNetwork(nn.Module): 25 | def __init__( 26 | self, 27 | input_width: int, 28 | input_height: int, 29 | conv_encoder: str, 30 | activation_function: str, 31 | dropout_vis_fc: float, 32 | l2_normalize_output: bool, 33 | visual_features: int, 34 | num_c: int, 35 | ): 36 | super(VisionNetwork, self).__init__() 37 | self.l2_normalize_output = l2_normalize_output 38 | self.act_fn = getattr(nn, activation_function)() 39 | # model 40 | # this calls the method with the name conv_encoder 41 | self.conv_model = eval(conv_encoder) 42 | self.conv_model = self.conv_model(self.act_fn, num_c) 43 | self.fc1 = nn.Sequential( 44 | nn.Linear(in_features=128, out_features=512), self.act_fn, nn.Dropout(dropout_vis_fc) 45 | ) # shape: [N, 512] 46 | self.fc2 = nn.Linear(in_features=512, out_features=visual_features) # shape: [N, 64] 47 | self.ln = nn.LayerNorm(visual_features) 48 | 49 | def forward(self, x: torch.Tensor) -> torch.Tensor: 50 | x = self.conv_model(x) 51 | x = self.fc1(x) 52 | x = self.fc2(x) 53 | if self.l2_normalize_output: 54 | x = F.normalize(x, p=2, dim=1) 55 | x = self.ln(x) 56 | return x # shape: [N, 64] 57 | 58 | @staticmethod 59 | def calc_out_size(w: int, h: int, kernel_size: int, padding: int, stride: int) -> Tuple[int, int]: 60 | width = (w - kernel_size + 2 * padding) // stride + 1 61 | height = (h - kernel_size + 2 * padding) // stride + 1 62 | return width, height 63 | -------------------------------------------------------------------------------- /hulc/models/plan_encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/plan_encoders/__init__.py 
-------------------------------------------------------------------------------- /hulc/models/plan_encoders/plan_proposal_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import torch 3 | import torch.nn as nn 4 | 5 | from hulc.utils.distributions import Distribution, State 6 | 7 | 8 | class PlanProposalNetwork(nn.Module): 9 | def __init__( 10 | self, 11 | perceptual_features: int, 12 | latent_goal_features: int, 13 | plan_features: int, 14 | activation_function: str, 15 | hidden_size: int, 16 | dist: Distribution, 17 | ): 18 | super(PlanProposalNetwork, self).__init__() 19 | self.perceptual_features = perceptual_features 20 | self.latent_goal_features = latent_goal_features 21 | self.plan_features = plan_features 22 | self.hidden_size = hidden_size 23 | self.in_features = self.perceptual_features + self.latent_goal_features 24 | self.act_fn = getattr(nn, activation_function)() 25 | self.dist = dist 26 | self.fc_model = nn.Sequential( 27 | nn.Linear(in_features=self.in_features, out_features=hidden_size), # shape: [N, 136] 28 | # nn.BatchNorm1d(hidden_size), 29 | self.act_fn, 30 | nn.Linear(in_features=hidden_size, out_features=hidden_size), 31 | # nn.BatchNorm1d(hidden_size), 32 | self.act_fn, 33 | nn.Linear(in_features=hidden_size, out_features=hidden_size), 34 | # nn.BatchNorm1d(hidden_size), 35 | self.act_fn, 36 | nn.Linear(in_features=hidden_size, out_features=hidden_size), 37 | # nn.BatchNorm1d(hidden_size), 38 | self.act_fn, 39 | ) 40 | self.fc_state = self.dist.build_state(self.hidden_size, self.plan_features) 41 | 42 | def forward(self, initial_percep_emb: torch.Tensor, latent_goal: torch.Tensor) -> State: 43 | x = torch.cat([initial_percep_emb, latent_goal], dim=-1) 44 | x = self.fc_model(x) 45 | my_state = self.fc_state(x) 46 | state = self.dist.forward_dist(my_state) 47 | return state 48 | -------------------------------------------------------------------------------- /hulc/models/plan_encoders/plan_recognition_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import math 4 | from typing import Tuple 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from hulc.utils.distributions import Distribution, State 10 | 11 | 12 | class PlanRecognitionBiRNNNetwork(nn.Module): 13 | def __init__( 14 | self, 15 | in_features: int, 16 | plan_features: int, 17 | action_space: int, 18 | birnn_dropout_p: float, 19 | dist: Distribution, 20 | rnn_type: str, 21 | ): 22 | super(PlanRecognitionBiRNNNetwork, self).__init__() 23 | self.plan_features = plan_features 24 | self.action_space = action_space 25 | self.in_features = in_features 26 | self.dist = dist 27 | self.birnn_model = eval(rnn_type)( 28 | input_size=self.in_features, 29 | hidden_size=2048, 30 | num_layers=2, 31 | bidirectional=True, 32 | batch_first=True, 33 | dropout=birnn_dropout_p, 34 | ) # shape: [N, seq_len, feat] 35 | self.fc_state = self.dist.build_state(4096, self.plan_features) 36 | 37 | def forward(self, perceptual_emb: torch.Tensor) -> Tuple[State, torch.Tensor]: 38 | x, hn = self.birnn_model(perceptual_emb) 39 | x = x[:, -1] # we just need only last unit output 40 | my_state = self.fc_state(x) 41 | state = self.dist.forward_dist(my_state) 42 | return state, x 43 | 44 | 45 | class PlanRecognitionTransformersNetwork(nn.Module): 46 | def __init__( 47 | self, 48 | num_heads: int, 49 | num_layers: int, 50 | encoder_hidden_size: int, 51 | fc_hidden_size: int, 52 | plan_features: int, 
53 | in_features: int, 54 | action_space: int, 55 | encoder_normalize: bool, 56 | positional_normalize: bool, 57 | position_embedding: bool, 58 | max_position_embeddings: int, 59 | dropout_p: bool, 60 | dist: Distribution, 61 | ): 62 | 63 | super().__init__() 64 | self.in_features = in_features 65 | self.plan_features = plan_features 66 | self.action_space = action_space 67 | self.padding = False 68 | self.dist = dist 69 | self.hidden_size = fc_hidden_size 70 | self.position_embedding = position_embedding 71 | self.encoder_normalize = encoder_normalize 72 | self.positional_normalize = positional_normalize 73 | mod = self.in_features % num_heads 74 | if mod != 0: 75 | print(f"Padding for Num of Heads : {num_heads}") 76 | self.padding = True 77 | self.pad = num_heads - mod 78 | self.in_features += self.pad 79 | if position_embedding: 80 | self.position_embeddings = nn.Embedding(max_position_embeddings, self.in_features) 81 | else: 82 | self.positional_encoder = PositionalEncoding(self.in_features) # TODO: with max window_size 83 | encoder_layer = nn.TransformerEncoderLayer( 84 | self.in_features, num_heads, dim_feedforward=encoder_hidden_size, dropout=dropout_p 85 | ) 86 | encoder_norm = nn.LayerNorm(self.in_features) if encoder_normalize else None 87 | if self.positional_normalize: 88 | self.layernorm = nn.LayerNorm(self.in_features) 89 | self.dropout = nn.Dropout(p=dropout_p) 90 | self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers, norm=encoder_norm) 91 | self.fc = nn.Linear(in_features=self.in_features, out_features=fc_hidden_size) 92 | self.fc_state = self.dist.build_state(fc_hidden_size, self.plan_features) 93 | 94 | def forward(self, perceptual_emb: torch.Tensor) -> Tuple[State, torch.Tensor]: 95 | batch_size, seq_len = perceptual_emb.shape[0], perceptual_emb.shape[1] 96 | perceptual_emb = ( 97 | torch.cat([perceptual_emb, torch.zeros((batch_size, seq_len, self.pad)).to(perceptual_emb.device)], dim=-1) 98 | if self.padding 99 | else perceptual_emb 100 | ) 101 | if self.position_embedding: 102 | position_ids = torch.arange(seq_len, dtype=torch.long, device=perceptual_emb.device).unsqueeze(0) 103 | position_embeddings = self.position_embeddings(position_ids) 104 | x = perceptual_emb + position_embeddings 105 | x = x.permute(1, 0, 2) 106 | else: 107 | # padd the perceptual embeddig 108 | x = self.positional_encoder(perceptual_emb.permute(1, 0, 2)) # [s, b, emb] 109 | if self.positional_normalize: 110 | x = self.layernorm(x) 111 | x = self.dropout(x) 112 | x = self.transformer_encoder(x) 113 | x = self.fc(x.permute(1, 0, 2)) 114 | x = torch.mean(x, dim=1) # gather all the sequence info 115 | my_state = self.fc_state(x) 116 | state = self.dist.forward_dist(my_state) 117 | return state, x 118 | 119 | 120 | class PositionalEncoding(nn.Module): 121 | """Implementation from: https://pytorch.org/tutorials/beginner/transformer_tutorial.html""" 122 | 123 | def __init__(self, d_model, max_len=5000): 124 | super(PositionalEncoding, self).__init__() 125 | 126 | pe = torch.zeros(max_len, d_model) 127 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 128 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 129 | pe[:, 0::2] = torch.sin(position * div_term) 130 | pe[:, 1::2] = torch.cos(position * div_term) if d_model % 2 == 0 else torch.cos(position * div_term[:-1]) 131 | pe = pe.unsqueeze(0).transpose(0, 1) 132 | self.register_buffer("pe", pe) 133 | 134 | def forward(self, x): 135 | x = x + self.pe[: 
x.size(0), :] 136 | return x 137 | -------------------------------------------------------------------------------- /hulc/training.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | import logging 3 | from pathlib import Path 4 | import sys 5 | from typing import List, Union 6 | 7 | from lightning_lite.accelerators.cuda import num_cuda_devices 8 | from pytorch_lightning.strategies import DDPStrategy 9 | 10 | # This is for using the locally installed repo clone when using slurm 11 | sys.path.insert(0, Path(__file__).absolute().parents[1].as_posix()) 12 | from calvin_agent.utils.utils import get_git_commit_hash, get_last_checkpoint, print_system_env_info 13 | import hydra 14 | from omegaconf import DictConfig, ListConfig, OmegaConf 15 | from pytorch_lightning import Callback, LightningModule, seed_everything, Trainer 16 | from pytorch_lightning.callbacks import LearningRateMonitor 17 | from pytorch_lightning.loggers import Logger 18 | from pytorch_lightning.utilities import rank_zero_only 19 | 20 | import hulc 21 | import hulc.models.hulc as models_m 22 | from hulc.utils.utils import initialize_pretrained_weights 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | @hydra.main(config_path="../conf", config_name="config") 28 | def train(cfg: DictConfig) -> None: 29 | """ 30 | This is called to start a training. 31 | 32 | Args: 33 | cfg: hydra config 34 | """ 35 | # sets seeds for numpy, torch, python.random and PYTHONHASHSEED. 36 | seed_everything(cfg.seed, workers=True) # type: ignore 37 | datamodule = hydra.utils.instantiate(cfg.datamodule, training_repo_root=Path(hulc.__file__).parents[1]) 38 | chk = get_last_checkpoint(Path.cwd()) 39 | 40 | # Load Model 41 | if chk is not None: 42 | model = getattr(models_m, cfg.model["_target_"].split(".")[-1]).load_from_checkpoint(chk.as_posix()) 43 | else: 44 | model = hydra.utils.instantiate(cfg.model) 45 | if "pretrain_chk" in cfg: 46 | initialize_pretrained_weights(model, cfg) 47 | 48 | log_rank_0(f"Training with the following config:\n{OmegaConf.to_yaml(cfg)}") 49 | log_rank_0("Repo commit hash: {}".format(get_git_commit_hash(Path(hydra.utils.to_absolute_path(__file__))))) 50 | log_rank_0(print_system_env_info()) 51 | 52 | train_logger = setup_logger(cfg, model) 53 | callbacks = setup_callbacks(cfg.callbacks) 54 | lr_logger = LearningRateMonitor(logging_interval="step") 55 | callbacks.append(lr_logger) 56 | 57 | trainer_args = { 58 | **cfg.trainer, 59 | "logger": train_logger, 60 | "callbacks": callbacks, 61 | "benchmark": False, 62 | } 63 | 64 | # Configure multi-GPU training 65 | if is_multi_gpu_training(trainer_args["devices"]): 66 | # increase default timeout for loading data into shared memory 67 | trainer_args["strategy"] = DDPStrategy(find_unused_parameters=False, timeout=timedelta(seconds=3600)) 68 | if not cfg.slurm: 69 | modify_argv_hydra() 70 | 71 | trainer = Trainer(**trainer_args) 72 | 73 | # Start training 74 | trainer.fit(model, datamodule=datamodule, ckpt_path=chk) # type: ignore 75 | 76 | 77 | def setup_callbacks(callbacks_cfg: DictConfig) -> List[Callback]: 78 | """ 79 | Instantiate all training callbacks. 80 | 81 | Args: 82 | callbacks_cfg: DictConfig with all callback params 83 | 84 | Returns: 85 | List of instantiated callbacks. 
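Example (illustrative): a callbacks config with entries such as `checkpoint`, `rollout` and `kl_schedule` yields one instantiated Callback per entry, in the order the entries appear in the config.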
86 | """ 87 | callbacks = [hydra.utils.instantiate(cb) for cb in callbacks_cfg.values()] 88 | return callbacks 89 | 90 | 91 | def setup_logger(cfg: DictConfig, model: LightningModule) -> Logger: 92 | """ 93 | Set up the logger (tensorboard or wandb) from hydra config. 94 | 95 | Args: 96 | cfg: Hydra config 97 | model: LightningModule 98 | 99 | Returns: 100 | logger 101 | """ 102 | pathlib_cwd = Path.cwd() 103 | if "group" in cfg.logger: 104 | cfg.logger.group = pathlib_cwd.parent.name 105 | cfg.logger.name = pathlib_cwd.parent.name + "/" + pathlib_cwd.name 106 | cfg.logger.id = cfg.logger.name.replace("/", "_") 107 | train_logger = hydra.utils.instantiate(cfg.logger) 108 | # train_logger.watch(model) 109 | else: 110 | train_logger = hydra.utils.instantiate(cfg.logger) 111 | return train_logger 112 | 113 | 114 | def modify_argv_hydra() -> None: 115 | """ 116 | To make hydra work with pytorch-lightning and ddp, we modify sys.argv for the child processes spawned with ddp. 117 | This is only used when NOT using slurm. 118 | """ 119 | cwd = Path.cwd().as_posix() 120 | cwd = f'"{cwd}"' 121 | sys.argv = sys.argv[:1] 122 | sys.argv.extend( 123 | [ 124 | f"hydra.run.dir={cwd}", 125 | "hydra/hydra_logging=disabled", 126 | "hydra/job_logging=disabled", 127 | ] 128 | ) 129 | overrides = OmegaConf.load(".hydra/overrides.yaml") 130 | for o in overrides: 131 | if "hydra/sweeper" in o: # type: ignore 132 | continue 133 | 134 | if "hydra/launcher" in o: # type: ignore 135 | continue 136 | 137 | sys.argv.append(o) # type: ignore 138 | 139 | 140 | def is_multi_gpu_training(devices: Union[int, str, ListConfig]) -> bool: 141 | """ 142 | Check if training on multiple GPUs. 143 | See https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html#devices 144 | 145 | Args: 146 | devices: int, str or ListConfig specifying devices 147 | 148 | Returns: 149 | True if multi-gpu training (ddp), False otherwise. 
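Example (illustrative): devices=4 -> True; devices=1 -> False; devices="auto" -> True only when more than one CUDA device is visible; a two-element ListConfig such as [0, 1] (as parsed from the Hydra config) -> True.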
150 | """ 151 | num_gpu_available = num_cuda_devices() 152 | if isinstance(devices, int): 153 | return devices > 1 or (devices == -1 and num_gpu_available > 1) 154 | elif isinstance(devices, str) and devices == "auto": 155 | return num_gpu_available > 1 156 | elif isinstance(devices, str): 157 | return len(devices) > 1 158 | elif isinstance(devices, ListConfig): 159 | return len(devices) > 1 160 | else: 161 | raise ValueError 162 | 163 | 164 | @rank_zero_only 165 | def log_rank_0(*args, **kwargs): 166 | # when using ddp, only log with rank 0 process 167 | logger.info(*args, **kwargs) 168 | 169 | 170 | if __name__ == "__main__": 171 | train() 172 | -------------------------------------------------------------------------------- /hulc/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/utils/__init__.py -------------------------------------------------------------------------------- /hulc/utils/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/utils/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /hulc/utils/clip_tokenizer.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | import gzip 3 | import html 4 | import os 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode(): 17 | """ 18 | Returns list of utf-8 byte and a corresponding list of unicode strings. 19 | The reversible bpe codes work on unicode strings. 20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 22 | This is a signficant percentage of your normal, say, 32K bpe vocab. 23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 24 | And avoids mapping to whitespace/control characters the bpe code barfs on. 25 | """ 26 | bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) 27 | cs = bs[:] 28 | n = 0 29 | for b in range(2 ** 8): 30 | if b not in bs: 31 | bs.append(b) 32 | cs.append(2 ** 8 + n) 33 | n += 1 34 | cs = [chr(n) for n in cs] 35 | return dict(zip(bs, cs)) 36 | 37 | 38 | def get_pairs(word): 39 | """Return set of symbol pairs in a word. 40 | Word is represented as tuple of symbols (symbols being variable-length strings). 
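Example (illustrative): get_pairs(("l", "o", "w", "e", "r")) returns {("l", "o"), ("o", "w"), ("w", "e"), ("e", "r")}.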
41 | """ 42 | pairs = set() 43 | prev_char = word[0] 44 | for char in word[1:]: 45 | pairs.add((prev_char, char)) 46 | prev_char = char 47 | return pairs 48 | 49 | 50 | def basic_clean(text): 51 | text = ftfy.fix_text(text) 52 | text = html.unescape(html.unescape(text)) 53 | return text.strip() 54 | 55 | 56 | def whitespace_clean(text): 57 | text = re.sub(r"\s+", " ", text) 58 | text = text.strip() 59 | return text 60 | 61 | 62 | class SimpleTokenizer(object): 63 | def __init__(self, bpe_path: str = default_bpe()): 64 | self.byte_encoder = bytes_to_unicode() 65 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 66 | merges = gzip.open(bpe_path).read().decode("utf-8").split("\n") 67 | merges = merges[1 : 49152 - 256 - 2 + 1] 68 | merges = [tuple(merge.split()) for merge in merges] # type:ignore 69 | vocab = list(bytes_to_unicode().values()) 70 | vocab = vocab + [v + "" for v in vocab] 71 | for merge in merges: 72 | vocab.append("".join(merge)) 73 | vocab.extend(["<|startoftext|>", "<|endoftext|>"]) 74 | self.encoder = dict(zip(vocab, range(len(vocab)))) 75 | self.decoder = {v: k for k, v in self.encoder.items()} 76 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 77 | self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"} 78 | self.pat = re.compile( 79 | r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", 80 | re.IGNORECASE, 81 | ) 82 | 83 | def bpe(self, token): 84 | if token in self.cache: 85 | return self.cache[token] 86 | word = tuple(token[:-1]) + (token[-1] + "",) 87 | pairs = get_pairs(word) 88 | 89 | if not pairs: 90 | return token + "" 91 | 92 | while True: 93 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) 94 | if bigram not in self.bpe_ranks: 95 | break 96 | first, second = bigram 97 | new_word = [] 98 | i = 0 99 | while i < len(word): 100 | try: 101 | j = word.index(first, i) 102 | new_word.extend(word[i:j]) 103 | i = j 104 | except Exception as ex: 105 | new_word.extend(word[i:]) 106 | print(ex.message, ex.args) 107 | break 108 | 109 | if word[i] == first and i < len(word) - 1 and word[i + 1] == second: 110 | new_word.append(first + second) 111 | i += 2 112 | else: 113 | new_word.append(word[i]) 114 | i += 1 115 | new_word = tuple(new_word) 116 | word = new_word 117 | if len(word) == 1: 118 | break 119 | else: 120 | pairs = get_pairs(word) 121 | word = " ".join(word) 122 | self.cache[token] = word 123 | return word 124 | 125 | def encode(self, text): 126 | bpe_tokens = [] 127 | text = whitespace_clean(basic_clean(text)).lower() 128 | for token in re.findall(self.pat, text): 129 | token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) 130 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")) 131 | return bpe_tokens 132 | 133 | def decode(self, tokens): 134 | text = "".join([self.decoder[token] for token in tokens]) 135 | text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors="replace").replace("", " ") 136 | return text 137 | -------------------------------------------------------------------------------- /hulc/utils/distributions.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from typing import Union 3 | 4 | import torch 5 | from torch.distributions import Independent, Normal, OneHotCategoricalStraightThrough # type: ignore 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | 
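# A minimal usage sketch for the Distribution helper defined below (the sizes here
# are hypothetical, not the repository defaults):
#
#   dist = Distribution(dist="discrete", category_size=32, class_size=8)
#   fc_state = dist.build_state(hidden_size=2048, plan_features=256)  # 256 = 32 * 8
#   state = dist.forward_dist(fc_state(torch.randn(4, 2048)))         # DiscState with logits of shape [4, 256]
#   plan = dist.sample_latent_plan(dist.get_dist(state))              # flattened straight-through one-hot sample, shape [4, 256]
#
# With dist="continuous", build_state outputs 2 * plan_features values, which
# forward_dist chunks into the mean and softplus-std of a diagonal Gaussian.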
DiscState = namedtuple("DiscState", ["logit"]) 10 | ContState = namedtuple("ContState", ["mean", "std"]) 11 | 12 | State = Union[DiscState, ContState] 13 | 14 | 15 | class Distribution: 16 | def __init__(self, **kwargs): 17 | self.dist = kwargs.get("dist") 18 | assert self.dist == "discrete" or self.dist == "continuous" 19 | if self.dist == "discrete": 20 | self.category_size = kwargs.get("category_size") 21 | self.class_size = kwargs.get("class_size") 22 | 23 | def get_dist(self, state): 24 | if self.dist == "discrete": 25 | shape = state.logit.shape 26 | logits = torch.reshape(state.logit, shape=(*shape[:-1], self.category_size, self.class_size)) 27 | return Independent(OneHotCategoricalStraightThrough(logits=logits), 1) 28 | elif self.dist == "continuous": 29 | return Independent(Normal(state.mean, state.std), 1) 30 | 31 | def detach_state(self, state): 32 | if self.dist == "discrete": 33 | return DiscState(state.logit.detach()) 34 | elif self.dist == "continuous": 35 | return ContState(state.mean.detach(), state.std.detach()) 36 | 37 | def sample_latent_plan(self, distribution): 38 | sampled_plan = distribution.sample() 39 | if self.dist == "discrete": 40 | sampled_plan = torch.flatten(sampled_plan, start_dim=-2, end_dim=-1) 41 | return sampled_plan 42 | 43 | def build_state(self, hidden_size, plan_features): 44 | fc_state = [] 45 | if self.dist == "discrete": 46 | fc_state += [nn.Linear(hidden_size, plan_features)] 47 | elif self.dist == "continuous": 48 | fc_state += [nn.Linear(hidden_size, 2 * plan_features)] 49 | return nn.Sequential(*fc_state) 50 | 51 | def forward_dist(self, x): 52 | if self.dist == "discrete": 53 | prior_logit = x 54 | state = DiscState(prior_logit) # type: State 55 | elif self.dist == "continuous": 56 | mean, var = torch.chunk(x, 2, dim=-1) 57 | min_std = 0.0001 58 | std = F.softplus(var) + min_std 59 | state = ContState(mean, std) 60 | return state 61 | -------------------------------------------------------------------------------- /hulc/utils/kl_callbacks.py: -------------------------------------------------------------------------------- 1 | from pytorch_lightning import Callback, LightningModule, Trainer 2 | import torch 3 | 4 | 5 | def sigmoid(scale: float, shift: float, x: int) -> float: 6 | return torch.sigmoid(torch.Tensor([(x - shift) / (scale / 12)])).item() 7 | 8 | 9 | class KLSchedule(Callback): 10 | """ 11 | Base class for KL Annealing 12 | """ 13 | 14 | def __init__(self, start_epoch: int, end_epoch: int, max_kl_beta: float): 15 | self.start_epoch = start_epoch 16 | self.end_epoch = end_epoch 17 | self.max_kl_beta = max_kl_beta 18 | 19 | def on_train_epoch_start(self, trainer: Trainer, pl_module: LightningModule) -> None: 20 | epoch = pl_module.current_epoch 21 | kl_beta = self._anneal_fn(epoch) 22 | pl_module.set_kl_beta(kl_beta) # type: ignore 23 | 24 | def _anneal_fn(self, epoch): 25 | raise NotImplementedError 26 | 27 | 28 | class KLConstantSchedule(KLSchedule): 29 | def __init__(self): 30 | pass 31 | 32 | def on_train_epoch_start(self, trainer: Trainer, pl_module: LightningModule) -> None: 33 | pass 34 | 35 | def _anneal_fn(self, epoch: int) -> None: 36 | pass 37 | 38 | 39 | class KLSigmoidSchedule(KLSchedule): 40 | def _anneal_fn(self, epoch: int) -> float: 41 | if epoch < self.start_epoch: 42 | kl_beta = 0.0 43 | elif epoch > self.end_epoch: 44 | kl_beta = self.max_kl_beta 45 | else: 46 | scale = self.end_epoch - self.start_epoch 47 | shift = (self.end_epoch + self.start_epoch) / 2 48 | kl_beta = sigmoid(scale=scale, shift=shift, x=epoch) 
* self.max_kl_beta 49 | return kl_beta 50 | 51 | 52 | class KLLinearSchedule(KLSchedule): 53 | def _anneal_fn(self, epoch: int) -> float: 54 | if epoch < self.start_epoch: 55 | kl_beta = 0.0 56 | elif epoch > self.end_epoch: 57 | kl_beta = self.max_kl_beta 58 | else: 59 | kl_beta = self.max_kl_beta * (epoch - self.start_epoch) / (self.end_epoch - self.start_epoch) 60 | return kl_beta 61 | 62 | 63 | if __name__ == "__main__": 64 | import matplotlib 65 | import matplotlib.pyplot as plt 66 | 67 | matplotlib.use("TkAgg") 68 | import numpy as np 69 | 70 | kl = KLLinearSchedule(10, 50, 0.1) 71 | x = np.arange(200) 72 | y = [kl._anneal_fn(i) for i in x] 73 | plt.plot(x, y) 74 | 75 | kl2 = KLSigmoidSchedule(10, 50, 0.1) 76 | x = np.arange(200) 77 | y = [kl2._anneal_fn(i) for i in x] 78 | plt.plot(x, y) 79 | 80 | plt.show() 81 | -------------------------------------------------------------------------------- /hulc/utils/transforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | # source: https://github.com/facebookresearch/drqv2/blob/main/drqv2.py 8 | class RandomShiftsAug(nn.Module): 9 | def __init__(self, pad): 10 | super().__init__() 11 | self.pad = pad 12 | 13 | def forward(self, x): 14 | x = x.float() 15 | n, c, h, w = x.size() 16 | assert h == w 17 | padding = tuple([self.pad] * 4) 18 | x = F.pad(x, padding, "replicate") 19 | eps = 1.0 / (h + 2 * self.pad) 20 | arange = torch.linspace(-1.0 + eps, 1.0 - eps, h + 2 * self.pad, device=x.device, dtype=x.dtype)[:h] 21 | arange = arange.unsqueeze(0).repeat(h, 1).unsqueeze(2) 22 | base_grid = torch.cat([arange, arange.transpose(1, 0)], dim=2) 23 | base_grid = base_grid.unsqueeze(0).repeat(n, 1, 1, 1) 24 | 25 | shift = torch.randint(0, 2 * self.pad + 1, size=(n, 1, 1, 2), device=x.device, dtype=x.dtype) 26 | shift *= 2.0 / (h + 2 * self.pad) 27 | 28 | grid = base_grid + shift 29 | return F.grid_sample(x, grid, padding_mode="zeros", align_corners=False) 30 | 31 | 32 | class RelativeActions(object): 33 | """Transform absolute actions to relative""" 34 | 35 | def __init__(self, max_pos, max_orn): 36 | self.max_pos = max_pos 37 | self.max_orn = max_orn 38 | 39 | @staticmethod 40 | def batch_angle_between(a, b): 41 | diff = b - a 42 | return (diff + np.pi) % (2 * np.pi) - np.pi 43 | 44 | def __call__(self, action_and_obs): 45 | actions, robot_obs = action_and_obs 46 | assert isinstance(actions, np.ndarray) 47 | assert isinstance(robot_obs, np.ndarray) 48 | 49 | rel_pos = actions[:, :3] - robot_obs[:, :3] 50 | rel_pos = np.clip(rel_pos, -self.max_pos, self.max_pos) / self.max_pos 51 | 52 | rel_orn = self.batch_angle_between(robot_obs[:, 3:6], actions[:, 3:6]) 53 | rel_orn = np.clip(rel_orn, -self.max_orn, self.max_orn) / self.max_orn 54 | 55 | gripper = actions[:, -1:] 56 | return np.concatenate([rel_pos, rel_orn, gripper], axis=1) 57 | 58 | def __repr__(self): 59 | return self.__class__.__name__ + f"(max_pos={self.max_pos}, max_orn={self.max_orn})" 60 | -------------------------------------------------------------------------------- /hulc/utils/utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from calvin_agent.utils.utils import format_sftp_path 4 | from pytorch_lightning.utilities.cloud_io import load as pl_load 5 | 6 | 7 | def initialize_pretrained_weights(model, cfg): 8 | pretrain_chk = 
pl_load(format_sftp_path(Path(cfg.pretrain_chk)), map_location=lambda storage, loc: storage) 9 | batch_size = model.plan_recognition.position_embeddings.weight.shape[0] 10 | weight = "plan_recognition.position_embeddings.weight" 11 | pretrain_chk["state_dict"][weight] = pretrain_chk["state_dict"][weight][:batch_size] 12 | if "pretrain_exclude_pr" in cfg and cfg.pretrain_exclude_pr: 13 | for key in list(pretrain_chk["state_dict"].keys()): 14 | if key.startswith("plan_recognition"): 15 | del pretrain_chk["state_dict"][key] 16 | model.load_state_dict(pretrain_chk["state_dict"], strict=False) 17 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd calvin_env/tacto 4 | pip install -e . 5 | cd .. 6 | pip install -e . 7 | cd .. 8 | pip install -e . 9 | -------------------------------------------------------------------------------- /media/hulc_rollout.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/media/hulc_rollout.gif -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | # https://github.com/psf/black 3 | line-length = 120 4 | target-version = ["py38"] 5 | exclude = "(.eggs|.git|.hg|.mypy_cache|.nox|.tox|.venv|.svn|_build|buck-out|build|dist)" 6 | 7 | [tool.isort] 8 | profile = "black" 9 | line_length = 120 10 | force_sort_within_sections = "True" 11 | order_by_type = "False" 12 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | black 2 | flake8 3 | isort 4 | pre-commit 5 | mypy 6 | pytest 7 | pytest-cov 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cmake 2 | wheel 3 | numpy>1.2 4 | hydra-core==1.1.1 5 | hydra-colorlog 6 | matplotlib 7 | opencv-python 8 | omegaconf 9 | plotly 10 | ftfy 11 | pytorch-lightning==1.8.6 12 | lightning_lite 13 | torch==1.13.1 14 | torchvision 15 | MulticoreTSNE 16 | gitpython 17 | scipy 18 | sentence-transformers 19 | gym 20 | moviepy 21 | tqdm 22 | termcolor 23 | wandb 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Setup hulc installation.""" 4 | 5 | from os import path as op 6 | import re 7 | 8 | from setuptools import find_packages, setup 9 | 10 | 11 | def _read(f): 12 | return open(op.join(op.dirname(__file__), f)).read() if op.exists(f) else "" 13 | 14 | 15 | _meta = _read("hulc/__init__.py") 16 | 17 | 18 | def find_meta(_meta, string): 19 | l_match = re.search(r"^" + string + r'\s*=\s*"(.*)"', _meta, re.M) 20 | if l_match: 21 | return l_match.group(1) 22 | raise RuntimeError(f"Unable to find {string} string.") 23 | 24 | 25 | install_requires = [ 26 | l for l in _read("requirements.txt").split("\n") if l and not l.startswith("#") and not l.startswith("-") 27 | ] 28 | 29 | meta = dict( 30 | name=find_meta(_meta, "__project__"), 31 | version=find_meta(_meta, "__version__"), 32 | 
license=find_meta(_meta, "__license__"), 33 | description="Hierarchical Universal Language Conditioned Policies", 34 | platforms=("Any"), 35 | zip_safe=False, 36 | keywords="pytorch hulc".split(), 37 | author=find_meta(_meta, "__author__"), 38 | author_email=find_meta(_meta, "__email__"), 39 | url=" https://github.com/mees/hulc", 40 | packages=find_packages(exclude=["tests"]), 41 | install_requires=install_requires, 42 | ) 43 | 44 | if __name__ == "__main__": 45 | print("find_package", find_packages(exclude=["tests"])) 46 | setup(**meta) 47 | -------------------------------------------------------------------------------- /setup_local.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Setup hulc installation.""" 4 | 5 | from os import path as op 6 | import re 7 | 8 | from setuptools import find_packages, setup 9 | 10 | 11 | def _read(f): 12 | return open(op.join(op.dirname(__file__), f)).read() if op.exists(f) else "" 13 | 14 | 15 | _meta = _read("hulc/__init__.py") 16 | 17 | 18 | def find_meta(_meta, string): 19 | l_match = re.search(r"^" + string + r'\s*=\s*"(.*)"', _meta, re.M) 20 | if l_match: 21 | return l_match.group(1) 22 | raise RuntimeError(f"Unable to find {string} string.") 23 | 24 | 25 | meta = dict( 26 | name=find_meta(_meta, "__project__"), 27 | version=find_meta(_meta, "__version__"), 28 | license=find_meta(_meta, "__license__"), 29 | description="Hierarchical Universal Language Conditioned Policies", 30 | platforms=("Any"), 31 | zip_safe=False, 32 | keywords="pytorch Lfp".split(), 33 | author=find_meta(_meta, "__author__"), 34 | author_email=find_meta(_meta, "__email__"), 35 | url=" https://github.com/mees/hulc", 36 | packages=find_packages(exclude=["tests"]), 37 | ) 38 | 39 | if __name__ == "__main__": 40 | print("find_package", find_packages(exclude=["tests"])) 41 | setup(**meta) 42 | -------------------------------------------------------------------------------- /slurm_scripts/README.md: -------------------------------------------------------------------------------- 1 | ## Training CALVIN on a Slurm Cluster 2 | ### Starting a training 3 | ```bash 4 | $ cd $HULC_ROOT/slurm_scripts 5 | $ python slurm_training.py --venv hulc_venv datamodule.root_data_dir=/path/to/dataset/ 6 | ``` 7 | This assumes that `--venv hulc_venv` specifies a conda environment. 8 | To use virtualenv instead, change line 18 of sbatch_lfp.sh accordingly. 9 | 10 | All hydra arguments can be used as in the normal training. 11 | 12 | Use the following optional command line arguments for slurm: 13 | - `--log_dir`: slurm log directory 14 | - `--job_name`: slurm job name 15 | - `--gpus`: number of gpus 16 | - `--mem`: memory 17 | - `--cpus`: number of cpus 18 | - `--days`: time limit in days 19 | - `--partition`: name of slurm partition 20 | 21 | The script will create a new folder in the specified log dir with a date tag and the job name. 22 | This is done *before* the job is submitted to the slurm queue. 23 | In order to ensure reproducibility, the current state of the calvin repository 24 | is copied to the log directory at *submit time* and is 25 | locally installed, such that you can schedule multiple trainings and there is no interference with 26 | future changes to the repository. 27 | 28 | ### Resuming a training 29 | Every job submission creates a `resume_training.sh` script in the log folder. To resume a training, 30 | call `$ sh /resume_training.sh`. By default, the model loads the latest saved checkpoint. 
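For example (hypothetical log path, following the naming scheme described above):
```bash
$ sh /home/$USER/logs/2022-03-01/10-30-00_play_training/resume_training.sh
```
The generated script simply re-runs the original `sbatch` command with the same Hydra overrides, so the job continues in the same log directory.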
31 | 32 | ### Evaluating a model 33 | To evaluate a trained model via slurm, run `$ sh /evaluate.sh`, which will automatically place a job on the 34 | same partition as it was trained on. Note that this script is also autogenerated. 35 | -------------------------------------------------------------------------------- /slurm_scripts/sbatch_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Print some information about the job to STDOUT 3 | echo "Workingdir: $PWD"; 4 | echo "Started at $(date)"; 5 | echo "Running job $SLURM_JOB_NAME"; 6 | echo "cpus per node: $SLURM_JOB_CPUS_PER_NODE"; 7 | echo "gres: $SLURM_GRES"; 8 | echo "mem: $SLURM_MEM_PER_NODE"; 9 | echo "ntasks: $SLURM_NTASKS"; 10 | echo "JID $SLURM_JOB_ID on queue $SLURM_JOB_PARTITION"; 11 | 12 | export NCCL_DEBUG=INFO 13 | export PYTHONFAULTHANDLER=1 14 | export HYDRA_FULL_ERROR=1 15 | 16 | # Job to perform 17 | source ~/.bashrc 18 | conda activate $1 19 | srun python ${@:2} 20 | 21 | # Print some Information about the end-time to STDOUT 22 | echo "DONE"; 23 | echo "Finished at $(date)"; 24 | -------------------------------------------------------------------------------- /slurm_scripts/sbatch_lfp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Print some information about the job to STDOUT 3 | echo "Workingdir: $PWD"; 4 | echo "Started at $(date)"; 5 | echo "Running job $SLURM_JOB_NAME"; 6 | echo "cpus per node: $SLURM_JOB_CPUS_PER_NODE"; 7 | echo "gres: $SLURM_GRES"; 8 | echo "mem: $SLURM_MEM_PER_NODE"; 9 | echo "ntasks: $SLURM_NTASKS"; 10 | echo "JID $SLURM_JOB_ID on queue $SLURM_JOB_PARTITION"; 11 | 12 | export NCCL_DEBUG=INFO 13 | export PYTHONFAULTHANDLER=1 14 | export HYDRA_FULL_ERROR=1 15 | 16 | # Job to perform 17 | source ~/.bashrc 18 | conda activate $1 19 | timeout 23.8h srun python $3 slurm=true hydra.run.dir=$4 trainer.devices=$5 ${@:6} 20 | 21 | if [[ $? -eq 124 ]]; then 22 | echo "Time limit exceeded. 
Resubmit job."; 23 | ssh ${USER}@$2 < 2 else np.inf 16 | 17 | checkpoints = get_all_checkpoints(training_dir) 18 | epochs = [str(e) for chk in checkpoints if (e := int(chk.stem.split("=")[1])) <= max_epoch] 19 | split_epochs = np.array_split(epochs, 8) 20 | epoch_args = [",".join(arr) for arr in split_epochs if len(arr)] 21 | for epoch_arg in epoch_args: 22 | cmd = [(training_dir / "evaluate.sh").as_posix(), "--checkpoints", epoch_arg, "--eval_log_dir", eval_log_dir] 23 | output = subprocess.check_output(cmd) 24 | print(output.decode("utf-8")) 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /slurm_scripts/slurm_training.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import os 4 | from pathlib import Path 5 | import stat 6 | import subprocess 7 | 8 | from git import Repo 9 | import numpy as np 10 | from setuptools import sandbox 11 | 12 | default_log_dir = f"/home/{os.environ['USER']}/logs" if "USER" in os.environ else "/tmp" 13 | if default_log_dir == "/tmp": 14 | print("CAUTION: logging to /tmp") 15 | parser = argparse.ArgumentParser(description="Parse slurm parameters and hydra config overrides") 16 | 17 | parser.add_argument("--script", type=str, default="./sbatch_lfp.sh") 18 | parser.add_argument("--train_file", type=str, default="../hulc/training.py") 19 | parser.add_argument("-l", "--log_dir", type=str, default=default_log_dir) 20 | parser.add_argument("-j", "--job_name", type=str, default="play_training") 21 | parser.add_argument("-g", "--gpus", type=int, default=1) 22 | parser.add_argument("--mem", type=int, default=0) # 0 means no memory limit 23 | parser.add_argument("--cpus", type=int, default=8) 24 | parser.add_argument("--days", type=int, default=1) 25 | parser.add_argument("-v", "--venv", type=str) 26 | parser.add_argument("-p", "--partition", type=str, default="alldlc_gpu-rtx2080") 27 | parser.add_argument("--login_node", type=str, default="kis3bat1") 28 | parser.add_argument("-x", "--exclude", type=str) 29 | parser.add_argument("--no_clone", action="store_true") 30 | args, unknownargs = parser.parse_known_args() 31 | 32 | 33 | assert np.all(["gpu" not in arg for arg in unknownargs]) 34 | assert np.all(["hydra.run.dir" not in arg for arg in unknownargs]) 35 | assert np.all(["log_dir" not in arg for arg in unknownargs]) 36 | assert np.all(["hydra.sweep.dir" not in arg for arg in unknownargs]) 37 | 38 | log_dir = Path(args.log_dir).absolute() / f'{datetime.datetime.now().strftime("%Y-%m-%d/%H-%M-%S")}_{args.job_name}' 39 | os.makedirs(log_dir) 40 | args.script = Path(args.script).absolute() 41 | args.train_file = Path(args.train_file).absolute() 42 | 43 | 44 | def create_git_copy(repo_src_dir, repo_target_dir): 45 | repo = Repo(repo_src_dir) 46 | repo.clone(repo_target_dir) 47 | orig_cwd = os.getcwd() 48 | os.chdir(repo_target_dir) 49 | os.environ["PYTHONPATH"] = os.getcwd() + ":" + os.environ.get("PYTHONPATH", "") 50 | sandbox.run_setup("setup_local.py", ["develop", "--install-dir", "."]) 51 | os.chdir(orig_cwd) 52 | 53 | 54 | if not args.no_clone: 55 | repo_src_dir = Path(__file__).absolute().parents[1] 56 | repo_target_dir = log_dir / "hulc" 57 | create_git_copy(repo_src_dir, repo_target_dir) 58 | 59 | args.script = repo_target_dir / os.path.relpath(args.script, repo_src_dir) 60 | args.train_file = repo_target_dir / os.path.relpath(args.train_file, repo_src_dir) 61 | 62 | if args.partition == "test": 63 
| args.partition = "testdlc_gpu-rtx2080" 64 | 65 | args.time = f"{args.days}-00:00" 66 | if args.partition == "testdlc_gpu-rtx2080": 67 | args.time = "01:00:00" 68 | 69 | job_opts = { 70 | "script": f"{args.script.as_posix()} {args.venv} {args.login_node} {args.train_file.as_posix()} {log_dir.as_posix()} {args.gpus} {' '.join(unknownargs)}", 71 | "partition": args.partition, 72 | "mem": args.mem, 73 | "ntasks-per-node": args.gpus, 74 | "cpus-per-task": args.cpus, 75 | "gres": f"gpu:{args.gpus}", 76 | "output": os.path.join(log_dir, "%x.%N.%j.out"), 77 | "error": os.path.join(log_dir, "%x.%N.%j.err"), 78 | "job-name": args.job_name, 79 | "mail-type": "END,FAIL", 80 | "time": args.time, 81 | } 82 | 83 | if args.exclude is not None: 84 | job_opts["exclude"] = ",".join(map(lambda x: f"dlcgpu{int(x):02d}", args.exclude.split(","))) 85 | 86 | 87 | def submit_job(job_info): 88 | # Construct sbatch command 89 | slurm_cmd = ["sbatch"] 90 | for key, value in job_info.items(): 91 | # Check for special case keys 92 | if key == "script": 93 | continue 94 | slurm_cmd.append(f"--{key}={value}") 95 | slurm_cmd.append(job_info["script"]) 96 | print("Generated slurm batch command: '%s'" % slurm_cmd) 97 | 98 | # Run sbatch command as subprocess. 99 | try: 100 | sbatch_output = subprocess.check_output(slurm_cmd) 101 | create_resume_script(slurm_cmd) 102 | except subprocess.CalledProcessError as e: 103 | # Print error message from sbatch for easier debugging, then pass on exception 104 | if sbatch_output is not None: 105 | print("ERROR: Subprocess call output: %s" % sbatch_output) 106 | raise e 107 | 108 | print(sbatch_output.decode("utf-8")) 109 | 110 | 111 | def create_resume_script(slurm_cmd): 112 | file_path = os.path.join(log_dir, "resume_training.sh") 113 | with open(file_path, "w") as file: 114 | file.write("#!/bin/bash\n") 115 | file.write(" ".join(slurm_cmd)) 116 | st = os.stat(file_path) 117 | os.chmod(file_path, st.st_mode | stat.S_IEXEC) 118 | 119 | 120 | def create_eval_script(): 121 | # Construct sbatch command 122 | eval_log_dir = log_dir / "evaluation" 123 | os.makedirs(eval_log_dir, exist_ok=True) 124 | eval_sbatch_script = Path("./sbatch_eval.sh").absolute() 125 | eval_file = args.train_file.parent / "evaluation/evaluate_policy.py" 126 | 127 | dataset_path = next(filter(lambda x: x.split("=")[0] == "datamodule.root_data_dir", unknownargs)).split("=")[1] 128 | 129 | eval_cmd = ["sbatch"] 130 | eval_job_opts = { 131 | "partition": args.partition, 132 | "mem": args.mem, 133 | "ntasks-per-node": 1, 134 | "cpus-per-task": 8, 135 | "gres": "gpu:1", 136 | "output": os.path.join(eval_log_dir, "%x.%N.%j.out"), 137 | "error": os.path.join(eval_log_dir, "%x.%N.%j.err"), 138 | "job-name": f"{args.job_name}_eval", 139 | "mail-type": "END,FAIL", 140 | "time": "1-00:00", 141 | } 142 | for key, value in eval_job_opts.items(): 143 | eval_cmd.append(f"--{key}={value}") 144 | eval_args = f"{eval_sbatch_script.as_posix()} {args.venv} {eval_file.as_posix()}" 145 | eval_args += f" --dataset_path {dataset_path}" 146 | eval_args += f" --train_folder {log_dir}" 147 | eval_args += " ${@:1}" 148 | eval_cmd.append(eval_args) 149 | 150 | file_path = os.path.join(log_dir, "evaluate.sh") 151 | with open(file_path, "w") as file: 152 | file.write("#!/bin/bash\n") 153 | file.write(" ".join(eval_cmd)) 154 | st = os.stat(file_path) 155 | os.chmod(file_path, st.st_mode | stat.S_IEXEC) 156 | 157 | 158 | submit_job(job_opts) 159 | create_eval_script() 160 | 
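# Example invocation (illustrative; the venv name, dataset path and partition are
# placeholders for your own cluster setup):
#
#   python slurm_training.py --venv hulc_venv -g 4 -p alldlc_gpu-rtx2080 \
#       datamodule.root_data_dir=/path/to/dataset
#
# This clones the repository into the dated log directory, submits the training job
# via sbatch, and writes resume_training.sh and evaluate.sh next to the logs.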
--------------------------------------------------------------------------------