├── .flake8 ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── checkpoints └── download_model_weights.sh ├── conf ├── __init__.py ├── annotations │ ├── new_playtable.yaml │ └── new_playtable_validation.yaml ├── callbacks │ ├── checkpoint │ │ ├── all.yaml │ │ ├── clip_loss.yaml │ │ ├── kl.yaml │ │ ├── lh_sr.yaml │ │ ├── state_recon.yaml │ │ ├── task_sr.yaml │ │ └── val_action.yaml │ ├── default.yaml │ ├── kl_schedule │ │ ├── constant.yaml │ │ ├── linear.yaml │ │ └── sigmoid.yaml │ ├── rollout │ │ ├── default.yaml │ │ └── tasks │ │ │ └── new_playtable_tasks.yaml │ ├── rollout_lh │ │ └── default.yaml │ ├── shm_signal │ │ └── default.yaml │ └── tsne_plot │ │ └── default.yaml ├── config.yaml ├── datamodule │ ├── datasets │ │ ├── lang_dataset │ │ │ ├── lang.yaml │ │ │ └── lang_shm.yaml │ │ ├── lang_only.yaml │ │ ├── vision_dataset │ │ │ ├── vision.yaml │ │ │ └── vision_shm.yaml │ │ ├── vision_lang.yaml │ │ ├── vision_lang_shm.yaml │ │ └── vision_only.yaml │ ├── default.yaml │ ├── mcil.yaml │ ├── observation_space │ │ ├── all_mods_abs_act.yaml │ │ ├── lang_rgb_static_abs_act.yaml │ │ ├── lang_rgb_static_gripper_abs_act.yaml │ │ ├── lang_rgb_static_gripper_rel_act.yaml │ │ ├── lang_rgb_static_rel_act.yaml │ │ ├── lang_rgb_static_robot_scene_abs_act.yaml │ │ ├── lang_rgb_static_tactile_abs_act.yaml │ │ ├── lang_rgbd_both_abs_act.yaml │ │ ├── lang_rgbd_both_rel_act.yaml │ │ ├── lang_rgbd_static_gripper_rel_act.yaml │ │ ├── lang_rgbd_static_robot_abs_act.yaml │ │ ├── rgb_static_abs_act.yaml │ │ ├── rgb_static_robot_scene_abs_act.yaml │ │ └── state_only.yaml │ ├── proprioception_dims │ │ ├── none.yaml │ │ ├── robot_full.yaml │ │ ├── robot_no_joints.yaml │ │ ├── robot_no_joints_no_gripper_width.yaml │ │ └── robot_scene.yaml │ └── transforms │ │ ├── clip.yaml │ │ ├── play_basic.yaml │ │ └── rand_shift.yaml ├── inference │ └── config_inference.yaml ├── lang_ann.yaml ├── logger │ ├── tb_logger.yaml │ └── wandb.yaml ├── loss │ └── default.yaml ├── model │ ├── action_decoder │ │ ├── deterministic.yaml │ │ ├── hulc_default.yaml │ │ └── mcil_default.yaml │ ├── bc_z_lang_decoder │ │ ├── default.yaml │ │ └── none.yaml │ ├── clip_lang.yaml │ ├── distribution │ │ ├── continuous.yaml │ │ └── discrete.yaml │ ├── gcbc.yaml │ ├── hulc.yaml │ ├── language_encoder │ │ ├── default.yaml │ │ └── none.yaml │ ├── language_goal │ │ ├── default.yaml │ │ └── none.yaml │ ├── lr_scheduler │ │ ├── constant.yaml │ │ ├── cosine_schedule_with_warmup.yaml │ │ └── linear_schedule_with_warmup.yaml │ ├── mcil.yaml │ ├── mia_lang_discriminator │ │ ├── default.yaml │ │ └── none.yaml │ ├── optimizer │ │ ├── adam.yaml │ │ ├── adamw.yaml │ │ └── sgd.yaml │ ├── perceptual_encoder │ │ ├── default.yaml │ │ ├── depth_gripper │ │ │ ├── default.yaml │ │ │ └── none.yaml │ │ ├── depth_static │ │ │ ├── default.yaml │ │ │ └── none.yaml │ │ ├── gripper_cam.yaml │ │ ├── proprio │ │ │ ├── identity.yaml │ │ │ └── none.yaml │ │ ├── rgb_gripper │ │ │ ├── default.yaml │ │ │ └── none.yaml │ │ ├── rgb_static │ │ │ ├── clip.yaml │ │ │ └── default.yaml │ │ ├── state_decoder │ │ │ ├── default.yaml │ │ │ └── none.yaml │ │ └── tactile │ │ │ ├── default.yaml │ │ │ └── none.yaml │ ├── plan_proposal │ │ └── default.yaml │ ├── plan_recognition │ │ ├── birnn.yaml │ │ └── transformers.yaml │ ├── proj_vis_lang │ │ ├── default.yaml │ │ └── none.yaml │ ├── sbert.yaml │ └── visual_goal │ │ └── default.yaml ├── trainer │ └── play_trainer.yaml └── training │ └── default_training.yaml ├── dataset ├── README.md ├── download_data.sh └── 
download_lang_embeddings.sh ├── hulc ├── __init__.py ├── evaluation │ ├── __init__.py │ ├── create_plots.py │ ├── evaluate_policy.py │ ├── rollouts_interactive.py │ └── run_multiple.py ├── models │ ├── __init__.py │ ├── auxiliary_loss_networks │ │ ├── __init__.py │ │ ├── bc_z_lang_decoder.py │ │ ├── mia_lang_discriminator.py │ │ ├── proj_vis_lang.py │ │ └── state_decoder.py │ ├── decoders │ │ ├── __init__.py │ │ ├── action_decoder.py │ │ ├── deterministic_decoder.py │ │ ├── logistic_decoder_rnn.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── gripper_control.py │ │ │ ├── pytorch3d_transforms.py │ │ │ └── rnn.py │ ├── encoders │ │ ├── __init__.py │ │ ├── clip_lang_encoder.py │ │ ├── goal_encoders.py │ │ ├── lang_encoder.py │ │ └── language_network.py │ ├── gcbc.py │ ├── hulc.py │ ├── perceptual_encoders │ │ ├── __init__.py │ │ ├── clip.py │ │ ├── concat_encoders.py │ │ ├── proprio_encoder.py │ │ ├── tactile_encoder.py │ │ ├── vision_clip.py │ │ ├── vision_network.py │ │ └── vision_network_gripper.py │ └── plan_encoders │ │ ├── __init__.py │ │ ├── plan_proposal_net.py │ │ └── plan_recognition_net.py ├── training.py └── utils │ ├── __init__.py │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── clip_tokenizer.py │ ├── distributions.py │ ├── kl_callbacks.py │ ├── transforms.py │ └── utils.py ├── install.sh ├── media └── hulc_rollout.gif ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── setup.py ├── setup_local.py └── slurm_scripts ├── README.md ├── sbatch_eval.sh ├── sbatch_lfp.sh ├── slurm_eval.py └── slurm_training.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = .git 3 | # Default is 79 in PEP 8 4 | max-line-length = 120 5 | select = E,F,W,C 6 | ignore=W503, # line break before binary operator, need for black 7 | E203, # whitespace before ':'. Opposite convention enforced by black 8 | E731, # do not assign a lambda expression, use a def 9 | E722, 10 | F401, 11 | F841, 12 | E402, # module level import not at top of file 13 | E741, # ambiguous variable name 14 | E501, # line too long. Handled by black 15 | C406, # Unnecessary list literal - rewrite as a dict literal 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Pycharm 132 | .idea 133 | 134 | # log files 135 | runs 136 | 137 | checkpoints/HULC* 138 | 139 | dataset/calvin_debug_dataset/ 140 | dataset/task* 141 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "calvin_env"] 2 | path = calvin_env 3 | url = https://github.com/mees/calvin_env.git 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3.8 3 | repos: 4 | - repo: https://github.com/psf/black 5 | rev: 21.5b2 6 | hooks: 7 | - id: black 8 | language_version: python3.8 9 | 10 | - repo: https://gitlab.com/pycqa/flake8 11 | rev: 3.8.4 12 | hooks: 13 | - id: flake8 14 | additional_dependencies: [-e, "git+git://github.com/pycqa/pyflakes.git@c72d6cf#egg=pyflakes"] 15 | exclude: telegram_bot 16 | 17 | - repo: https://github.com/pycqa/isort 18 | rev: 5.7.0 19 | hooks: 20 | - id: isort 21 | 22 | - repo: https://github.com/pre-commit/mirrors-mypy 23 | rev: v0.812 24 | hooks: 25 | - id: mypy 26 | args: [--ignore-missing-imports, --warn-no-return, --warn-redundant-casts, --disallow-incomplete-defs] 27 | additional_dependencies: [pytorch-lightning==1.5.5, torch==1.10.0, numpy] 28 | exclude: telegram_bot 29 | 30 | - repo: https://github.com/pre-commit/pre-commit-hooks 31 | rev: v4.0.1 32 | hooks: 33 | - id: check-yaml 34 | - id: trailing-whitespace 35 | - id: end-of-file-fixer 36 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Oier Mees 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HULC 2 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 3 | [![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/mees/hulc.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/mees/hulc/context:python) 4 | [![Total alerts](https://img.shields.io/lgtm/alerts/g/mees/hulc.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/mees/hulc/alerts/) 5 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 6 | 7 | [What Matters in Language Conditioned Robotic Imitation Learning Over Unstructured Data](https://arxiv.org/pdf/2204.06252.pdf) 8 | 9 | [Oier Mees](https://www.oiermees.com/), [Lukas Hermann](https://lukashermann.github.io/), [Wolfram Burgard](http://www2.informatik.uni-freiburg.de/~burgard) 10 | 11 | We present **HULC** (**H**ierarchical **U**niversal **L**anguage **C**onditioned Policies), an end-to-end model that can 12 | learn a wide variety of language conditioned robot skills from offline free-form imitation datasets. HULC sets a new state of the art on the challenging CALVIN benchmark, 13 | learning a single 7-DoF policy that can perform long-horizon manipulation tasks in a 3D environment, directly from images, and specified only with natural language. 14 | This code accompanies the paper What Matters in Language Conditioned Robotic Imitation Learning Over Unstructured Data, which can be found [here](https://arxiv.org/pdf/2204.06252.pdf). 15 | We hope the code will be useful as a starting point for further research on language conditioned policy learning and will bring us closer to general-purpose robots that can relate human language to their perception and actions. 16 | 17 | ![](media/hulc_rollout.gif) 18 | ## Installation 19 | As a prerequisite, you need to have [calvin](https://github.com/mees/calvin) installed. 20 | This is needed because HULC builds upon calvin_agent and calvin_env.
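If you have not set up CALVIN yet, the sketch below reflects the setup described in the calvin README; treat it as a non-authoritative sketch and refer to that repository for the exact, up-to-date instructions.
```bash
# Sketch of the CALVIN setup (see https://github.com/mees/calvin for the authoritative steps).
git clone --recurse-submodules https://github.com/mees/calvin.git
export CALVIN_ROOT=$(pwd)/calvin
cd $CALVIN_ROOT
sh install.sh
```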
21 | 22 | Next, clone this repository locally: 23 | ```bash 24 | git clone https://github.com/mees/hulc.git 25 | export HULC_ROOT=$(pwd)/hulc 26 | 27 | ``` 28 | 29 | Install requirements: 30 | ```bash 31 | cd $HULC_ROOT 32 | conda create -n hulc_venv python=3.10 # or use virtualenv 33 | conda activate hulc_venv 34 | sh install.sh 35 | ``` 36 | We originally used Python 3.8, but Python 3.10 should also work. 37 | 38 | If you encounter problems installing pyhash, you might have to downgrade setuptools to a version below 58. 39 | 40 | ## Download 41 | ### CALVIN Dataset 42 | If you want to train on the [CALVIN](https://github.com/mees/calvin) dataset, choose a split with: 43 | ```bash 44 | cd $HULC_ROOT/dataset 45 | sh download_data.sh D | ABC | ABCD | debug 46 | ``` 47 | If you have previously downloaded the dataset in the calvin repo, you can just set the path to that folder via the command line when starting a training. 48 | If you want to get started without downloading the whole dataset, use the argument `debug` to download a small debug dataset (1.3 GB). 49 | ### Language Embeddings 50 | We provide precomputed embeddings for the different language models we evaluate in the paper. 51 | The script assumes the corresponding split has already been downloaded. 52 | ```bash 53 | cd $HULC_ROOT/dataset 54 | sh download_lang_embeddings.sh D | ABC | ABCD 55 | ``` 56 | 57 | ### Pre-trained Models 58 | We provide our final models for all three CALVIN splits. 59 | ```bash 60 | cd $HULC_ROOT/checkpoints 61 | sh download_model_weights.sh D | ABC | ABCD 62 | ``` 63 | For instructions on how to use the pretrained models, see the training and evaluation sections. 64 | 65 | ## Hardware Requirements 66 | 67 | We leverage [Pytorch Lightning's](https://www.pytorchlightning.ai/) DDP implementation to scale our training to 8x NVIDIA GPUs with **12GB** of memory each. 68 | Evaluating the models requires a single NVIDIA GPU with **8GB** of memory. As each GPU receives a batch of 64 sequences (32 language + 32 vision), the effective batch size is 512 for all our experiments. 69 | 70 | Trained with: 71 | - **GPU** - 8x NVIDIA RTX 2080Ti 72 | - **CPU** - AMD EPYC 7502 73 | - **RAM** - 512GB 74 | - **OS** - Ubuntu 20.04 75 | 76 | With this setup, one epoch takes around 1.5 hours and the whole training with 30 epochs can be completed in 45 hours (without the evaluation callbacks). 77 | 78 | ## Training 79 | To train our HULC model with the maximum number of available GPUs, run: 80 | ``` 81 | python hulc/training.py trainer.devices=-1 datamodule.root_data_dir=path/to/dataset datamodule/datasets=vision_lang_shm 82 | ``` 83 | The `vision_lang_shm` option loads the CALVIN dataset into shared memory at the beginning of training, 84 | speeding up data loading during training. 85 | The preparation of the shared memory cache will take some time 86 | (approx. 20 min on our SLURM cluster). \ 87 | If you want to use the original data loader (e.g. for debugging) just override the command with `datamodule/datasets=vision_lang`. \ 88 | For an additional speed up, you can disable the evaluation callbacks during training by adding `~callbacks/rollout` and `~callbacks/rollout_lh`. 89 | 90 | If you have access to a SLURM cluster, follow this [guide](https://github.com/mees/hulc/blob/main/slurm_scripts/README.md).
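For reference, a sketch combining the overrides described above (plain disk loader and disabled long-horizon rollout callback) could look like the following; the dataset path is a placeholder and `trainer.devices=-1` uses all visible GPUs:
```bash
# Hypothetical combination of the training overrides discussed above; adjust the dataset path.
python hulc/training.py \
    trainer.devices=-1 \
    datamodule.root_data_dir=/path/to/task_D_D \
    datamodule/datasets=vision_lang \
    ~callbacks/rollout_lh
```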
91 | 92 | You can use our [pre-trained models](#pre-trained-models) to initialize a training by running: 93 | ``` 94 | python hulc/training.py trainer.devices=-1 datamodule.root_data_dir=path/to/dataset hydra.run.dir=$HULC_ROOT/checkpoints/HULC_D_D 95 | ``` 96 | Note that this will log the training into the checkpoint folder. 97 | 98 | ### Ablations 99 | Multi-context imitation learning (MCIL; Lynch et al., 2019): 100 | ``` 101 | python hulc/training.py trainer.devices=-1 datamodule.root_data_dir=path/to/dataset datamodule/datasets=vision_lang_shm model=mcil 102 | datamodule=mcil 103 | ``` 104 | 105 | Goal-conditioned behavior cloning (GCBC; Lynch et al., 2019): 106 | ``` 107 | python hulc/training.py trainer.devices=-1 datamodule.root_data_dir=path/to/dataset datamodule/datasets=vision_lang_shm model=gcbc 108 | ~callbacks/tsne_plot 109 | ``` 110 | 111 | 112 | ## Evaluation 113 | See detailed inference instructions on the [CALVIN repo](https://github.com/mees/calvin#muscle-evaluation-the-calvin-challenge). 114 | ``` 115 | python hulc/evaluation/evaluate_policy.py --dataset_path <path/to/dataset> --train_folder <path/to/training/folder> 116 | ``` 117 | Set `--train_folder $HULC_ROOT/checkpoints/HULC_D_D` to evaluate our [pre-trained models](#pre-trained-models). 118 | 119 | Optional arguments: 120 | 121 | - `--checkpoint <path/to/checkpoint>`: by default, the evaluation loads the last checkpoint in the training log directory. 122 | You can instead specify the path to another checkpoint by adding this argument to the evaluation command. 123 | - `--debug`: print debug information and visualize the environment. 124 | 125 | ## Changelog 126 | 127 | ### 16 Sep 2022 128 | - **MAJOR BUG IN ABC and ABCD dataset:** If you downloaded these datasets before this date, you have to apply the following fixes: 129 | - Wrong language annotations in the ABC and ABCD datasets. You can download the corrected language embeddings [here](https://github.com/mees/calvin/blob/main/dataset/README.md#language-embeddings). 130 | - Bug in `calvin_env` that only affects the generation of language embeddings. 131 | - Wrong `scene_info.npy` in the ABC and ABCD datasets. Please replace it as follows: 132 | ``` 133 | cd task_ABCD_D 134 | wget http://calvin.cs.uni-freiburg.de/scene_info_fix/task_ABCD_D_scene_info.zip 135 | unzip task_ABCD_D_scene_info.zip && rm task_ABCD_D_scene_info.zip 136 | ``` 137 | ``` 138 | cd task_ABC_D 139 | wget http://calvin.cs.uni-freiburg.de/scene_info_fix/task_ABC_D_scene_info.zip 140 | unzip task_ABC_D_scene_info.zip && rm task_ABC_D_scene_info.zip 141 | ``` 142 | 143 | ### 1 Sep 2022 144 | - Updated the language embeddings for the splits ABC and ABCD due to a bug in switching scenes during the automatic language labeling. Additionally, added various precomputed language embeddings.
145 | 146 | ## Acknowledgements 147 | 148 | This work uses code from the following open-source projects and datasets: 149 | 150 | #### CALVIN 151 | Original: [https://github.com/mees/calvin](https://github.com/mees/calvin) 152 | License: [MIT](https://github.com/mees/calvin/blob/main/LICENSE) 153 | 154 | #### Sentence-Transformers 155 | Original: [https://github.com/UKPLab/sentence-transformers](https://github.com/UKPLab/sentence-transformers) 156 | License: [Apache 2.0](https://github.com/UKPLab/sentence-transformers/blob/master/LICENSE) 157 | 158 | #### OpenAI CLIP 159 | Original: [https://github.com/openai/CLIP](https://github.com/openai/CLIP) 160 | License: [MIT](https://github.com/openai/CLIP/blob/main/LICENSE) 161 | ## Citations 162 | 163 | If you find the code useful, please cite: 164 | 165 | **HULC** 166 | ```bibtex 167 | @article{mees2022hulc, 168 | author={Oier Mees and Lukas Hermann and Wolfram Burgard}, 169 | title={What Matters in Language Conditioned Robotic Imitation Learning Over Unstructured Data}, 170 | journal={IEEE Robotics and Automation Letters (RA-L)}, 171 | volume={7}, 172 | number={4}, 173 | pages={11205-11212}, 174 | year={2022} 175 | } 176 | ``` 177 | **CALVIN** 178 | ```bibtex 179 | @article{mees2022calvin, 180 | author = {Oier Mees and Lukas Hermann and Erick Rosete-Beas and Wolfram Burgard}, 181 | title = {CALVIN: A Benchmark for Language-Conditioned Policy Learning for Long-Horizon Robot Manipulation Tasks}, 182 | journal={IEEE Robotics and Automation Letters (RA-L)}, 183 | volume={7}, 184 | number={3}, 185 | pages={7327-7334}, 186 | year={2022} 187 | } 188 | ``` 189 | 190 | ## License 191 | 192 | MIT License 193 | -------------------------------------------------------------------------------- /checkpoints/download_model_weights.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Download, Unzip, and Remove zip 3 | if [ "$1" = "D" ] 4 | then 5 | 6 | echo "Downloading HULC Checkpoint for task_D_D ..." 7 | wget http://hulc.cs.uni-freiburg.de/model_weights/HULC_D_D.zip 8 | unzip HULC_D_D.zip && rm HULC_D_D.zip 9 | echo "finished!" 10 | elif [ "$1" = "ABC" ] 11 | then 12 | 13 | echo "Downloading HULC Checkpoint for task_ABC_D ..." 14 | wget http://hulc.cs.uni-freiburg.de/model_weights/HULC_ABC_D.zip 15 | unzip HULC_ABC_D.zip && rm HULC_ABC_D.zip 16 | echo "finished!" 17 | 18 | elif [ "$1" = "ABCD" ] 19 | then 20 | 21 | echo "Downloading HULC Checkpoint for task_ABCD_D ..." 22 | wget http://hulc.cs.uni-freiburg.de/model_weights/HULC_ABCD_D.zip 23 | unzip HULC_ABCD_D.zip && rm HULC_ABCD_D.zip 24 | echo "finished!" 
25 | 26 | else 27 | echo "Failed: Usage download_model_weights.sh D | ABC | ABCD" 28 | exit 1 29 | fi 30 | -------------------------------------------------------------------------------- /conf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/__init__.py -------------------------------------------------------------------------------- /conf/annotations/new_playtable_validation.yaml: -------------------------------------------------------------------------------- 1 | # rotation 2 | rotate_red_block_right: ["take the red block and rotate it to the right"] 3 | rotate_red_block_left: ["take the red block and rotate it to the left"] 4 | rotate_blue_block_right: ["take the blue block and rotate it to the right"] 5 | rotate_blue_block_left: ["take the blue block and rotate it to the left"] 6 | rotate_pink_block_right: ["take the pink block and rotate it to the right"] 7 | rotate_pink_block_left: ["take the pink block and rotate it to the left"] 8 | 9 | # sliding 10 | push_red_block_right: ["go push the red block right"] 11 | push_red_block_left: ["go push the red block left"] 12 | push_blue_block_right: ["go push the blue block right"] 13 | push_blue_block_left: ["go push the blue block left"] 14 | push_pink_block_right: ["go push the pink block right"] 15 | push_pink_block_left: ["go push the pink block left"] 16 | 17 | # open/close 18 | move_slider_left: [ "push the sliding door to the left side"] 19 | move_slider_right: [ "push the sliding door to the right side"] 20 | open_drawer: ["pull the handle to open the drawer"] 21 | close_drawer: ["push the handle to close the drawer"] 22 | 23 | # lifting 24 | lift_red_block_table: ["grasp and lift the red block"] 25 | lift_blue_block_table: ["grasp and lift the blue block"] 26 | lift_pink_block_table: ["grasp and lift the pink block"] 27 | 28 | lift_red_block_slider: [ "lift the red block from the sliding cabinet"] 29 | lift_blue_block_slider: [ "lift the blue block from the sliding cabinet"] 30 | lift_pink_block_slider: [ "lift the pink block from the sliding cabinet"] 31 | 32 | lift_red_block_drawer: ["Take the red block from the drawer"] 33 | lift_blue_block_drawer: ["Take the blue block from the drawer"] 34 | lift_pink_block_drawer: ["Take the pink block from the drawer"] 35 | 36 | place_in_slider: [ "store the grasped block in the sliding cabinet"] 37 | place_in_drawer: [ "store the grasped block in the drawer"] 38 | 39 | push_into_drawer: ["slide the block that it falls into the drawer"] 40 | 41 | stack_block: ["stack the grasped block"] 42 | unstack_block: ["remove the stacked block"] 43 | 44 | turn_on_lightbulb: ["use the switch to turn on the light bulb"] 45 | turn_off_lightbulb: ["use the switch to turn off the light bulb"] 46 | turn_on_led: ["press the button to turn on the led light"] 47 | turn_off_led: ["press the button to turn off the led light"] 48 | -------------------------------------------------------------------------------- /conf/callbacks/checkpoint/all.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 2 | save_top_k: -1 3 | verbose: True 4 | dirpath: saved_models 5 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}' 6 | -------------------------------------------------------------------------------- /conf/callbacks/checkpoint/clip_loss.yaml: 
-------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 2 | save_top_k: 3 3 | verbose: True 4 | monitor: val/val_pred_clip_loss 5 | mode: min 6 | dirpath: saved_models 7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}' 8 | -------------------------------------------------------------------------------- /conf/callbacks/checkpoint/kl.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 2 | save_top_k: 3 3 | verbose: True 4 | monitor: train/kl_loss 5 | mode: max 6 | dirpath: saved_models 7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}' 8 | -------------------------------------------------------------------------------- /conf/callbacks/checkpoint/lh_sr.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 2 | save_top_k: 3 3 | verbose: True 4 | monitor: eval_lh/avg_seq_len 5 | mode: max 6 | dirpath: saved_models 7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}' 8 | every_n_epochs: ${callbacks.rollout_lh.rollout_freq} 9 | -------------------------------------------------------------------------------- /conf/callbacks/checkpoint/state_recon.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 2 | save_top_k: 3 3 | verbose: True 4 | monitor: val/state_recon_loss 5 | mode: min 6 | dirpath: saved_models 7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}' 8 | -------------------------------------------------------------------------------- /conf/callbacks/checkpoint/task_sr.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 2 | save_top_k: 3 3 | verbose: True 4 | monitor: tasks/average_sr 5 | mode: max 6 | dirpath: saved_models 7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}' 8 | every_n_epochs: ${callbacks.rollout.rollout_freq} 9 | -------------------------------------------------------------------------------- /conf/callbacks/checkpoint/val_action.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 2 | save_top_k: -1 3 | verbose: True 4 | monitor: val_act/action_loss_pp 5 | mode: min 6 | dirpath: saved_models 7 | filename: '{epoch}' #put back in when PL fixes this _{val/accuracy:.4f}' 8 | -------------------------------------------------------------------------------- /conf/callbacks/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | # - rollout: default 3 | - rollout_lh: default 4 | - checkpoint: all 5 | - tsne_plot: default 6 | - kl_schedule: constant 7 | - shm_signal: default 8 | -------------------------------------------------------------------------------- /conf/callbacks/kl_schedule/constant.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.utils.kl_callbacks.KLConstantSchedule 2 | -------------------------------------------------------------------------------- /conf/callbacks/kl_schedule/linear.yaml: -------------------------------------------------------------------------------- 1 | _target_: 
hulc.utils.kl_callbacks.KLLinearSchedule 2 | start_epoch: 10 3 | end_epoch: 50 4 | max_kl_beta: ${loss.kl_beta} 5 | -------------------------------------------------------------------------------- /conf/callbacks/kl_schedule/sigmoid.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: hulc.utils.kl_callbacks.KLSigmoidSchedule 3 | start_epoch: 10 4 | end_epoch: 50 5 | max_kl_beta: ${loss.kl_beta} 6 | -------------------------------------------------------------------------------- /conf/callbacks/rollout/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - /callbacks/rollout/tasks@tasks: new_playtable_tasks 3 | - /annotations@val_annotations: new_playtable_validation 4 | _target_: calvin_agent.rollout.rollout.Rollout 5 | _recursive_: false 6 | env_cfg: 7 | _target_: calvin_agent.wrappers.calvin_env_wrapper.CalvinEnvWrapper 8 | skip_epochs: 1 9 | rollout_freq: 5 10 | video: true 11 | num_rollouts_per_task: 10 12 | check_percentage_of_batch: 1 # which percentage of sequences do we want to check for possible tasks 13 | ep_len: 120 14 | empty_cache: false 15 | log_video_to_file: false 16 | save_dir: ./videos 17 | add_goal_thumbnail: true 18 | min_window_size: ${datamodule.datasets.vision_dataset.min_window_size} 19 | max_window_size: ${datamodule.datasets.vision_dataset.max_window_size} 20 | id_selection_strategy: "select_longest" 21 | lang_folder: ${datamodule.datasets.lang_dataset.lang_folder} 22 | -------------------------------------------------------------------------------- /conf/callbacks/rollout/tasks/new_playtable_tasks.yaml: -------------------------------------------------------------------------------- 1 | _target_: calvin_env.envs.tasks.Tasks 2 | tasks: 3 | # rotation 4 | rotate_red_block_right: [rotate_object, 'block_red', -60] 5 | rotate_red_block_left: [rotate_object, 'block_red', 60] 6 | rotate_blue_block_right: [ rotate_object, 'block_blue', -60 ] 7 | rotate_blue_block_left: [ rotate_object, 'block_blue', 60 ] 8 | rotate_pink_block_right: [ rotate_object, 'block_pink', -60 ] 9 | rotate_pink_block_left: [ rotate_object, 'block_pink', 60 ] 10 | 11 | # pushing 12 | push_red_block_right: [ push_object, 'block_red', 0.1, 0] 13 | push_red_block_left: [ push_object, 'block_red', -0.1, 0] 14 | push_blue_block_right: [ push_object, 'block_blue', 0.1, 0] 15 | push_blue_block_left: [ push_object, 'block_blue', -0.1, 0] 16 | push_pink_block_right: [ push_object, 'block_pink', 0.1, 0] 17 | push_pink_block_left: [ push_object, 'block_pink', -0.1, 0] 18 | 19 | # open/close 20 | move_slider_left: [move_door_rel, 'base__slide', 0.15] # 0 - 0.56 21 | move_slider_right: [move_door_rel, 'base__slide', -0.15] 22 | open_drawer: [move_door_rel, 'base__drawer', 0.12] # 0 - 0.24 23 | close_drawer: [move_door_rel, 'base__drawer', -0.12] 24 | 25 | # lifting 26 | lift_red_block_table: [lift_object, 'block_red', 0.05, 'table', 'base_link'] 27 | lift_red_block_slider: [lift_object, 'block_red', 0.03, 'table', 'plank_link'] 28 | lift_red_block_drawer: [lift_object, 'block_red', 0.05, 'table', 'drawer_link'] 29 | lift_blue_block_table: [ lift_object, 'block_blue', 0.05, 'table', 'base_link' ] 30 | lift_blue_block_slider: [ lift_object, 'block_blue', 0.03, 'table', 'plank_link' ] 31 | lift_blue_block_drawer: [ lift_object, 'block_blue', 0.05, 'table', 'drawer_link' ] 32 | lift_pink_block_table: [ lift_object, 'block_pink', 0.05, 'table', 'base_link' ] 33 | lift_pink_block_slider: 
[ lift_object, 'block_pink', 0.03, 'table', 'plank_link' ] 34 | lift_pink_block_drawer: [ lift_object, 'block_pink', 0.05, 'table', 'drawer_link' ] 35 | 36 | # placing 37 | place_in_slider: [place_object, 'table', 'plank_link'] 38 | place_in_drawer: [place_object, 'table', 'drawer_link'] 39 | 40 | # stacking 41 | stack_block: [stack_objects] 42 | unstack_block: [unstack_objects] 43 | 44 | # lights 45 | turn_on_lightbulb: [toggle_light, 'lightbulb', 0, 1] 46 | turn_off_lightbulb: [toggle_light, 'lightbulb', 1, 0] 47 | turn_on_led: [ toggle_light, 'led', 0, 1 ] 48 | turn_off_led: [ toggle_light, 'led', 1, 0 ] 49 | 50 | # pushing into drawer 51 | push_into_drawer: [push_object_into, ['block_red', 'block_blue', 'block_pink'], 'table', 'base_link', 'table', 'drawer_link'] 52 | 53 | # signatures of available base tasks: 54 | # rotate_object(obj_name, degrees, x_y_threshold=30, z_treshold=180): 55 | # push_object(obj_name, x_direction, y_direction): 56 | # lift_object(obj_name, z_direction, surface_body=None, surface_link=None): 57 | # place_object(dest_body, dest_link=None): 58 | # push_object_into(obj_name, src_body, dest_body): 59 | # move_door_abs(start_info, end_info, obj_name, joint_name, start_threshold, end_threshold): 60 | # move_door_rel(obj_name, joint_name, threshold): 61 | -------------------------------------------------------------------------------- /conf/callbacks/rollout_lh/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - /callbacks/rollout/tasks@tasks: new_playtable_tasks 3 | - /annotations@val_annotations: new_playtable_validation 4 | _target_: calvin_agent.rollout.rollout_long_horizon.RolloutLongHorizon 5 | _recursive_: false 6 | env_cfg: 7 | _target_: calvin_agent.wrappers.calvin_env_wrapper.CalvinEnvWrapper 8 | skip_epochs: 1 9 | rollout_freq: 1 10 | num_videos: 16 11 | num_sequences: 128 12 | replan_freq: 30 13 | ep_len: 360 14 | empty_cache: false 15 | log_video_to_file: false 16 | save_dir: ./videos 17 | lang_folder: ${datamodule.datasets.lang_dataset.lang_folder} 18 | debug: false 19 | -------------------------------------------------------------------------------- /conf/callbacks/shm_signal/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: calvin_agent.datasets.utils.shared_memory_utils.SignalCallback 2 | -------------------------------------------------------------------------------- /conf/callbacks/tsne_plot/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: calvin_agent.visualization.tsne_plot.TSNEPlot 2 | perplexity: 40 3 | n_jobs: 8 4 | plot_percentage: 0.2 5 | opacity: 0.3 6 | marker_size: 5 7 | -------------------------------------------------------------------------------- /conf/config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - callbacks: default 3 | - datamodule: default 4 | - model: hulc 5 | - loss: default 6 | - training: default_training 7 | - trainer: play_trainer 8 | - logger: wandb 9 | - override hydra/job_logging: colorlog 10 | - override hydra/hydra_logging: colorlog 11 | - _self_ 12 | 13 | seed: 42 14 | log_dir: ../ 15 | slurm: false 16 | 17 | hydra: 18 | run: 19 | dir: ${log_dir}/runs/${now:%Y-%m-%d}/${now:%H-%M-%S} 20 | sweep: 21 | dir: ${log_dir}/runs/${now:%Y-%m-%d}/${now:%H-%M-%S} 22 | subdir: ${hydra.job.override_dirname} 23 | job: 24 | config: 25 | override_dirname: 26 | exclude_keys: 27 | - log_dir 28 
| - datamodule.root_data_dir 29 | - trainer.gpus 30 | - model.tsne_plot 31 | - datamodule.num_workers 32 | - trainer.limit_train_batches 33 | - trainer.limit_val_batches 34 | - model.action_decoder.load_action_bounds 35 | -------------------------------------------------------------------------------- /conf/datamodule/datasets/lang_dataset/lang.yaml: -------------------------------------------------------------------------------- 1 | _target_: calvin_agent.datasets.disk_dataset.DiskDataset 2 | key: "lang" 3 | save_format: "npz" 4 | batch_size: 32 5 | min_window_size: 20 6 | max_window_size: 32 7 | proprio_state: ${datamodule.proprioception_dims} 8 | obs_space: ${datamodule.observation_space} 9 | skip_frames: 1 10 | pad: true 11 | lang_folder: "lang_paraphrase-MiniLM-L3-v2" 12 | aux_lang_loss_window: 8 13 | num_workers: 2 14 | -------------------------------------------------------------------------------- /conf/datamodule/datasets/lang_dataset/lang_shm.yaml: -------------------------------------------------------------------------------- 1 | _target_: calvin_agent.datasets.shm_dataset.ShmDataset 2 | key: "lang" 3 | batch_size: 32 4 | min_window_size: 20 5 | max_window_size: 32 6 | proprio_state: ${datamodule.proprioception_dims} 7 | obs_space: ${datamodule.observation_space} 8 | pad: true 9 | lang_folder: "lang_paraphrase-MiniLM-L3-v2" 10 | aux_lang_loss_window: 8 11 | num_workers: 2 12 | -------------------------------------------------------------------------------- /conf/datamodule/datasets/lang_only.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - lang_dataset: lang 3 | -------------------------------------------------------------------------------- /conf/datamodule/datasets/vision_dataset/vision.yaml: -------------------------------------------------------------------------------- 1 | _target_: calvin_agent.datasets.disk_dataset.DiskDataset 2 | key: "vis" 3 | save_format: "npz" 4 | batch_size: 32 5 | min_window_size: 20 6 | max_window_size: 32 7 | proprio_state: ${datamodule.proprioception_dims} 8 | obs_space: ${datamodule.observation_space} 9 | pad: true 10 | lang_folder: "lang_paraphrase-MiniLM-L3-v2" 11 | num_workers: 2 12 | -------------------------------------------------------------------------------- /conf/datamodule/datasets/vision_dataset/vision_shm.yaml: -------------------------------------------------------------------------------- 1 | _target_: calvin_agent.datasets.shm_dataset.ShmDataset 2 | key: "vis" 3 | batch_size: 32 4 | min_window_size: 20 5 | max_window_size: 32 6 | proprio_state: ${datamodule.proprioception_dims} 7 | obs_space: ${datamodule.observation_space} 8 | pad: true 9 | lang_folder: "lang_paraphrase-MiniLM-L3-v2" 10 | num_workers: 2 11 | -------------------------------------------------------------------------------- /conf/datamodule/datasets/vision_lang.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - vision_dataset: vision 3 | - lang_dataset: lang 4 | -------------------------------------------------------------------------------- /conf/datamodule/datasets/vision_lang_shm.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - vision_dataset: vision_shm 3 | - lang_dataset: lang_shm 4 | -------------------------------------------------------------------------------- /conf/datamodule/datasets/vision_only.yaml: -------------------------------------------------------------------------------- 1 | 
defaults: 2 | - vision_dataset: vision 3 | -------------------------------------------------------------------------------- /conf/datamodule/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - datasets: vision_lang_shm 3 | - transforms: rand_shift 4 | - proprioception_dims: robot_no_joints #robot_full 5 | - observation_space: lang_rgb_static_gripper_rel_act 6 | _target_: calvin_agent.datasets.calvin_data_module.CalvinDataModule 7 | _recursive_: false 8 | root_data_dir: ??? 9 | action_space: 7 10 | action_max: [1., 1., 1., 1., 1., 1., 1.,] 11 | action_min: [-1., -1., -1., -1., -1., -1., -1] 12 | shuffle_val: false 13 | -------------------------------------------------------------------------------- /conf/datamodule/mcil.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - datasets: vision_lang_shm 3 | - transforms: play_basic 4 | - proprioception_dims: robot_no_joints #robot_full 5 | - observation_space: lang_rgb_static_gripper_abs_act 6 | _target_: calvin_agent.datasets.calvin_data_module.CalvinDataModule 7 | _recursive_: false 8 | root_data_dir: ??? 9 | action_space: 7 10 | action_max: [1., 1., 1., 1., 1., 1., 1.,] 11 | action_min: [-1., -1., -1., -1., -1., -1., -1] 12 | shuffle_val: false 13 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/all_mods_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static', 'rgb_gripper', 'rgb_tactile'] 2 | depth_obs: ['depth_static', 'depth_gripper', 'depth_tactile'] 3 | state_obs: ['robot_obs', 'scene_obs'] 4 | actions: ['actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgb_static_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static'] 2 | depth_obs: [] 3 | state_obs: ['robot_obs'] 4 | actions: ['actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgb_static_gripper_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static', 'rgb_gripper'] 2 | depth_obs: [] 3 | state_obs: ['robot_obs'] 4 | actions: ['actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgb_static_gripper_rel_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static', 'rgb_gripper'] 2 | depth_obs: [] 3 | state_obs: ['robot_obs'] 4 | actions: ['rel_actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgb_static_rel_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static'] 2 | depth_obs: [] 3 | state_obs: ['robot_obs'] 4 | actions: ['rel_actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgb_static_robot_scene_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static'] 2 | depth_obs: [] 3 | 
state_obs: ['robot_obs', 'scene_obs'] 4 | actions: ['actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgb_static_tactile_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static', 'rgb_tactile'] 2 | depth_obs: [] 3 | state_obs: ['robot_obs'] 4 | actions: ['actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgbd_both_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static', 'rgb_gripper'] 2 | depth_obs: ['depth_static', 'depth_gripper'] 3 | state_obs: ['robot_obs'] 4 | actions: ['actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgbd_both_rel_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static', 'rgb_gripper'] 2 | depth_obs: ['depth_static', 'depth_gripper'] 3 | state_obs: ['robot_obs'] 4 | actions: ['rel_actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgbd_static_gripper_rel_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static', 'rgb_gripper'] 2 | depth_obs: ['depth_gripper'] 3 | state_obs: ['robot_obs'] 4 | actions: ['rel_actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/lang_rgbd_static_robot_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static'] 2 | depth_obs: ['depth_static'] 3 | state_obs: ['robot_obs'] 4 | actions: ['actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/rgb_static_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static'] 2 | depth_obs: [] 3 | state_obs: ['robot_obs'] 4 | actions: ['actions'] 5 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/rgb_static_robot_scene_abs_act.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: ['rgb_static'] 2 | depth_obs: [] 3 | state_obs: ['robot_obs', 'scene_obs'] 4 | actions: ['actions'] 5 | -------------------------------------------------------------------------------- /conf/datamodule/observation_space/state_only.yaml: -------------------------------------------------------------------------------- 1 | rgb_obs: [] 2 | depth_obs: [] 3 | state_obs: ['robot_obs'] 4 | actions: ['actions'] 5 | language: ['language'] 6 | -------------------------------------------------------------------------------- /conf/datamodule/proprioception_dims/none.yaml: -------------------------------------------------------------------------------- 1 | n_state_obs: 0 2 | keep_indices: [[0, 0]] 3 | robot_orientation_idx: [3, 6] 4 | normalize: False 5 | normalize_robot_orientation: False 6 | -------------------------------------------------------------------------------- 
/conf/datamodule/proprioception_dims/robot_full.yaml: -------------------------------------------------------------------------------- 1 | n_state_obs: 15 2 | keep_indices: [[0, 15]] 3 | robot_orientation_idx: [3, 6] 4 | normalize: True 5 | normalize_robot_orientation: True 6 | -------------------------------------------------------------------------------- /conf/datamodule/proprioception_dims/robot_no_joints.yaml: -------------------------------------------------------------------------------- 1 | n_state_obs: 8 2 | keep_indices: [[0, 7], [14,15]] 3 | robot_orientation_idx: [3, 6] 4 | normalize: True 5 | normalize_robot_orientation: True 6 | -------------------------------------------------------------------------------- /conf/datamodule/proprioception_dims/robot_no_joints_no_gripper_width.yaml: -------------------------------------------------------------------------------- 1 | n_state_obs: 7 2 | keep_indices: [[0, 6], [14,15]] 3 | robot_orientation_idx: [3, 6] 4 | normalize: True 5 | normalize_robot_orientation: True 6 | -------------------------------------------------------------------------------- /conf/datamodule/proprioception_dims/robot_scene.yaml: -------------------------------------------------------------------------------- 1 | n_state_obs: 54 2 | keep_indices: [[0, 54]] 3 | robot_orientation_idx: [3, 6] 4 | normalize: True 5 | normalize_robot_orientation: True 6 | -------------------------------------------------------------------------------- /conf/datamodule/transforms/clip.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | rgb_static: 3 | - _target_: torchvision.transforms.Resize 4 | size: 224 5 | - _target_: hulc.utils.transforms.RandomShiftsAug 6 | pad: 10 7 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 8 | - _target_: torchvision.transforms.Normalize 9 | mean: [0.48145466, 0.4578275, 0.40821073] 10 | std: [0.26862954, 0.26130258, 0.27577711] 11 | rgb_gripper: 12 | - _target_: torchvision.transforms.Resize 13 | size: 84 14 | - _target_: hulc.utils.transforms.RandomShiftsAug 15 | pad: 4 16 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 17 | - _target_: torchvision.transforms.Normalize 18 | mean: [0.48145466, 0.4578275, 0.40821073] 19 | std: [0.26862954, 0.26130258, 0.27577711] 20 | depth_static: 21 | - _target_: torchvision.transforms.Resize 22 | size: 200 23 | - _target_: calvin_agent.utils.transforms.AddDepthNoise 24 | shape: [1000.0] 25 | rate: [1000.0] 26 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 27 | mean: [0.0] 28 | std: [0.01] 29 | depth_gripper: 30 | - _target_: torchvision.transforms.Resize 31 | size: 84 32 | # - _target_: calvin.utils.transforms.AddDepthNoise 33 | # shape: [ 1000.0 ] 34 | # rate: [ 1000.0 ] 35 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 36 | mean: [ 0.0 ] 37 | std: [ 0.01 ] 38 | rgb_tactile: 39 | - _target_: torchvision.transforms.Resize 40 | size: 70 41 | - _target_: torchvision.transforms.RandomCrop 42 | size: 64 43 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 44 | - _target_: torchvision.transforms.Normalize 45 | mean: [0.5] 46 | std: [0.5] 47 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 48 | mean: [ 0.0 ] 49 | std: [ 0.01 ] 50 | depth_tactile: 51 | - _target_: torchvision.transforms.Resize 52 | size: 64 53 | - _target_: torchvision.transforms.Normalize 54 | mean: [0.1,] 55 | std: [0.2,] 56 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 57 | mean: [ 0.0 ] 58 | std: [ 0.01 ] 59 | 
robot_obs: 60 | - _target_: calvin_agent.utils.transforms.NormalizeVector 61 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 62 | mean: [ 0.0 ] 63 | std: [ 0.01 ] 64 | scene_obs: 65 | - _target_: calvin_agent.utils.transforms.NormalizeVector 66 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 67 | mean: [ 0.0 ] 68 | std: [ 0.01 ] 69 | language: 70 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 71 | mean: [ 0.0 ] 72 | std: [ 0.01 ] 73 | 74 | 75 | val: 76 | rgb_static: 77 | - _target_: torchvision.transforms.Resize 78 | size: 224 79 | - _target_: hulc.utils.transforms.ScaleImageTensor 80 | - _target_: torchvision.transforms.Normalize 81 | mean: [ 0.48145466, 0.4578275, 0.40821073 ] 82 | std: [ 0.26862954, 0.26130258, 0.27577711 ] 83 | rgb_gripper: 84 | - _target_: torchvision.transforms.Resize 85 | size: 84 86 | - _target_: hulc.utils.transforms.ScaleImageTensor 87 | - _target_: torchvision.transforms.Normalize 88 | mean: [ 0.48145466, 0.4578275, 0.40821073 ] 89 | std: [ 0.26862954, 0.26130258, 0.27577711 ] 90 | depth_static: 91 | - _target_: torchvision.transforms.Resize 92 | size: 200 93 | depth_gripper: 94 | - _target_: torchvision.transforms.Resize 95 | size: 84 96 | rgb_tactile: 97 | - _target_: torchvision.transforms.Resize 98 | size: 70 99 | - _target_: torchvision.transforms.RandomCrop 100 | size: 64 101 | - _target_: hulc.utils.transforms.ScaleImageTensor 102 | - _target_: torchvision.transforms.Normalize 103 | mean: [0.5] 104 | std: [0.5] 105 | depth_tactile: 106 | - _target_: torchvision.transforms.Resize 107 | size: 64 108 | - _target_: torchvision.transforms.Normalize 109 | mean: [0.1,] 110 | std: [0.2,] 111 | robot_obs: 112 | - _target_: hulc.utils.transforms.NormalizeVector 113 | scene_obs: 114 | - _target_: hulc.utils.transforms.NormalizeVector 115 | -------------------------------------------------------------------------------- /conf/datamodule/transforms/play_basic.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | rgb_static: 3 | - _target_: torchvision.transforms.Resize 4 | size: 200 5 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 6 | - _target_: torchvision.transforms.Normalize 7 | mean: [0.5,] 8 | std: [0.5,] 9 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 10 | # mean: [0.0] 11 | # std: [0.01] 12 | rgb_gripper: 13 | - _target_: torchvision.transforms.Resize 14 | size: 84 15 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 16 | - _target_: torchvision.transforms.Normalize 17 | mean: [0.5,] 18 | std: [0.5,] 19 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 20 | # mean: [0.0] 21 | # std: [0.01] 22 | depth_static: 23 | - _target_: torchvision.transforms.Resize 24 | size: 200 25 | - _target_: calvin_agent.utils.transforms.AddDepthNoise 26 | shape: [1000.0] 27 | rate: [1000.0] 28 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 29 | # mean: [0.0] 30 | # std: [0.01] 31 | depth_gripper: 32 | - _target_: torchvision.transforms.Resize 33 | size: 84 34 | # - _target_: calvin.utils.transforms.AddDepthNoise 35 | # shape: [ 1000.0 ] 36 | # rate: [ 1000.0 ] 37 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 38 | mean: [ 0.0 ] 39 | std: [ 0.01 ] 40 | rgb_tactile: 41 | - _target_: torchvision.transforms.Resize 42 | size: 70 43 | - _target_: torchvision.transforms.RandomCrop 44 | size: 64 45 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 46 | - _target_: torchvision.transforms.Normalize 47 | mean: 
[0.5] 48 | std: [0.5] 49 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 50 | # mean: [ 0.0 ] 51 | # std: [ 0.01 ] 52 | depth_tactile: 53 | - _target_: torchvision.transforms.Resize 54 | size: 64 55 | - _target_: torchvision.transforms.Normalize 56 | mean: [0.1,] 57 | std: [0.2,] 58 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 59 | # mean: [ 0.0 ] 60 | # std: [ 0.01 ] 61 | robot_obs: 62 | - _target_: calvin_agent.utils.transforms.NormalizeVector 63 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 64 | # mean: [ 0.0 ] 65 | # std: [ 0.01 ] 66 | scene_obs: 67 | - _target_: calvin_agent.utils.transforms.NormalizeVector 68 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 69 | # mean: [ 0.0 ] 70 | # std: [ 0.01 ] 71 | # language: 72 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 73 | # mean: [ 0.0 ] 74 | # std: [ 0.01 ] 75 | 76 | 77 | val: 78 | rgb_static: 79 | - _target_: torchvision.transforms.Resize 80 | size: 200 81 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 82 | - _target_: torchvision.transforms.Normalize 83 | mean: [0.5,] 84 | std: [0.5,] 85 | rgb_gripper: 86 | - _target_: torchvision.transforms.Resize 87 | size: 84 88 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 89 | - _target_: torchvision.transforms.Normalize 90 | mean: [0.5,] 91 | std: [0.5,] 92 | depth_static: 93 | - _target_: torchvision.transforms.Resize 94 | size: 200 95 | depth_gripper: 96 | - _target_: torchvision.transforms.Resize 97 | size: 84 98 | rgb_tactile: 99 | - _target_: torchvision.transforms.Resize 100 | size: 70 101 | - _target_: torchvision.transforms.RandomCrop 102 | size: 64 103 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 104 | - _target_: torchvision.transforms.Normalize 105 | mean: [0.5] 106 | std: [0.5] 107 | depth_tactile: 108 | - _target_: torchvision.transforms.Resize 109 | size: 64 110 | - _target_: torchvision.transforms.Normalize 111 | mean: [0.1,] 112 | std: [0.2,] 113 | robot_obs: 114 | - _target_: calvin_agent.utils.transforms.NormalizeVector 115 | scene_obs: 116 | - _target_: calvin_agent.utils.transforms.NormalizeVector 117 | -------------------------------------------------------------------------------- /conf/datamodule/transforms/rand_shift.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | rgb_static: 3 | - _target_: torchvision.transforms.Resize 4 | size: 200 5 | - _target_: hulc.utils.transforms.RandomShiftsAug 6 | pad: 10 7 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 8 | - _target_: torchvision.transforms.Normalize 9 | mean: [0.5,] 10 | std: [0.5,] 11 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 12 | # mean: [0.0] 13 | # std: [0.01] 14 | rgb_gripper: 15 | - _target_: torchvision.transforms.Resize 16 | size: 84 17 | - _target_: hulc.utils.transforms.RandomShiftsAug 18 | pad: 4 19 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 20 | - _target_: torchvision.transforms.Normalize 21 | mean: [0.5,] 22 | std: [0.5,] 23 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 24 | # mean: [0.0] 25 | # std: [0.01] 26 | depth_static: 27 | - _target_: torchvision.transforms.Resize 28 | size: 200 29 | - _target_: calvin_agent.utils.transforms.AddDepthNoise 30 | shape: [1000.0] 31 | rate: [1000.0] 32 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 33 | # mean: [0.0] 34 | # std: [0.01] 35 | depth_gripper: 36 | - _target_: torchvision.transforms.Resize 37 | size: 
84 38 | # - _target_: calvin.utils.transforms.AddDepthNoise 39 | # shape: [ 1000.0 ] 40 | # rate: [ 1000.0 ] 41 | - _target_: calvin_agent.utils.transforms.AddGaussianNoise 42 | mean: [ 0.0 ] 43 | std: [ 0.01 ] 44 | rgb_tactile: 45 | - _target_: torchvision.transforms.Resize 46 | size: 70 47 | - _target_: torchvision.transforms.RandomCrop 48 | size: 64 49 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 50 | - _target_: torchvision.transforms.Normalize 51 | mean: [0.5] 52 | std: [0.5] 53 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 54 | # mean: [ 0.0 ] 55 | # std: [ 0.01 ] 56 | depth_tactile: 57 | - _target_: torchvision.transforms.Resize 58 | size: 64 59 | - _target_: torchvision.transforms.Normalize 60 | mean: [0.1,] 61 | std: [0.2,] 62 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 63 | # mean: [ 0.0 ] 64 | # std: [ 0.01 ] 65 | robot_obs: 66 | - _target_: calvin_agent.utils.transforms.NormalizeVector 67 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 68 | # mean: [ 0.0 ] 69 | # std: [ 0.01 ] 70 | scene_obs: 71 | - _target_: calvin_agent.utils.transforms.NormalizeVector 72 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 73 | # mean: [ 0.0 ] 74 | # std: [ 0.01 ] 75 | # language: 76 | # - _target_: calvin_agent.utils.transforms.AddGaussianNoise 77 | # mean: [ 0.0 ] 78 | # std: [ 0.01 ] 79 | 80 | 81 | val: 82 | rgb_static: 83 | - _target_: torchvision.transforms.Resize 84 | size: 200 85 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 86 | - _target_: torchvision.transforms.Normalize 87 | mean: [0.5,] 88 | std: [0.5,] 89 | rgb_gripper: 90 | - _target_: torchvision.transforms.Resize 91 | size: 84 92 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 93 | - _target_: torchvision.transforms.Normalize 94 | mean: [0.5,] 95 | std: [0.5,] 96 | depth_static: 97 | - _target_: torchvision.transforms.Resize 98 | size: 200 99 | depth_gripper: 100 | - _target_: torchvision.transforms.Resize 101 | size: 84 102 | rgb_tactile: 103 | - _target_: torchvision.transforms.Resize 104 | size: 70 105 | - _target_: torchvision.transforms.RandomCrop 106 | size: 64 107 | - _target_: calvin_agent.utils.transforms.ScaleImageTensor 108 | - _target_: torchvision.transforms.Normalize 109 | mean: [0.5] 110 | std: [0.5] 111 | depth_tactile: 112 | - _target_: torchvision.transforms.Resize 113 | size: 64 114 | - _target_: torchvision.transforms.Normalize 115 | mean: [0.1,] 116 | std: [0.2,] 117 | robot_obs: 118 | - _target_: calvin_agent.utils.transforms.NormalizeVector 119 | scene_obs: 120 | - _target_: calvin_agent.utils.transforms.NormalizeVector 121 | -------------------------------------------------------------------------------- /conf/inference/config_inference.yaml: -------------------------------------------------------------------------------- 1 | train_folder: ??? # config path to the config.yaml of the training folder (in .hydra) 2 | load_checkpoint: ??? 
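# Note: the two fields above are usually supplied as Hydra overrides on the command line
# (paths here are hypothetical, for illustration only), e.g.
#   python hulc/evaluation/rollouts_interactive.py train_folder=/path/to/training/logdir
# load_checkpoint is optional: if it is not set, the last checkpoint of the run is used.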
3 | seed: 42 4 | log_dir: /tmp 5 | visualize: True 6 | ep_len: 120 7 | replan_freq: 30 8 | processes: 1 9 | 10 | hydra: 11 | run: 12 | dir: ${log_dir}/inference_runs/${now:%Y-%m-%d}/${now:%H-%M-%S} 13 | 14 | defaults: 15 | - override hydra/job_logging: colorlog 16 | - override hydra/hydra_logging: colorlog 17 | -------------------------------------------------------------------------------- /conf/lang_ann.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - callbacks: default 3 | - datamodule: default 4 | - model: sbert 5 | - loss: default 6 | - training: default_training 7 | - trainer: play_trainer 8 | - logger: wandb 9 | - annotations@train_instructions: new_playtable 10 | - annotations@val_instructions: new_playtable_validation 11 | 12 | - override hydra/job_logging: colorlog 13 | - override hydra/hydra_logging: colorlog 14 | - override datamodule/observation_space: state_only 15 | seed: 42 16 | log_dir: ../ 17 | slurm: false 18 | eps: 0.01 19 | postprocessing: true 20 | lang_folder: "lang_annotations" 21 | with_text: false 22 | reannotate: false 23 | prior_steps_window: 16 24 | validation_scene: calvin_scene_D 25 | compute_tsne: false 26 | 27 | hydra: 28 | run: 29 | dir: ${log_dir}/runs/${now:%Y-%m-%d}/${now:%H-%M-%S}_${hydra.job.override_dirname} 30 | sweep: 31 | dir: ${log_dir}/runs/${now:%Y-%m-%d}/${now:%H-%M-%S} 32 | subdir: ${hydra.job.override_dirname} 33 | job: 34 | config: 35 | override_dirname: 36 | exclude_keys: 37 | - log_dir 38 | - datamodule.root_data_dir 39 | - trainer.gpus 40 | - model.tsne_plot 41 | - datamodule.num_workers 42 | - trainer.limit_train_batches 43 | - trainer.limit_val_batches 44 | - model.decoder.load_action_bounds 45 | -------------------------------------------------------------------------------- /conf/logger/tb_logger.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.loggers.TensorBoardLogger 2 | save_dir: . 3 | name: play_lmp 4 | version: "" 5 | -------------------------------------------------------------------------------- /conf/logger/wandb.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.loggers.WandbLogger 2 | save_dir: . 3 | name: play_lmp 4 | group: play_lmp 5 | log_model: false 6 | project: "multi_play" 7 | entity: "multimodal_control" 8 | id: ??? 9 | -------------------------------------------------------------------------------- /conf/loss/default.yaml: -------------------------------------------------------------------------------- 1 | kl_beta: 0.01 2 | state_recon_beta: 0.5 3 | kl_balancing_mix: 0.8 4 | bc_z_auxiliary_loss_beta: 1.0 5 | mia_auxiliary_loss_beta: 1.0 6 | clip_auxiliary_loss_beta: 3.0 7 | -------------------------------------------------------------------------------- /conf/model/action_decoder/deterministic.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.decoders.deterministic_decoder.DeterministicDecoder 2 | hidden_size: 2048 3 | out_features: ${datamodule.action_space} 4 | policy_rnn_dropout_p: 0.0 5 | perceptual_features: ?? 6 | latent_goal_features: ${model.visual_goal.latent_goal_features} 7 | plan_features: ??? 
8 | criterion: HuberLoss # MSELoss 9 | num_layers: 2 10 | rnn_model: rnn_decoder 11 | perceptual_emb_slice: [64, 128] 12 | gripper_control: true 13 | -------------------------------------------------------------------------------- /conf/model/action_decoder/hulc_default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.decoders.logistic_decoder_rnn.LogisticDecoderRNN 2 | n_mixtures: 10 3 | hidden_size: 2048 4 | out_features: ${datamodule.action_space} 5 | log_scale_min: -7.0 6 | act_max_bound: ${datamodule.action_max} 7 | act_min_bound: ${datamodule.action_min} 8 | dataset_dir: ${datamodule.root_data_dir} 9 | load_action_bounds: false 10 | num_classes: 10 11 | latent_goal_features: ${model.visual_goal.latent_goal_features} 12 | plan_features: ??? 13 | perceptual_features: ??? 14 | gripper_alpha: 1.0 15 | perceptual_emb_slice: [64, 128] 16 | policy_rnn_dropout_p: 0.0 17 | num_layers: 2 18 | rnn_model: rnn_decoder 19 | gripper_control: true 20 | discrete_gripper: true 21 | -------------------------------------------------------------------------------- /conf/model/action_decoder/mcil_default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.decoders.logistic_decoder_rnn.LogisticDecoderRNN 2 | n_mixtures: 10 3 | hidden_size: 2048 4 | out_features: ${datamodule.action_space} 5 | log_scale_min: -7.0 6 | act_max_bound: ${datamodule.action_max} 7 | act_min_bound: ${datamodule.action_min} 8 | dataset_dir: ${datamodule.root_data_dir} 9 | load_action_bounds: false 10 | num_classes: 256 11 | latent_goal_features: ${model.visual_goal.latent_goal_features} 12 | plan_features: ??? 13 | perceptual_features: ??? 14 | gripper_alpha: 1.0 15 | policy_rnn_dropout_p: 0.0 16 | num_layers: 2 17 | rnn_model: rnn_decoder 18 | gripper_control: false 19 | discrete_gripper: false 20 | -------------------------------------------------------------------------------- /conf/model/bc_z_lang_decoder/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.auxiliary_loss_networks.bc_z_lang_decoder.BCZLangDecoder 2 | in_features: ${model.plan_recognition.fc_hidden_size} 3 | lang_dim: ${model.language_goal.in_features} 4 | -------------------------------------------------------------------------------- /conf/model/bc_z_lang_decoder/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/bc_z_lang_decoder/none.yaml -------------------------------------------------------------------------------- /conf/model/clip_lang.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.encoders.clip_lang_encoder.LangClip 2 | freeze_backbone: true 3 | model_name: "RN50" # "RN101", "RN50x4", "RN50x16", "ViT-B/32", "ViT-B/16" 4 | -------------------------------------------------------------------------------- /conf/model/distribution/continuous.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.utils.distributions.Distribution 2 | dist: "continuous" 3 | plan_features: 256 4 | -------------------------------------------------------------------------------- /conf/model/distribution/discrete.yaml: -------------------------------------------------------------------------------- 1 | _target_: 
hulc.utils.distributions.Distribution 2 | dist: "discrete" 3 | category_size: 32 4 | class_size: 32 5 | -------------------------------------------------------------------------------- /conf/model/gcbc.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - perceptual_encoder: gripper_cam 3 | - plan_proposal: default 4 | - plan_recognition: transformers 5 | - distribution: discrete 6 | - visual_goal: default 7 | - language_goal: default 8 | - action_decoder: hulc_default 9 | - optimizer: adam 10 | - lr_scheduler: constant 11 | - bc_z_lang_decoder: none 12 | - mia_lang_discriminator: none 13 | - proj_vis_lang: default 14 | - /annotations@val_instructions: new_playtable_validation 15 | 16 | _target_: hulc.models.gcbc.GCBC 17 | _recursive_: false 18 | 19 | kl_beta: ${loss.kl_beta} 20 | kl_balancing_mix: ${loss.kl_balancing_mix} 21 | state_recons: false 22 | state_recon_beta: ${loss.state_recon_beta} 23 | use_bc_z_auxiliary_loss: false 24 | bc_z_auxiliary_loss_beta: ${loss.bc_z_auxiliary_loss_beta} 25 | use_mia_auxiliary_loss: false 26 | mia_auxiliary_loss_beta: ${loss.mia_auxiliary_loss_beta} 27 | replan_freq: 30 28 | use_clip_auxiliary_loss: true 29 | clip_auxiliary_loss_beta: ${loss.clip_auxiliary_loss_beta} 30 | -------------------------------------------------------------------------------- /conf/model/hulc.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - perceptual_encoder: gripper_cam 3 | - plan_proposal: default 4 | - plan_recognition: transformers 5 | - distribution: discrete 6 | - visual_goal: default 7 | - language_goal: default 8 | - action_decoder: hulc_default 9 | - optimizer: adam 10 | - lr_scheduler: constant 11 | - bc_z_lang_decoder: none 12 | - mia_lang_discriminator: none 13 | - proj_vis_lang: default 14 | - /annotations@val_instructions: new_playtable_validation 15 | 16 | _target_: hulc.models.hulc.Hulc 17 | _recursive_: false 18 | 19 | kl_beta: ${loss.kl_beta} 20 | kl_balancing_mix: ${loss.kl_balancing_mix} 21 | state_recons: false 22 | state_recon_beta: ${loss.state_recon_beta} 23 | use_bc_z_auxiliary_loss: false 24 | bc_z_auxiliary_loss_beta: ${loss.bc_z_auxiliary_loss_beta} 25 | use_mia_auxiliary_loss: false 26 | mia_auxiliary_loss_beta: ${loss.mia_auxiliary_loss_beta} 27 | replan_freq: 30 28 | use_clip_auxiliary_loss: true 29 | clip_auxiliary_loss_beta: ${loss.clip_auxiliary_loss_beta} 30 | -------------------------------------------------------------------------------- /conf/model/language_encoder/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.encoders.lang_encoder.LanguageEncoder 2 | language_features: 384 3 | hidden_size: 2048 4 | out_features: 256 5 | word_dropout_p: 0.0 6 | activation_function: ReLU #ELU 7 | -------------------------------------------------------------------------------- /conf/model/language_encoder/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/language_encoder/none.yaml -------------------------------------------------------------------------------- /conf/model/language_goal/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.encoders.goal_encoders.LanguageGoalEncoder 2 | in_features: 384 3 | hidden_size: 2048 4 | latent_goal_features: 32 5 | 
l2_normalize_goal_embeddings: False 6 | activation_function: ReLU #ELU 7 | word_dropout_p: 0.0 8 | -------------------------------------------------------------------------------- /conf/model/language_goal/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/language_goal/none.yaml -------------------------------------------------------------------------------- /conf/model/lr_scheduler/constant.yaml: -------------------------------------------------------------------------------- 1 | _target_: transformers.get_constant_schedule 2 | -------------------------------------------------------------------------------- /conf/model/lr_scheduler/cosine_schedule_with_warmup.yaml: -------------------------------------------------------------------------------- 1 | _target_: transformers.get_cosine_schedule_with_warmup 2 | num_training_steps: -1 # -1 specifies to infer number of training steps 3 | num_warmup_steps: 0.1 # float values determines percentage of training steps to use as warmup 4 | num_cycles: 0.5 5 | -------------------------------------------------------------------------------- /conf/model/lr_scheduler/linear_schedule_with_warmup.yaml: -------------------------------------------------------------------------------- 1 | _target_: transformers.get_linear_schedule_with_warmup 2 | num_training_steps: -1 # -1 specifies to infer number of training steps 3 | num_warmup_steps: 0.1 # float values determines percentage of training steps to use as warmup 4 | -------------------------------------------------------------------------------- /conf/model/mcil.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - perceptual_encoder: gripper_cam 3 | - plan_proposal: default 4 | - plan_recognition: birnn 5 | - distribution: continuous 6 | - visual_goal: default 7 | - language_goal: default 8 | - action_decoder: mcil_default 9 | - optimizer: adam 10 | - lr_scheduler: constant 11 | - bc_z_lang_decoder: none 12 | - mia_lang_discriminator: none 13 | - proj_vis_lang: none 14 | - /annotations@val_instructions: new_playtable_validation 15 | 16 | _target_: hulc.models.hulc.Hulc 17 | _recursive_: false 18 | 19 | kl_beta: ${loss.kl_beta} 20 | kl_balancing_mix: ${loss.kl_balancing_mix} 21 | state_recons: false 22 | state_recon_beta: ${loss.state_recon_beta} 23 | use_bc_z_auxiliary_loss: false 24 | bc_z_auxiliary_loss_beta: ${loss.bc_z_auxiliary_loss_beta} 25 | use_mia_auxiliary_loss: false 26 | mia_auxiliary_loss_beta: ${loss.mia_auxiliary_loss_beta} 27 | replan_freq: 30 28 | use_clip_auxiliary_loss: false 29 | clip_auxiliary_loss_beta: ${loss.clip_auxiliary_loss_beta} 30 | -------------------------------------------------------------------------------- /conf/model/mia_lang_discriminator/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.auxiliary_loss_networks.mia_lang_discriminator.MIALangDiscriminator 2 | in_features: ${model.proj_vis_lang.output_dim} 3 | lang_dim: ${model.proj_vis_lang.output_dim} 4 | dropout_p: 0.0 5 | -------------------------------------------------------------------------------- /conf/model/mia_lang_discriminator/none.yaml: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/mia_lang_discriminator/none.yaml -------------------------------------------------------------------------------- /conf/model/optimizer/adam.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.Adam 2 | lr: ${training.lr} 3 | #weight_decay: 1e-6 4 | -------------------------------------------------------------------------------- /conf/model/optimizer/adamw.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.AdamW 2 | lr: ${training.lr} 3 | weight_decay: 1e-6 4 | #amsgrad: False 5 | -------------------------------------------------------------------------------- /conf/model/optimizer/sgd.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.SGD 2 | lr: ${training.lr} 3 | momentum: 0.9 4 | #weight_decay: 0.0005 5 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.perceptual_encoders.concat_encoders.ConcatEncoders 2 | _recursive_: false 3 | 4 | defaults: 5 | - rgb_static: default 6 | - rgb_gripper: none 7 | - depth_static: none 8 | - depth_gripper: none 9 | - proprio: none 10 | - tactile: none 11 | - state_decoder: none 12 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/depth_gripper/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.perceptual_encoders.vision_network_gripper.VisionNetwork 2 | input_width: 84 3 | input_height: 84 4 | activation_function: ReLU #ELU 5 | dropout_vis_fc: 0.0 6 | l2_normalize_output: false 7 | visual_features: 64 8 | conv_encoder: nature_cnn 9 | num_c: 1 10 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/depth_gripper/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/depth_gripper/none.yaml -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/depth_static/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.perceptual_encoders.vision_network.VisionNetwork 2 | input_width: 200 3 | input_height: 200 4 | activation_function: ReLU #ELU 5 | dropout_vis_fc: 0.0 6 | l2_normalize_output: false 7 | visual_features: 64 8 | num_c: 1 9 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/depth_static/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/depth_static/none.yaml -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/gripper_cam.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.perceptual_encoders.concat_encoders.ConcatEncoders 2 | _recursive_: false 3 | 4 | defaults: 5 | - 
rgb_static: default 6 | - rgb_gripper: default 7 | - depth_static: none 8 | - depth_gripper: none 9 | - proprio: none 10 | - tactile: none 11 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/proprio/identity.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.perceptual_encoders.proprio_encoder.IdentityEncoder 2 | proprioception_dims: ${datamodule.proprioception_dims} 3 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/proprio/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/proprio/none.yaml -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/rgb_gripper/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.perceptual_encoders.vision_network_gripper.VisionNetwork 2 | input_width: 84 3 | input_height: 84 4 | activation_function: ReLU #ELU 5 | dropout_vis_fc: 0.0 6 | l2_normalize_output: false 7 | visual_features: 64 8 | conv_encoder: nature_cnn 9 | num_c: 3 10 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/rgb_gripper/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/rgb_gripper/none.yaml -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/rgb_static/clip.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.perceptual_encoders.vision_clip.VisionClip 2 | visual_features: 64 3 | freeze_backbone: true 4 | model_name: "RN50" # "RN101", "RN50x4", "RN50x16", "ViT-B/32", "ViT-B/16" 5 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/rgb_static/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.perceptual_encoders.vision_network.VisionNetwork 2 | input_width: 200 3 | input_height: 200 4 | activation_function: ReLU #ELU 5 | dropout_vis_fc: 0.0 6 | l2_normalize_output: false 7 | visual_features: 64 8 | num_c: 3 9 | use_sinusoid: false 10 | spatial_softmax_temp: 1.0 11 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/state_decoder/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.auxiliary_loss_networks.state_decoder.StateDecoder 2 | visual_features: 64 3 | n_state_obs: 8 4 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/state_decoder/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/state_decoder/none.yaml -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/tactile/default.yaml: 
-------------------------------------------------------------------------------- 1 | _target_: calvin.models.perceptual_encoders.tactile_encoder.TactileEncoder 2 | visual_features: 64 3 | -------------------------------------------------------------------------------- /conf/model/perceptual_encoder/tactile/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/perceptual_encoder/tactile/none.yaml -------------------------------------------------------------------------------- /conf/model/plan_proposal/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.plan_encoders.plan_proposal_net.PlanProposalNetwork 2 | perceptual_features: ??? 3 | latent_goal_features: ${model.visual_goal.latent_goal_features} 4 | plan_features: ??? 5 | activation_function: ReLU #ELU 6 | hidden_size: 2048 7 | -------------------------------------------------------------------------------- /conf/model/plan_recognition/birnn.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.plan_encoders.plan_recognition_net.PlanRecognitionBiRNNNetwork 2 | in_features: ??? 3 | plan_features: 256 4 | action_space: ${datamodule.action_space} 5 | birnn_dropout_p: 0.0 6 | rnn_type: nn.RNN # nn.GRU 7 | -------------------------------------------------------------------------------- /conf/model/plan_recognition/transformers.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.plan_encoders.plan_recognition_net.PlanRecognitionTransformersNetwork 2 | num_heads: 8 3 | num_layers: 2 4 | encoder_hidden_size: 2048 5 | fc_hidden_size: 4096 6 | in_features: ?? 7 | plan_features: ??? 8 | action_space: ${datamodule.action_space} 9 | dropout_p: 0.1 10 | encoder_normalize: false 11 | positional_normalize: false 12 | position_embedding: true 13 | max_position_embeddings: ${datamodule.datasets.lang_dataset.max_window_size} 14 | -------------------------------------------------------------------------------- /conf/model/proj_vis_lang/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.auxiliary_loss_networks.proj_vis_lang.ProjVisLang 2 | im_dim: ${model.plan_recognition.fc_hidden_size} 3 | lang_dim: ${model.language_goal.latent_goal_features} 4 | output_dim: ${model.language_goal.latent_goal_features} 5 | proj_lang: true 6 | -------------------------------------------------------------------------------- /conf/model/proj_vis_lang/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/conf/model/proj_vis_lang/none.yaml -------------------------------------------------------------------------------- /conf/model/sbert.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.encoders.language_network.SBert 2 | nlp_model: "all-MiniLM-L6-v2" 3 | -------------------------------------------------------------------------------- /conf/model/visual_goal/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: hulc.models.encoders.goal_encoders.VisualGoalEncoder 2 | in_features: ??? 
3 | hidden_size: 2048 4 | latent_goal_features: 32 5 | l2_normalize_goal_embeddings: False 6 | activation_function: ReLU #ELU 7 | -------------------------------------------------------------------------------- /conf/trainer/play_trainer.yaml: -------------------------------------------------------------------------------- 1 | devices: 1 2 | accelerator: gpu 3 | precision: 16 4 | val_check_interval: 1.0 5 | max_epochs: 100 6 | sync_batchnorm: false 7 | -------------------------------------------------------------------------------- /conf/training/default_training.yaml: -------------------------------------------------------------------------------- 1 | lr: 0.0002 2 | -------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | The CALVIN dataset comes with 6 hours of teleoperated play data in each of the 4 environments. 3 | You can use [this script](scripts/visualize_dataset.py) to visualize the dataset. 4 | 5 | ## Download 6 | 7 | We provide a download script to download the three different splits or a small debug dataset: 8 | 9 | **1.** [Split D->D](http://calvin.cs.uni-freiburg.de/dataset/task_D_D.zip) (166 GB): 10 | ```bash 11 | $ cd $CALVIN_ROOT/dataset 12 | $ sh download_data.sh D 13 | ``` 14 | **2.** [Split ABC->D](http://calvin.cs.uni-freiburg.de/dataset/task_ABC_D.zip) (517 GB) 15 | ```bash 16 | $ cd $CALVIN_ROOT/dataset 17 | $ sh download_data.sh ABC 18 | ``` 19 | **3.** [Split ABCD->D](http://calvin.cs.uni-freiburg.de/dataset/task_ABCD_D.zip) (656 GB) 20 | ```bash 21 | $ cd $CALVIN_ROOT/dataset 22 | $ sh download_data.sh ABCD 23 | ``` 24 | 25 | **4.** [Small debug dataset](http://calvin.cs.uni-freiburg.de/dataset/calvin_debug_dataset.zip) (1.3 GB) 26 | ```bash 27 | $ cd $CALVIN_ROOT/dataset 28 | $ sh download_data.sh debug 29 | ``` 30 | 31 | ## Language Embeddings 32 | Since Sep 16 2022, additional language embeddings are part of the dataset on the server. If you downloaded the dataset before, 33 | you can manually download the embeddings by running 34 | ``` 35 | cd $CALVIN_ROOT/dataset 36 | sh download_lang_embeddings.sh D | ABC | ABCD 37 | ``` 38 | Currently, the available embeddings are: 39 | - lang_all-distilroberta-v1 40 | - lang_all-MiniLM-L6-v2 41 | - lang_all-mpnet-base-v2 42 | - lang_BERT 43 | - lang_clip_resnet50 44 | - lang_clip_ViTB32 45 | - lang_huggingface_distilroberta 46 | - lang_huggingface_mpnet 47 | - lang_msmarco-bert-base-dot-v5 48 | - lang_paraphrase-MiniLM-L3-v2 49 | 50 | ## Data Structure 51 | Each interaction timestep is stored in a dictionary inside a numpy file and contains all corresponding sensory observations, different action spaces, state information and language annotations. 52 | ### Camera Observations 53 | The keys to access the different camera observations are: 54 | ``` 55 | ['rgb_static'] (dtype=np.uint8, shape=(200, 200, 3)), 56 | ['rgb_gripper'] (dtype=np.uint8, shape=(84, 84, 3)), 57 | ['rgb_tactile'] (dtype=np.uint8, shape=(160, 120, 6)), 58 | ['depth_static'] (dtype=np.float32, shape=(200, 200)), 59 | ['depth_gripper'] (dtype=np.float32, shape=(84, 84)), 60 | ['depth_tactile'] (dtype=np.float32, shape=(160, 120, 2)) 61 | ``` 62 | ### Actions 63 | Actions are in Cartesian space and define the desired tcp pose with respect to the world frame and the binary gripper action. 64 | The keys to access the 7-DOF absolute and relative actions are: 65 | (tcp = tool center point, i.e.
a virtual frame between the gripper finger tips of the robot) 66 | ``` 67 | ['actions'] 68 | (dtype=np.float32, shape=(7,)) 69 | tcp position (3): x,y,z in absolute world coordinates 70 | tcp orientation (3): euler angles x,y,z in absolute world coordinates 71 | gripper_action (1): binary (close = -1, open = 1) 72 | 73 | ['rel_actions'] 74 | (dtype=np.float32, shape=(7,)) 75 | tcp position (3): x,y,z in relative world coordinates normalized and clipped to (-1, 1) with scaling factor 50 76 | tcp orientation (3): euler angles x,y,z in relative world coordinates normalized and clipped to (-1, 1) with scaling factor 20 77 | gripper_action (1): binary (close = -1, open = 1) 78 | ``` 79 | For inference, Calvin env accepts both absolute and relative actions. To use absolute actions, the action is specified as a 3-tuple 80 | `action = ((x,y,z), (euler_x, euler_y, euler_z), (gripper))`. To use relative actions, the action is specified as a 81 | 7-tuple `action = (x,y,z, euler_x, euler_y, euler_z, gripper)`. IMPORTANT: the environment expects the relative actions 82 | to be scaled like the `rel_actions` in the dataset. 83 | 84 | ### State Observation 85 | The keys to access the scene state information containing the position and orientation of all objects in the scenes 86 | (we do not use them, in order to better capture challenges present in real-world settings): 87 | ``` 88 | ['scene_obs'] 89 | (dtype=np.float32, shape=(24,)) 90 | sliding door (1): joint state 91 | drawer (1): joint state 92 | button (1): joint state 93 | switch (1): joint state 94 | lightbulb (1): on=1, off=0 95 | green light (1): on=1, off=0 96 | red block (6): (x, y, z, euler_x, euler_y, euler_z) 97 | blue block (6): (x, y, z, euler_x, euler_y, euler_z) 98 | pink block (6): (x, y, z, euler_x, euler_y, euler_z) 99 | ``` 100 | The robot proprioceptive information, which also includes the joint positions, can be accessed with: 101 | ``` 102 | ['robot_obs'] 103 | (dtype=np.float32, shape=(15,)) 104 | tcp position (3): x,y,z in world coordinates 105 | tcp orientation (3): euler angles x,y,z in world coordinates 106 | gripper opening width (1): in meter 107 | arm_joint_states (7): in rad 108 | gripper_action (1): binary (close = -1, open = 1) 109 | ``` 110 | ### Language Annotations 111 | The language annotations are in a subdirectory of the train and validation folders called `lang_annotations`. 112 | The file `auto_lang_ann.npy` contains the language annotations and their embeddings, along with additional metadata such as the task id and the sequence indices. 113 | ``` 114 | ['language']['ann']: list of raw language 115 | ['language']['task']: list of task_id 116 | ['language']['emb']: precomputed miniLM language embedding 117 | ['info']['indx']: list of start and end indices corresponding to the precomputed language embeddings 118 | ``` 119 | The `embeddings.npy` file is only present in the validation folder; it contains the embeddings used during the rollouts (test inference) to condition the policy. A minimal loading sketch is shown below. 120 | 121 | ## Visualize Language Annotations 122 | We provide a script to generate a video that visualizes the language annotations of the recorded play data. 123 | By default we visualize the first 100 sequences, but feel free to visualize more sequences (just change this [line](https://github.com/mees/calvin/blob/main/calvin_models/calvin_agent/utils/visualize_annotations.py#L57)). 124 | An example video can be generated with the command below.
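Before running that command, here is a minimal sketch of loading the files described above. The dataset path and frame index are hypothetical; the keys follow this README, the episode file naming follows the `episode_{i:06d}.npz` pattern used elsewhere in this repo, and `auto_lang_ann.npy` is assumed to hold a single pickled dict (hence `allow_pickle` / `.item()`):
```python
import numpy as np

root = "/path/to/task_D_D/training"  # hypothetical dataset split folder
idx = 360948                         # hypothetical frame index

# every timestep is stored as a dict-like .npz file
episode = np.load(f"{root}/episode_{idx:06d}.npz")
print(episode["rgb_static"].shape)   # (200, 200, 3), uint8
print(episode["robot_obs"].shape)    # (15,), float32
print(episode["rel_actions"])        # 7-DOF relative action, last entry is the binary gripper action

# language annotations live in the lang_annotations subdirectory
ann = np.load(f"{root}/lang_annotations/auto_lang_ann.npy", allow_pickle=True).item()
print(ann["language"]["ann"][0])     # raw instruction text
print(ann["info"]["indx"][0])        # (start, end) frame indices for that instruction
```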
125 | ``` 126 | cd $CALVIN_ROOT/calvin_models/calvin_agent 127 | python utils/visualize_annotations.py datamodule.root_data_dir=$CALVIN_ROOT/dataset/task_D_D/ datamodule/observation_space=lang_rgb_static 128 | ``` 129 | -------------------------------------------------------------------------------- /dataset/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Download, Unzip, and Remove zip 4 | if [ "$1" = "D" ] 5 | then 6 | 7 | echo "Downloading task_D_D ..." 8 | wget http://calvin.cs.uni-freiburg.de/dataset/task_D_D.zip 9 | unzip task_D_D.zip && rm task_D_D.zip 10 | echo "saved folder: task_D_D" 11 | elif [ "$1" = "ABC" ] 12 | then 13 | 14 | echo "Downloading task_ABC_D ..." 15 | wget http://calvin.cs.uni-freiburg.de/dataset/task_ABC_D.zip 16 | unzip task_ABC_D.zip && rm task_ABC_D.zip 17 | echo "saved folder: task_ABC_D" 18 | 19 | elif [ "$1" = "ABCD" ] 20 | then 21 | 22 | echo "Downloading task_ABCD_D ..." 23 | wget http://calvin.cs.uni-freiburg.de/dataset/task_ABCD_D.zip 24 | unzip task_ABCD_D.zip && rm task_ABCD_D.zip 25 | echo "saved folder: task_ABCD_D" 26 | 27 | elif [ "$1" = "debug" ] 28 | then 29 | 30 | echo "Downloading debug dataset ..." 31 | wget http://calvin.cs.uni-freiburg.de/dataset/calvin_debug_dataset.zip 32 | unzip calvin_debug_dataset.zip && rm calvin_debug_dataset.zip 33 | echo "saved folder: calvin_debug_dataset" 34 | 35 | 36 | else 37 | echo "Failed: Usage download_data.sh D | ABC | ABCD | debug" 38 | exit 1 39 | fi 40 | -------------------------------------------------------------------------------- /dataset/download_lang_embeddings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Download, Unzip, and Remove zip 3 | if [ "$1" = "D" ] 4 | then 5 | 6 | echo "Downloading Language Embeddings for task_D_D ..." 7 | cd task_D_D 8 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/D_D_lang_embs_train.zip 9 | unzip D_D_lang_embs_train.zip && rm D_D_lang_embs_train.zip 10 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/D_D_lang_embs_val.zip 11 | unzip D_D_lang_embs_val.zip && rm D_D_lang_embs_val.zip 12 | echo "finished!" 13 | elif [ "$1" = "ABC" ] 14 | then 15 | 16 | echo "Downloading Language Embeddings for task_ABC_D ..." 17 | cd task_ABC_D 18 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/ABC_D_lang_embs_train.zip 19 | unzip ABC_D_lang_embs_train.zip && rm ABC_D_lang_embs_train.zip 20 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/ABC_D_lang_embs_val.zip 21 | unzip ABC_D_lang_embs_val.zip && rm ABC_D_lang_embs_val.zip 22 | echo "finished!" 23 | 24 | elif [ "$1" = "ABCD" ] 25 | then 26 | 27 | echo "Downloading Language Embeddings for task_ABCD_D ..." 28 | cd task_ABCD_D 29 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/ABCD_D_lang_embs_train.zip 30 | unzip ABCD_D_lang_embs_train.zip && rm ABCD_D_lang_embs_train.zip 31 | wget http://hulc.cs.uni-freiburg.de/language_embeddings/ABCD_D_lang_embs_val.zip 32 | unzip ABCD_D_lang_embs_val.zip && rm ABCD_D_lang_embs_val.zip 33 | echo "finished!" 
34 | 35 | else 36 | echo "Failed: Usage download_lang_embeddings.sh D | ABC | ABCD" 37 | exit 1 38 | fi 39 | -------------------------------------------------------------------------------- /hulc/__init__.py: -------------------------------------------------------------------------------- 1 | """Hierarchical Universal Language Conditioned Policies implementation in pytorch 2 | :copyright: 2022 by Oier Mees 3 | :license: MIT, see LICENSE for more details. 4 | """ 5 | 6 | __version__ = "0.0.1" 7 | __project__ = "HULC" 8 | __author__ = "Oier Mees" 9 | __license__ = "MIT" 10 | __email__ = "meeso@informatik.uni-freiburg.de" 11 | -------------------------------------------------------------------------------- /hulc/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/evaluation/__init__.py -------------------------------------------------------------------------------- /hulc/evaluation/evaluate_policy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | import sys 5 | 6 | # This is for using the locally installed repo clone when using slurm 7 | from calvin_agent.evaluation.evaluate_policy import evaluate_policy 8 | 9 | sys.path.insert(0, Path(__file__).absolute().parents[2].as_posix()) 10 | from calvin_agent.evaluation.utils import get_default_model_and_env 11 | from calvin_agent.utils.utils import get_all_checkpoints, get_checkpoints_for_epochs, get_last_checkpoint 12 | from pytorch_lightning import seed_everything 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def get_epoch(checkpoint): 18 | if "=" not in checkpoint.stem: 19 | return "0" 20 | return checkpoint.stem.split("=")[1] 21 | 22 | 23 | def main(): 24 | seed_everything(0, workers=True) # type:ignore 25 | parser = argparse.ArgumentParser(description="Evaluate a trained model on multistep sequences with language goals.") 26 | parser.add_argument("--dataset_path", type=str, help="Path to the dataset root directory.") 27 | 28 | # arguments for loading default model 29 | parser.add_argument( 30 | "--train_folder", type=str, help="If calvin_agent was used to train, specify path to the log dir." 31 | ) 32 | parser.add_argument( 33 | "--checkpoints", 34 | type=str, 35 | default=None, 36 | help="Comma separated list of epochs for which checkpoints will be loaded", 37 | ) 38 | parser.add_argument( 39 | "--checkpoint", 40 | type=str, 41 | default=None, 42 | help="Path of the checkpoint", 43 | ) 44 | parser.add_argument( 45 | "--last_k_checkpoints", 46 | type=int, 47 | help="Specify the number of checkpoints you want to evaluate (starting from last).
Only used for calvin_agent.", 48 | ) 49 | 50 | parser.add_argument("--debug", action="store_true", help="Print debug info and visualize environment.") 51 | 52 | parser.add_argument("--eval_log_dir", default=None, type=str, help="Where to log the evaluation results.") 53 | 54 | parser.add_argument("--device", default=0, type=int, help="CUDA device") 55 | args = parser.parse_args() 56 | 57 | assert "train_folder" in args 58 | 59 | checkpoints = [] 60 | if args.checkpoints is None and args.last_k_checkpoints is None and args.checkpoint is None: 61 | print("Evaluating model with last checkpoint.") 62 | checkpoints = [get_last_checkpoint(Path(args.train_folder))] 63 | elif args.checkpoints is not None: 64 | print(f"Evaluating model with checkpoints {args.checkpoints}.") 65 | checkpoints = get_checkpoints_for_epochs(Path(args.train_folder), args.checkpoints) 66 | elif args.checkpoints is None and args.last_k_checkpoints is not None: 67 | print(f"Evaluating model with last {args.last_k_checkpoints} checkpoints.") 68 | checkpoints = get_all_checkpoints(Path(args.train_folder))[-args.last_k_checkpoints :] 69 | elif args.checkpoint is not None: 70 | checkpoints = [Path(args.checkpoint)] 71 | 72 | env = None 73 | for checkpoint in checkpoints: 74 | epoch = get_epoch(checkpoint) 75 | model, env, _ = get_default_model_and_env( 76 | args.train_folder, 77 | args.dataset_path, 78 | checkpoint, 79 | env=env, 80 | device_id=args.device, 81 | ) 82 | evaluate_policy(model, env, epoch, eval_log_dir=args.eval_log_dir, debug=args.debug, create_plan_tsne=True) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /hulc/evaluation/rollouts_interactive.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | from calvin_agent.evaluation.utils import imshow_tensor 5 | from calvin_agent.utils.utils import get_last_checkpoint 6 | import cv2 7 | import hydra 8 | import numpy as np 9 | from omegaconf import DictConfig, OmegaConf 10 | from omegaconf.errors import MissingMandatoryValue 11 | from pytorch_lightning import seed_everything 12 | import torch 13 | 14 | from hulc.models.hulc import Hulc 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def get_checkpoint(cfg): 20 | try: 21 | checkpoint = cfg.load_checkpoint 22 | except MissingMandatoryValue: 23 | checkpoint = get_last_checkpoint(Path(cfg.train_folder)) 24 | return checkpoint 25 | 26 | 27 | def format_sftp_path(cfg): 28 | """ 29 | When using network mount from nautilus, format path 30 | """ 31 | if cfg.train_folder.startswith("sftp"): 32 | cfg.train_folder = "/run/user/9984/gvfs/sftp:host=" + cfg.train_folder[7:] 33 | 34 | 35 | @hydra.main(config_path="../../conf/inference", config_name="config_inference") 36 | def test_policy(input_cfg: DictConfig) -> None: 37 | """ 38 | Run inference on trained policy. 39 | Arguments: 40 | train_folder (str): path of trained model. 41 | load_checkpoint (str): optional model checkpoint. If not specified, the last checkpoint is taken by default. 42 | +datamodule.root_data_dir (str): /path/dataset when running inference on another machine than were it was trained 43 | visualize (bool): wether to visualize the policy rollouts (default True). 
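        Example (hypothetical paths, for illustration only):
            python hulc/evaluation/rollouts_interactive.py train_folder=/path/to/training/logdir +datamodule.root_data_dir=/path/to/calvin/dataset/task_D_D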
44 | """ 45 | # when mounting remote folder with sftp, format path 46 | format_sftp_path(input_cfg) 47 | # load config used during training 48 | train_cfg_path = Path(input_cfg.train_folder) / ".hydra/config.yaml" 49 | train_cfg = OmegaConf.load(train_cfg_path) 50 | 51 | # merge configs to keep current cmd line overrides 52 | cfg = OmegaConf.merge(train_cfg, input_cfg) 53 | seed_everything(cfg.seed) 54 | 55 | # since we don't use the trainer during inference, manually set up data_module 56 | data_module = hydra.utils.instantiate(cfg.datamodule, num_workers=4) 57 | data_module.prepare_data() 58 | data_module.setup() 59 | dataloader = data_module.val_dataloader() 60 | dataset = dataloader.dataset.datasets["vis"] 61 | env = hydra.utils.instantiate(cfg.callbacks.rollout.env_cfg, dataset, torch.device("cuda:0"), show_gui=False) 62 | 63 | tasks = hydra.utils.instantiate(cfg.callbacks.rollout.tasks) 64 | checkpoint = get_checkpoint(cfg) 65 | logger.info("Loading model from checkpoint.") 66 | model = Hulc.load_from_checkpoint(checkpoint) 67 | model.freeze() 68 | # model.action_decoder._setup_action_bounds(cfg.datamodule.root_data_dir, None, None) 69 | model = model.cuda(0) 70 | logger.info("Successfully loaded model.") 71 | 72 | ep_start_end_ids = np.sort(np.load(dataset.abs_datasets_dir / "ep_start_end_ids.npy"), axis=0) 73 | 74 | for s, e in ep_start_end_ids: 75 | i = start_i = s 76 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz" 77 | data = np.load(file) 78 | obs = env.reset(scene_obs=data["scene_obs"], robot_obs=data["robot_obs"]) 79 | start_info = env.get_info() 80 | current_img_obs = start_img_obs = obs["rgb_obs"] 81 | start_state_obs = obs["state_obs"] 82 | goal_imgs = obs["rgb_obs"] 83 | goal_state = obs["state_obs"] 84 | scene_obs = data["scene_obs"] 85 | robot_obs = data["robot_obs"] 86 | while 1: 87 | imshow_tensor("current_img", current_img_obs[0], wait=1) 88 | imshow_tensor("start", start_img_obs[0], wait=1) 89 | imshow_tensor("goal", goal_imgs[0], wait=1) 90 | cv2.imshow("keylistener", np.zeros((300, 300))) 91 | k = cv2.waitKey(0) % 256 92 | if k == ord("s"): 93 | start_info = env.get_info() 94 | start_img_obs = obs["rgb_obs"] 95 | start_state_obs = obs["state_obs"] 96 | scene_obs = data["scene_obs"] 97 | robot_obs = data["robot_obs"] 98 | start_i = i 99 | elif k == ord("w"): 100 | end_info = env.get_info() 101 | print(tasks.get_task_info(start_info, end_info)) 102 | goal_imgs = obs["rgb_obs"] 103 | goal_state = obs["state_obs"] 104 | print(f"steps: {i - start_i}") 105 | elif k == ord("r"): 106 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz" 107 | data = np.load(file) 108 | obs = env.reset(scene_obs=data["scene_obs"]) 109 | current_img_obs = obs["rgb_obs"] 110 | elif k == ord("a"): 111 | i -= 1 112 | i = np.clip(i, s, e) 113 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz" 114 | data = np.load(file) 115 | obs = env.reset(scene_obs=data["scene_obs"], robot_obs=data["robot_obs"]) 116 | current_img_obs = obs["rgb_obs"] 117 | 118 | elif k == ord("d"): 119 | i += 1 120 | i = np.clip(i, s, e) 121 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz" 122 | data = np.load(file) 123 | obs = env.reset(scene_obs=data["scene_obs"], robot_obs=data["robot_obs"]) 124 | current_img_obs = obs["rgb_obs"] 125 | elif k == ord("q"): 126 | i -= 100 127 | i = np.clip(i, s, e) 128 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz" 129 | data = np.load(file) 130 | obs = env.reset(scene_obs=data["scene_obs"], robot_obs=data["robot_obs"]) 131 | current_img_obs = 
obs["rgb_obs"] 132 | 133 | elif k == ord("e"): 134 | i += 100 135 | i = np.clip(i, s, e) 136 | file = dataset.abs_datasets_dir / f"episode_{i:06d}.npz" 137 | data = np.load(file) 138 | obs = env.reset(scene_obs=data["scene_obs"], robot_obs=data["robot_obs"]) 139 | current_img_obs = obs["rgb_obs"] 140 | 141 | elif k == ord("f"): 142 | env.reset(scene_obs=scene_obs, robot_obs=robot_obs) 143 | rollout(model, env, tasks, cfg, start_info, start_img_obs, start_state_obs, goal_imgs, goal_state) 144 | obs = env.reset(scene_obs=scene_obs, robot_obs=robot_obs) 145 | current_img_obs = obs["rgb_obs"] 146 | i = start_i 147 | elif k == ord("n"): # ESC 148 | break 149 | 150 | 151 | def rollout(model, env, tasks, cfg, start_info, current_img_obs, current_state_obs, goal_imgs, goal_state): 152 | # goal image is last step of the episode 153 | # goal_imgs = [goal_img.unsqueeze(0).cuda() for goal_img in goal_imgs] 154 | goal_imgs = goal_imgs[0].contiguous() 155 | for step in range(cfg.ep_len): 156 | # replan every replan_freq steps (default 30 i.e every second) 157 | if step % cfg.replan_freq == 0: 158 | plan, latent_goal = model.get_pp_plan_vision( 159 | current_img_obs, goal_imgs, current_state_obs, goal_state 160 | ) # type: ignore 161 | imshow_tensor("current_img", current_img_obs[0], wait=1) 162 | 163 | # use plan to predict actions with current observations 164 | action = model.predict_with_plan(current_img_obs, current_state_obs, latent_goal, plan) 165 | obs, _, _, current_info = env.step(action) 166 | # check if current step solves a task 167 | current_task_info = tasks.get_task_info(start_info, current_info) 168 | if len(current_task_info) > 0: 169 | print(current_task_info) 170 | # update current observation 171 | current_img_obs = obs["rgb_obs"] 172 | current_state_obs = obs["state_obs"] 173 | 174 | 175 | if __name__ == "__main__": 176 | test_policy() 177 | -------------------------------------------------------------------------------- /hulc/evaluation/run_multiple.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import multiprocessing 3 | import os 4 | from pathlib import Path 5 | import subprocess 6 | 7 | from calvin_agent.utils.utils import get_all_checkpoints 8 | import numpy as np 9 | 10 | 11 | def get_log_dir(log_dir): 12 | log_dir = Path(log_dir) 13 | os.makedirs(log_dir, exist_ok=True) 14 | return log_dir 15 | 16 | 17 | def intervals(parts, duration): 18 | part_duration = duration / parts 19 | return [str(int(i * part_duration)) + "-" + str(int(((i + 1) * part_duration) - 1)) for i in range(parts)] 20 | 21 | 22 | def main(): 23 | """ 24 | This script calls the evaluate.sh script of the specified training_dir 8 times with different checkpoints 25 | """ 26 | parser = argparse.ArgumentParser(description="Evaluate a trained model on multistep sequences with language goals.") 27 | parser.add_argument("--dataset_path", type=str, help="Path to the dataset root directory.") 28 | 29 | parser.add_argument( 30 | "--train_folder", type=str, help="If calvin_agent was used to train, specify path to the log dir." 31 | ) 32 | parser.add_argument("--max_epoch", type=int, default=30, help="Evaluate until which epoch.") 33 | parser.add_argument( 34 | "--eval_log_dir", type=str, help="If calvin_agent was used to train, specify path to the log dir." 
35 | ) 36 | 37 | args = parser.parse_args() 38 | eval_log_dir = get_log_dir(args.eval_log_dir) 39 | 40 | eval_script = (Path(__file__).parent / "evaluate_policy.py").as_posix() 41 | training_dir = Path(args.train_folder) 42 | checkpoints = get_all_checkpoints(training_dir) 43 | epochs = [str(e) for chk in checkpoints if (e := int(chk.stem.split("=")[1])) <= args.max_epoch] 44 | split_epochs = np.array_split(epochs, 8) 45 | epoch_args = [",".join(arr) for arr in split_epochs] 46 | max_cpu_count = multiprocessing.cpu_count() 47 | local_cpus = intervals(8, max_cpu_count) 48 | for i, epoch_arg in enumerate(epoch_args): 49 | cmd = [ 50 | "taskset", 51 | "--cpu-list", 52 | local_cpus[i], 53 | "python", 54 | eval_script, 55 | "--checkpoints", 56 | epoch_arg, 57 | "--dataset_path", 58 | args.dataset_path, 59 | "--train_folder", 60 | args.train_folder, 61 | "--eval_log_dir", 62 | args.eval_log_dir, 63 | "--device", 64 | str(i), 65 | ] 66 | std_out = eval_log_dir / f"stdout_{i}.out" 67 | std_err = eval_log_dir / f"stderr_{i}.err" 68 | with open(std_out, "wb") as out, open(std_err, "wb") as err: 69 | subprocess.Popen(cmd, stdout=out, stderr=err, preexec_fn=os.setpgrp) 70 | 71 | 72 | if __name__ == "__main__": 73 | main() 74 | -------------------------------------------------------------------------------- /hulc/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/__init__.py -------------------------------------------------------------------------------- /hulc/models/auxiliary_loss_networks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/auxiliary_loss_networks/__init__.py -------------------------------------------------------------------------------- /hulc/models/auxiliary_loss_networks/bc_z_lang_decoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class BCZLangDecoder(nn.Module): 6 | def __init__(self, in_features: int, lang_dim: int): 7 | super().__init__() 8 | # include proprio info??? 
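        # With the default configs shown above (conf/model/bc_z_lang_decoder/default.yaml),
        # in_features resolves to the plan-recognition fc_hidden_size (4096) and lang_dim to the
        # language-goal input size (384), so the MLP below regresses the sentence embedding from
        # the visual plan features; the exact sizes depend on the chosen configuration.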
9 | self.mlp = nn.Sequential( 10 | nn.Linear(in_features=in_features, out_features=512), 11 | nn.ReLU(), 12 | nn.Linear(in_features=512, out_features=lang_dim), 13 | ) 14 | 15 | def forward(self, x: torch.Tensor) -> torch.Tensor: 16 | x = self.mlp(x) 17 | return x 18 | -------------------------------------------------------------------------------- /hulc/models/auxiliary_loss_networks/mia_lang_discriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class MIALangDiscriminator(nn.Module): 6 | def __init__(self, in_features: int, lang_dim: int, dropout_p: float): 7 | super().__init__() 8 | self.mlp = nn.Sequential( 9 | nn.Linear(in_features=in_features + lang_dim, out_features=512), 10 | nn.ReLU(), 11 | nn.Dropout(dropout_p), 12 | nn.Linear(in_features=512, out_features=1), 13 | ) 14 | 15 | def forward(self, vis_emb: torch.Tensor, lang_emb: torch.Tensor) -> torch.Tensor: 16 | x = torch.cat([vis_emb, lang_emb], dim=-1) 17 | x = self.mlp(x) 18 | return x 19 | -------------------------------------------------------------------------------- /hulc/models/auxiliary_loss_networks/proj_vis_lang.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class ProjVisLang(nn.Module): 8 | def __init__(self, im_dim: int, lang_dim: int, output_dim: int, proj_lang: bool = True): 9 | super().__init__() 10 | self.mlp_im = nn.Sequential( 11 | nn.Linear(in_features=im_dim, out_features=128), 12 | nn.ReLU(), 13 | nn.Linear(in_features=128, out_features=output_dim), 14 | ) 15 | self.mlp_lang = None 16 | if proj_lang: 17 | self.mlp_lang = nn.Sequential( 18 | nn.Linear(in_features=lang_dim, out_features=128), 19 | nn.ReLU(), 20 | nn.Linear(in_features=128, out_features=output_dim), 21 | ) 22 | 23 | def forward(self, vis_emb: torch.Tensor, lang_emb: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 24 | vis_emb = self.mlp_im(vis_emb) 25 | if self.mlp_lang is not None: 26 | lang_emb = self.mlp_lang(lang_emb) 27 | return vis_emb, lang_emb 28 | -------------------------------------------------------------------------------- /hulc/models/auxiliary_loss_networks/state_decoder.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class StateDecoder(nn.Module): 8 | def __init__(self, visual_features: int, n_state_obs: int): 9 | super().__init__() 10 | self.mlp = nn.Sequential( 11 | nn.Linear(in_features=visual_features, out_features=40), 12 | nn.ReLU(), 13 | nn.Linear(in_features=40, out_features=40), 14 | nn.ReLU(), 15 | nn.Linear(in_features=40, out_features=n_state_obs), 16 | ) 17 | 18 | def forward(self, x: torch.Tensor) -> torch.Tensor: 19 | x = self.mlp(x) 20 | return x 21 | -------------------------------------------------------------------------------- /hulc/models/decoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/decoders/__init__.py -------------------------------------------------------------------------------- /hulc/models/decoders/action_decoder.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | 
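# Shared abstract interface for the action decoders in this package: the concrete
# DeterministicDecoder and LogisticDecoderRNN below implement act / loss / loss_and_act,
# and may keep an RNN hidden state that is reset via clear_hidden_state.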
class ActionDecoder(nn.Module): 8 | def act( 9 | self, 10 | latent_plan: torch.Tensor, 11 | perceptual_emb: torch.Tensor, 12 | latent_goal: torch.Tensor, 13 | robot_obs: Optional[torch.Tensor] = None, 14 | ) -> torch.Tensor: 15 | raise NotImplementedError 16 | 17 | def loss( 18 | self, 19 | latent_plan: torch.Tensor, 20 | perceptual_emb: torch.Tensor, 21 | latent_goal: torch.Tensor, 22 | actions: torch.Tensor, 23 | robot_obs: Optional[torch.Tensor] = None, 24 | ) -> torch.Tensor: 25 | raise NotImplementedError 26 | 27 | def loss_and_act( 28 | self, 29 | latent_plan: torch.Tensor, 30 | perceptual_emb: torch.Tensor, 31 | latent_goal: torch.Tensor, 32 | actions: torch.Tensor, 33 | robot_obs: Optional[torch.Tensor] = None, 34 | ) -> Tuple[torch.Tensor, torch.Tensor]: 35 | raise NotImplementedError 36 | 37 | def _sample(self, *args, **kwargs): 38 | raise NotImplementedError 39 | 40 | def forward( 41 | self, latent_plan: torch.Tensor, perceptual_emb: torch.Tensor, latent_goal: torch.Tensor 42 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 43 | raise NotImplementedError 44 | 45 | def clear_hidden_state(self) -> None: 46 | pass 47 | -------------------------------------------------------------------------------- /hulc/models/decoders/deterministic_decoder.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Optional, Tuple 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from hulc.models.decoders.action_decoder import ActionDecoder 8 | from hulc.models.decoders.utils.gripper_control import tcp_to_world_frame, world_to_tcp_frame 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class DeterministicDecoder(ActionDecoder): 14 | def __init__( 15 | self, 16 | perceptual_features: int, 17 | latent_goal_features: int, 18 | plan_features: int, 19 | hidden_size: int, 20 | out_features: int, 21 | policy_rnn_dropout_p: float, 22 | criterion: str, 23 | num_layers: int, 24 | rnn_model: str, 25 | perceptual_emb_slice: tuple, 26 | gripper_control: bool, 27 | ): 28 | super(DeterministicDecoder, self).__init__() 29 | self.plan_features = plan_features 30 | self.gripper_control = gripper_control 31 | self.out_features = out_features 32 | in_features = (perceptual_emb_slice[1] - perceptual_emb_slice[0]) + latent_goal_features + plan_features 33 | self.rnn = eval(rnn_model) 34 | self.rnn = self.rnn(in_features, hidden_size, num_layers, policy_rnn_dropout_p) 35 | self.actions = nn.Sequential(nn.Linear(hidden_size, out_features), nn.Tanh()) 36 | self.criterion = getattr(nn, criterion)() 37 | self.perceptual_emb_slice = perceptual_emb_slice 38 | self.hidden_state = None 39 | 40 | def clear_hidden_state(self) -> None: 41 | self.hidden_state = None 42 | 43 | def forward( # type: ignore 44 | self, 45 | latent_plan: torch.Tensor, 46 | perceptual_emb: torch.Tensor, 47 | latent_goal: torch.Tensor, 48 | h_0: Optional[torch.Tensor] = None, 49 | ) -> Tuple[torch.Tensor, torch.Tensor]: 50 | perceptual_emb = perceptual_emb[..., slice(*self.perceptual_emb_slice)] 51 | batch_size, seq_len = perceptual_emb.shape[0], perceptual_emb.shape[1] 52 | latent_plan = latent_plan.unsqueeze(1).expand(-1, seq_len, -1) if latent_plan.nelement() > 0 else latent_plan 53 | latent_goal = latent_goal.unsqueeze(1).expand(-1, seq_len, -1) 54 | x = torch.cat([latent_plan, perceptual_emb, latent_goal], dim=-1) # b, s, (plan + visuo-propio + goal) 55 | if not isinstance(self.rnn, nn.Sequential) and isinstance(self.rnn, nn.RNNBase): 56 | x, h_n = self.rnn(x, 
h_0) 57 | else: 58 | x = self.rnn(x) 59 | h_n = None 60 | actions = self.actions(x) 61 | return actions, h_n 62 | 63 | def loss_and_act( 64 | self, 65 | latent_plan: torch.Tensor, 66 | perceptual_emb: torch.Tensor, 67 | latent_goal: torch.Tensor, 68 | actions: torch.Tensor, 69 | robot_obs: Optional[torch.Tensor] = None, 70 | ) -> Tuple[torch.Tensor, torch.Tensor]: 71 | pred_actions, _ = self(latent_plan, perceptual_emb, latent_goal) 72 | # loss 73 | if self.gripper_control: 74 | actions_tcp = world_to_tcp_frame(actions, robot_obs) 75 | loss = self.criterion(pred_actions, actions_tcp) 76 | pred_actions_world = tcp_to_world_frame(pred_actions, robot_obs) 77 | return loss, pred_actions_world 78 | else: 79 | loss = self.criterion(pred_actions, actions) 80 | return loss, pred_actions 81 | 82 | def loss( 83 | self, 84 | latent_plan: torch.Tensor, 85 | perceptual_emb: torch.Tensor, 86 | latent_goal: torch.Tensor, 87 | actions: torch.Tensor, 88 | robot_obs: Optional[torch.Tensor] = None, 89 | ) -> torch.Tensor: 90 | pred_actions, _ = self(latent_plan, perceptual_emb, latent_goal) 91 | if self.gripper_control: 92 | actions_tcp = world_to_tcp_frame(actions, robot_obs) 93 | self.criterion(pred_actions, actions_tcp) 94 | return self.criterion(pred_actions, actions) 95 | 96 | def act( 97 | self, 98 | latent_plan: torch.Tensor, 99 | perceptual_emb: torch.Tensor, 100 | latent_goal: torch.Tensor, 101 | robot_obs: Optional[torch.Tensor] = None, 102 | ) -> torch.Tensor: 103 | pred_actions, self.hidden_state = self(latent_plan, perceptual_emb, latent_goal, self.hidden_state) 104 | if self.gripper_control: 105 | pred_actions_world = tcp_to_world_frame(pred_actions, robot_obs) 106 | return pred_actions_world 107 | else: 108 | return pred_actions 109 | -------------------------------------------------------------------------------- /hulc/models/decoders/logistic_decoder_rnn.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from typing import List, Optional, Tuple, Union 4 | 5 | import numpy as np 6 | from omegaconf import ListConfig, OmegaConf 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | import hulc 12 | from hulc.models.decoders.action_decoder import ActionDecoder 13 | from hulc.models.decoders.utils.gripper_control import tcp_to_world_frame, world_to_tcp_frame 14 | from hulc.models.decoders.utils.rnn import gru_decoder, lstm_decoder, mlp_decoder, rnn_decoder # needed for line 60 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def log_sum_exp(x): 20 | """numerically stable log_sum_exp implementation that prevents overflow""" 21 | axis = len(x.size()) - 1 22 | m, _ = torch.max(x, dim=axis) 23 | m2, _ = torch.max(x, dim=axis, keepdim=True) 24 | return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis)) 25 | 26 | 27 | class LogisticDecoderRNN(ActionDecoder): 28 | def __init__( 29 | self, 30 | perceptual_features: int, 31 | latent_goal_features: int, 32 | plan_features: int, 33 | n_mixtures: int, 34 | hidden_size: int, 35 | out_features: int, 36 | log_scale_min: float, 37 | act_max_bound: Union[List[float], ListConfig], 38 | act_min_bound: Union[List[float], ListConfig], 39 | dataset_dir: str, 40 | load_action_bounds: bool, 41 | num_classes: int, 42 | gripper_alpha: float, 43 | policy_rnn_dropout_p: float, 44 | num_layers: int, 45 | rnn_model: str, 46 | gripper_control: bool, 47 | discrete_gripper: bool, 48 | perceptual_emb_slice: Optional[tuple] = None, 49 | ): 50 | 
super(LogisticDecoderRNN, self).__init__() 51 | self.n_dist = n_mixtures 52 | self.gripper_control = gripper_control 53 | self.discrete_gripper = discrete_gripper 54 | self.log_scale_min = log_scale_min 55 | self.num_classes = num_classes 56 | self.plan_features = plan_features 57 | if perceptual_emb_slice is not None: 58 | in_features = (perceptual_emb_slice[1] - perceptual_emb_slice[0]) + latent_goal_features + plan_features 59 | else: 60 | in_features = perceptual_features + latent_goal_features + plan_features 61 | self.out_features = out_features - 1 if discrete_gripper else out_features # for discrete gripper act 62 | self.gripper_alpha = gripper_alpha 63 | self.rnn = eval(rnn_model) 64 | self.rnn = self.rnn(in_features, hidden_size, num_layers, policy_rnn_dropout_p) 65 | self.mean_fc = nn.Linear(hidden_size, self.out_features * self.n_dist) 66 | self.log_scale_fc = nn.Linear(hidden_size, self.out_features * self.n_dist) 67 | self.prob_fc = nn.Linear(hidden_size, self.out_features * self.n_dist) 68 | self.register_buffer("one_hot_embedding_eye", torch.eye(self.n_dist)) 69 | self.register_buffer("ones", torch.ones(1, 1, self.n_dist)) 70 | self._setup_action_bounds(dataset_dir, act_max_bound, act_min_bound, load_action_bounds) 71 | # hack for mypy 72 | self.one_hot_embedding_eye: torch.Tensor = self.one_hot_embedding_eye 73 | self.action_max_bound: torch.Tensor = self.action_max_bound 74 | self.action_min_bound: torch.Tensor = self.action_min_bound 75 | if self.discrete_gripper: 76 | self.gripper_bounds: torch.Tensor = self.gripper_bounds 77 | self.gripper_fc = nn.Linear(hidden_size, 2) 78 | self.criterion = nn.CrossEntropyLoss() 79 | self.perceptual_emb_slice = perceptual_emb_slice 80 | self.hidden_state = None 81 | 82 | def clear_hidden_state(self) -> None: 83 | self.hidden_state = None 84 | 85 | def loss_and_act( # type: ignore 86 | self, 87 | latent_plan: torch.Tensor, 88 | perceptual_emb: torch.Tensor, 89 | latent_goal: torch.Tensor, 90 | actions: torch.Tensor, 91 | robot_obs: torch.Tensor, 92 | ) -> Tuple[torch.Tensor, torch.Tensor]: 93 | logit_probs, log_scales, means, gripper_act, _ = self(latent_plan, perceptual_emb, latent_goal) 94 | pred_actions = self._sample(logit_probs, log_scales, means, gripper_act) 95 | if self.gripper_control: 96 | actions_tcp = world_to_tcp_frame(actions, robot_obs) 97 | loss = self._loss(logit_probs, log_scales, means, gripper_act, actions_tcp) 98 | pred_actions_world = tcp_to_world_frame(pred_actions, robot_obs) 99 | return loss, pred_actions_world 100 | else: 101 | loss = self._loss(logit_probs, log_scales, means, gripper_act, actions) 102 | return loss, pred_actions 103 | 104 | def act( # type: ignore 105 | self, 106 | latent_plan: torch.Tensor, 107 | perceptual_emb: torch.Tensor, 108 | latent_goal: torch.Tensor, 109 | robot_obs: torch.Tensor, 110 | ) -> torch.Tensor: 111 | logit_probs, log_scales, means, gripper_act, self.hidden_state = self( 112 | latent_plan, perceptual_emb, latent_goal, self.hidden_state 113 | ) 114 | pred_actions = self._sample(logit_probs, log_scales, means, gripper_act) 115 | if self.gripper_control: 116 | pred_actions_world = tcp_to_world_frame(pred_actions, robot_obs) 117 | return pred_actions_world 118 | else: 119 | return pred_actions 120 | 121 | def loss( # type: ignore 122 | self, 123 | latent_plan: torch.Tensor, 124 | perceptual_emb: torch.Tensor, 125 | latent_goal: torch.Tensor, 126 | actions: torch.Tensor, 127 | robot_obs: torch.Tensor, 128 | ) -> torch.Tensor: # type: ignore 129 | logit_probs, log_scales, means, 
gripper_act, _ = self(latent_plan, perceptual_emb, latent_goal) 130 | if self.gripper_control: 131 | actions_tcp = world_to_tcp_frame(actions, robot_obs) 132 | return self._loss(logit_probs, log_scales, means, gripper_act, actions_tcp) 133 | else: 134 | return self._loss(logit_probs, log_scales, means, gripper_act, actions) 135 | 136 | def _loss( 137 | self, 138 | logit_probs: torch.Tensor, 139 | log_scales: torch.Tensor, 140 | means: torch.Tensor, 141 | gripper_act: torch.Tensor, 142 | actions: torch.Tensor, 143 | ) -> torch.Tensor: 144 | if self.discrete_gripper: 145 | logistics_loss = self._logistic_loss(logit_probs, log_scales, means, actions[:, :, :-1]) 146 | gripper_gt = actions[:, :, -1].clone() 147 | # @fixme: hack because discrete actions are now -1 and 1, but we need 0, 1 for crossentropy loss 148 | m = gripper_gt == -1 149 | gripper_gt[m] = 0 150 | gripper_act_loss = self.criterion(gripper_act.view(-1, 2), gripper_gt.view(-1).long()) 151 | total_loss = logistics_loss + self.gripper_alpha * gripper_act_loss 152 | return total_loss 153 | else: 154 | logistics_loss = self._logistic_loss(logit_probs, log_scales, means, actions) 155 | return logistics_loss 156 | 157 | def _setup_action_bounds(self, dataset_dir, act_max_bound, act_min_bound, load_action_bounds): 158 | if load_action_bounds: 159 | try: 160 | statistics_path = Path(hulc.__file__).parent / dataset_dir / "training/statistics.yaml" 161 | statistics = OmegaConf.load(statistics_path) 162 | act_max_bound = statistics.act_max_bound 163 | act_min_bound = statistics.act_min_bound 164 | logger.info(f"Loaded action bounds from {statistics_path}") 165 | except FileNotFoundError: 166 | logger.info( 167 | f"Could not load statistics.yaml in {statistics_path}, taking action bounds defined in hydra conf" 168 | ) 169 | if self.discrete_gripper: 170 | self.register_buffer("gripper_bounds", torch.Tensor([act_min_bound[-1], act_max_bound[-1]])) 171 | act_max_bound = act_max_bound[:-1] # for discrete grasp 172 | act_min_bound = act_min_bound[:-1] 173 | action_max_bound = torch.Tensor(act_max_bound).float() 174 | action_min_bound = torch.Tensor(act_min_bound).float() 175 | assert action_max_bound.shape[0] == self.out_features 176 | assert action_min_bound.shape[0] == self.out_features 177 | action_max_bound = action_max_bound.unsqueeze(0).unsqueeze(0) # [1, 1, action_space] 178 | action_min_bound = action_min_bound.unsqueeze(0).unsqueeze(0) # [1, 1, action_space] 179 | action_max_bound = action_max_bound.unsqueeze(-1) * self.ones # broadcast to [1, 1, action_space, N_DIST] 180 | action_min_bound = action_min_bound.unsqueeze(-1) * self.ones # broadcast to [1, 1, action_space, N_DIST] 181 | self.register_buffer("action_max_bound", action_max_bound) 182 | self.register_buffer("action_min_bound", action_min_bound) 183 | 184 | def _logistic_loss( 185 | self, 186 | logit_probs: torch.Tensor, 187 | log_scales: torch.Tensor, 188 | means: torch.Tensor, 189 | actions: torch.Tensor, 190 | ) -> torch.Tensor: 191 | # Appropriate scale 192 | log_scales = torch.clamp(log_scales, min=self.log_scale_min) 193 | # Broadcast actions (B, A, N_DIST) 194 | actions = actions.unsqueeze(-1) * self.ones 195 | # Approximation of CDF derivative (PDF) 196 | centered_actions = actions - means 197 | inv_stdv = torch.exp(-log_scales) 198 | assert torch.is_tensor(self.action_max_bound) 199 | assert torch.is_tensor(self.action_min_bound) 200 | act_range = (self.action_max_bound - self.action_min_bound) / 2.0 201 | plus_in = inv_stdv * (centered_actions + act_range / 
(self.num_classes - 1)) 202 | cdf_plus = torch.sigmoid(plus_in) 203 | min_in = inv_stdv * (centered_actions - act_range / (self.num_classes - 1)) 204 | cdf_min = torch.sigmoid(min_in) 205 | 206 | # Corner Cases 207 | log_cdf_plus = plus_in - F.softplus(plus_in) # log probability for edge case of 0 (before scaling) 208 | log_one_minus_cdf_min = -F.softplus(min_in) # log probability for edge case of 255 (before scaling) 209 | # Log probability in the center of the bin 210 | mid_in = inv_stdv * centered_actions 211 | log_pdf_mid = mid_in - log_scales - 2.0 * F.softplus(mid_in) 212 | # Probability for all other cases 213 | cdf_delta = cdf_plus - cdf_min 214 | 215 | # Log probability 216 | log_probs = torch.where( 217 | actions < self.action_min_bound + 1e-3, 218 | log_cdf_plus, 219 | torch.where( 220 | actions > self.action_max_bound - 1e-3, 221 | log_one_minus_cdf_min, 222 | torch.where( 223 | cdf_delta > 1e-5, 224 | torch.log(torch.clamp(cdf_delta, min=1e-12)), 225 | log_pdf_mid - np.log((self.num_classes - 1) / 2), 226 | ), 227 | ), 228 | ) 229 | log_probs = log_probs + F.log_softmax(logit_probs, dim=-1) 230 | loss = -torch.sum(log_sum_exp(log_probs), dim=-1).mean() 231 | return loss 232 | 233 | # Sampling from logistic distribution 234 | def _sample( # type: ignore 235 | self, logit_probs: torch.Tensor, log_scales: torch.Tensor, means: torch.Tensor, gripper_act: torch.Tensor 236 | ) -> torch.Tensor: # type: ignore 237 | # Selecting Logistic distribution (Gumbel Sample) 238 | r1, r2 = 1e-5, 1.0 - 1e-5 239 | temp = (r1 - r2) * torch.rand(means.shape, device=means.device) + r2 240 | temp = logit_probs - torch.log(-torch.log(temp)) 241 | argmax = torch.argmax(temp, -1) 242 | # TODO: find out why mypy complains about type 243 | dist = self.one_hot_embedding_eye[argmax] 244 | 245 | # Select scales and means 246 | log_scales = (dist * log_scales).sum(dim=-1) 247 | means = (dist * means).sum(dim=-1) 248 | 249 | # Inversion sampling for logistic mixture sampling 250 | scales = torch.exp(log_scales) # Make positive 251 | u = (r1 - r2) * torch.rand(means.shape, device=means.device) + r2 252 | actions = means + scales * (torch.log(u) - torch.log(1.0 - u)) 253 | if self.discrete_gripper: 254 | gripper_cmd = self.gripper_bounds[gripper_act.argmax(dim=-1)] 255 | full_action = torch.cat([actions, gripper_cmd.unsqueeze(-1)], 2) 256 | return full_action 257 | else: 258 | return actions 259 | 260 | def forward( # type: ignore 261 | self, 262 | latent_plan: torch.Tensor, 263 | perceptual_emb: torch.Tensor, 264 | latent_goal: torch.Tensor, 265 | h_0: Optional[torch.Tensor] = None, 266 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 267 | if self.perceptual_emb_slice is not None: 268 | perceptual_emb = perceptual_emb[..., slice(*self.perceptual_emb_slice)] 269 | batch_size, seq_len = perceptual_emb.shape[0], perceptual_emb.shape[1] 270 | latent_plan = latent_plan.unsqueeze(1).expand(-1, seq_len, -1) 271 | latent_goal = latent_goal.unsqueeze(1).expand(-1, seq_len, -1) 272 | x = torch.cat([latent_plan, perceptual_emb, latent_goal], dim=-1) # b, s, (plan + visuo-propio + goal) 273 | if not isinstance(self.rnn, nn.Sequential) and isinstance(self.rnn, nn.RNNBase): 274 | x, h_n = self.rnn(x, h_0) 275 | else: 276 | x = self.rnn(x) 277 | h_n = None 278 | probs = self.prob_fc(x) 279 | means = self.mean_fc(x) 280 | log_scales = self.log_scale_fc(x) 281 | log_scales = torch.clamp(log_scales, min=self.log_scale_min) 282 | gripper_act = self.gripper_fc(x) if self.discrete_gripper else 
None 283 | # Appropriate dimensions 284 | logit_probs = probs.view(batch_size, seq_len, self.out_features, self.n_dist) 285 | means = means.view(batch_size, seq_len, self.out_features, self.n_dist) 286 | log_scales = log_scales.view(batch_size, seq_len, self.out_features, self.n_dist) 287 | return logit_probs, log_scales, means, gripper_act, h_n 288 | -------------------------------------------------------------------------------- /hulc/models/decoders/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/decoders/utils/__init__.py -------------------------------------------------------------------------------- /hulc/models/decoders/utils/gripper_control.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | from hulc.models.decoders.utils.pytorch3d_transforms import ( 5 | euler_angles_to_matrix, 6 | matrix_to_euler_angles, 7 | matrix_to_quaternion, 8 | quaternion_to_matrix, 9 | ) 10 | import torch 11 | from torch.cuda.amp import autocast 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def world_to_tcp_frame(action, robot_obs): 17 | with autocast(dtype=torch.float32): 18 | b, s, _ = action.shape 19 | world_T_tcp = euler_angles_to_matrix(robot_obs[..., 3:6], convention="XYZ").float().view(-1, 3, 3) 20 | tcp_T_world = torch.inverse(world_T_tcp) 21 | pos_w_rel = action[..., :3].view(-1, 3, 1) 22 | pos_tcp_rel = tcp_T_world @ pos_w_rel 23 | # downscaling is necessary here to get pseudo infinitesimal rotation 24 | orn_w_rel = action[..., 3:6] * 0.01 25 | world_T_tcp_new = ( 26 | euler_angles_to_matrix(robot_obs[..., 3:6] + orn_w_rel, convention="XYZ").float().view(-1, 3, 3) 27 | ) 28 | tcp_new_T_tcp_old = torch.inverse(world_T_tcp_new) @ world_T_tcp 29 | orn_tcp_rel = matrix_to_euler_angles(tcp_new_T_tcp_old, convention="XYZ").float() 30 | orn_tcp_rel = torch.where(orn_tcp_rel < -np.pi, orn_tcp_rel + 2 * np.pi, orn_tcp_rel) 31 | orn_tcp_rel = torch.where(orn_tcp_rel > np.pi, orn_tcp_rel - 2 * np.pi, orn_tcp_rel) 32 | # upscaling again 33 | orn_tcp_rel *= 100 34 | action_tcp = torch.cat([pos_tcp_rel.view(b, s, -1), orn_tcp_rel.view(b, s, -1), action[..., -1:]], dim=-1) 35 | assert not torch.any(action_tcp.isnan()) 36 | return action_tcp 37 | 38 | 39 | def tcp_to_world_frame(action, robot_obs): 40 | with autocast(dtype=torch.float32): 41 | b, s, _ = action.shape 42 | world_T_tcp = euler_angles_to_matrix(robot_obs[..., 3:6], convention="XYZ").float().view(-1, 3, 3) 43 | pos_tcp_rel = action[..., :3].view(-1, 3, 1) 44 | pos_w_rel = world_T_tcp @ pos_tcp_rel 45 | # downscaling is necessary here to get pseudo infinitesimal rotation 46 | orn_tcp_rel = action[..., 3:6] * 0.01 47 | tcp_new_T_tcp_old = euler_angles_to_matrix(orn_tcp_rel, convention="XYZ").float().view(-1, 3, 3) 48 | world_T_tcp_new = world_T_tcp @ torch.inverse(tcp_new_T_tcp_old) 49 | 50 | orn_w_new = matrix_to_euler_angles(world_T_tcp_new, convention="XYZ").float() 51 | if torch.any(orn_w_new.isnan()): 52 | logger.warning("NaN value in euler angles.") 53 | orn_w_new = matrix_to_euler_angles( 54 | quaternion_to_matrix(matrix_to_quaternion(world_T_tcp_new)), convention="XYZ" 55 | ).float() 56 | orn_w_rel = orn_w_new - robot_obs[..., 3:6].view(-1, 3) 57 | orn_w_rel = torch.where(orn_w_rel < -np.pi, orn_w_rel + 2 * np.pi, orn_w_rel) 58 | orn_w_rel = torch.where(orn_w_rel > np.pi, orn_w_rel - 2 * 
np.pi, orn_w_rel) 59 | # upscaling again 60 | orn_w_rel *= 100 61 | action_w = torch.cat([pos_w_rel.view(b, s, -1), orn_w_rel.view(b, s, -1), action[..., -1:]], dim=-1) 62 | assert not torch.any(action_w.isnan()) 63 | return action_w 64 | -------------------------------------------------------------------------------- /hulc/models/decoders/utils/rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def rnn_decoder(in_features: int, hidden_size: int, num_layers: int, policy_rnn_dropout_p: float) -> torch.nn.Module: 6 | return nn.RNN( 7 | input_size=in_features, 8 | hidden_size=hidden_size, 9 | num_layers=num_layers, 10 | nonlinearity="relu", 11 | bidirectional=False, 12 | batch_first=True, 13 | dropout=policy_rnn_dropout_p, 14 | ) 15 | 16 | 17 | def lstm_decoder(in_features: int, hidden_size: int, num_layers: int, policy_rnn_dropout_p: float) -> torch.nn.Module: 18 | return nn.LSTM( 19 | input_size=in_features, 20 | hidden_size=hidden_size, 21 | num_layers=num_layers, 22 | bidirectional=False, 23 | batch_first=True, 24 | dropout=policy_rnn_dropout_p, 25 | ) 26 | 27 | 28 | def gru_decoder(in_features: int, hidden_size: int, num_layers: int, policy_rnn_dropout_p: float) -> torch.nn.Module: 29 | return nn.GRU( 30 | input_size=in_features, 31 | hidden_size=hidden_size, 32 | num_layers=num_layers, 33 | bidirectional=False, 34 | batch_first=True, 35 | dropout=policy_rnn_dropout_p, 36 | ) 37 | 38 | 39 | def mlp_decoder(in_features: int, hidden_size: int, num_layers: int, policy_rnn_dropout_p: float) -> torch.nn.Module: 40 | return nn.Sequential( 41 | nn.Linear(in_features=in_features, out_features=hidden_size), 42 | nn.ReLU(), 43 | nn.Linear(in_features=hidden_size, out_features=hidden_size), 44 | nn.ReLU(), 45 | nn.Linear(in_features=hidden_size, out_features=hidden_size), 46 | ) 47 | -------------------------------------------------------------------------------- /hulc/models/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/encoders/__init__.py -------------------------------------------------------------------------------- /hulc/models/encoders/clip_lang_encoder.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from hulc.models.perceptual_encoders.clip import build_model, load_clip, tokenize 7 | 8 | 9 | class LangClip(nn.Module): 10 | def __init__(self, freeze_backbone: bool = True, model_name: str = "RN50"): 11 | super(LangClip, self).__init__() 12 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 13 | # Load CLIP model 14 | print(f"loading language CLIP model with backbone: {model_name}") 15 | self._load_clip(model_name) 16 | if freeze_backbone: 17 | for param in self.clip_rn50.parameters(): 18 | param.requires_grad = False 19 | 20 | def _load_clip(self, model_name: str) -> None: 21 | model, _ = load_clip(model_name, device=self.device) 22 | self.clip_rn50 = build_model(model.state_dict()).to(self.device) 23 | 24 | def forward(self, x: List) -> torch.Tensor: 25 | with torch.no_grad(): 26 | tokens = tokenize(x).to(self.device) 27 | emb = self.clip_rn50.encode_text(tokens) 28 | return torch.unsqueeze(emb, 1) 29 | -------------------------------------------------------------------------------- 
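Editor's note (added): the decoder factories in rnn.py above (rnn_decoder, lstm_decoder, gru_decoder, mlp_decoder) are what the `rnn_model` config string resolves to via `eval(rnn_model)` inside DeterministicDecoder and LogisticDecoderRNN, and the recurrent variants all expect batch-first input of shape (batch, seq_len, plan + perceptual + goal features). A minimal sketch of that wiring, with illustrative sizes that are not taken from the repo's configs:

import torch
from hulc.models.decoders.utils.rnn import lstm_decoder  # also available: rnn_decoder, gru_decoder, mlp_decoder

rnn_model = "lstm_decoder"  # the string a Hydra config would pass to the decoder
rnn = eval(rnn_model)(in_features=96, hidden_size=256, num_layers=2, policy_rnn_dropout_p=0.0)
x = torch.randn(4, 32, 96)  # (batch, seq_len, in_features)
out, (h_n, c_n) = rnn(x)    # out: (4, 32, 256) because batch_first=True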
/hulc/models/encoders/goal_encoders.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class VisualGoalEncoder(nn.Module): 9 | def __init__( 10 | self, 11 | hidden_size: int, 12 | latent_goal_features: int, 13 | in_features: int, 14 | l2_normalize_goal_embeddings: bool, 15 | activation_function: str, 16 | ): 17 | super().__init__() 18 | self.l2_normalize_output = l2_normalize_goal_embeddings 19 | self.act_fn = getattr(nn, activation_function)() 20 | self.mlp = nn.Sequential( 21 | nn.Linear(in_features=in_features, out_features=hidden_size), 22 | # nn.BatchNorm1d(hidden_size), 23 | self.act_fn, 24 | nn.Linear(in_features=hidden_size, out_features=hidden_size), 25 | # nn.BatchNorm1d(hidden_size), 26 | self.act_fn, 27 | nn.Linear(in_features=hidden_size, out_features=latent_goal_features), 28 | ) 29 | self.ln = nn.LayerNorm(latent_goal_features) 30 | 31 | def forward(self, x: torch.Tensor) -> torch.Tensor: 32 | x = self.mlp(x) 33 | if self.l2_normalize_output: 34 | x = F.normalize(x, p=2, dim=1) 35 | x = self.ln(x) 36 | return x 37 | 38 | 39 | class LanguageGoalEncoder(nn.Module): 40 | def __init__( 41 | self, 42 | in_features: int, 43 | hidden_size: int, 44 | latent_goal_features: int, 45 | l2_normalize_goal_embeddings: bool, 46 | word_dropout_p: float, 47 | activation_function: str, 48 | ): 49 | super().__init__() 50 | self.l2_normalize_output = l2_normalize_goal_embeddings 51 | self.act_fn = getattr(nn, activation_function)() 52 | self.mlp = nn.Sequential( 53 | nn.Dropout(word_dropout_p), 54 | nn.Linear(in_features=in_features, out_features=hidden_size), 55 | # nn.BatchNorm1d(hidden_size), 56 | self.act_fn, 57 | nn.Linear(in_features=hidden_size, out_features=hidden_size), 58 | # nn.BatchNorm1d(hidden_size), 59 | self.act_fn, 60 | nn.Linear(in_features=hidden_size, out_features=latent_goal_features), 61 | ) 62 | self.ln = nn.LayerNorm(latent_goal_features) 63 | 64 | def forward(self, x: torch.Tensor) -> torch.Tensor: 65 | x = self.mlp(x) 66 | if self.l2_normalize_output: 67 | x = F.normalize(x, p=2, dim=1) 68 | x = self.ln(x) 69 | return x 70 | -------------------------------------------------------------------------------- /hulc/models/encoders/lang_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class LanguageEncoder(nn.Module): 6 | def __init__( 7 | self, 8 | language_features: int, 9 | hidden_size: int, 10 | out_features: int, 11 | word_dropout_p: float, 12 | activation_function: str, 13 | ): 14 | super().__init__() 15 | self.act_fn = getattr(nn, activation_function)() 16 | self.mlp = nn.Sequential( 17 | nn.Dropout(word_dropout_p), 18 | nn.Linear(in_features=language_features, out_features=hidden_size), 19 | # nn.BatchNorm1d(hidden_size), 20 | self.act_fn, 21 | nn.Linear(in_features=hidden_size, out_features=hidden_size), 22 | # nn.BatchNorm1d(hidden_size), 23 | self.act_fn, 24 | nn.Linear(in_features=hidden_size, out_features=out_features), 25 | ) 26 | 27 | def forward(self, x: torch.Tensor) -> torch.Tensor: 28 | x = self.mlp(x) 29 | return x 30 | -------------------------------------------------------------------------------- /hulc/models/encoders/language_network.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from sentence_transformers import 
SentenceTransformer 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | class SBert(nn.Module): 9 | def __init__(self, nlp_model: str): 10 | # choose model from https://www.sbert.net/docs/pretrained_models.html 11 | super().__init__() 12 | assert isinstance(nlp_model, str) 13 | self.model = SentenceTransformer(nlp_model) 14 | 15 | def forward(self, x: List) -> torch.Tensor: 16 | emb = self.model.encode(x, convert_to_tensor=True) 17 | return torch.unsqueeze(emb, 1) 18 | -------------------------------------------------------------------------------- /hulc/models/gcbc.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict 3 | 4 | import torch 5 | 6 | from hulc.models.hulc import Hulc 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class GCBC(Hulc): 12 | """ 13 | Goal-conditioned behavior cloning. 14 | """ 15 | 16 | @staticmethod 17 | def setup_input_sizes( 18 | perceptual_encoder, 19 | plan_proposal, 20 | plan_recognition, 21 | visual_goal, 22 | action_decoder, 23 | distribution, 24 | ): 25 | """ 26 | Configure the input feature sizes of the respective parts of the network. 27 | 28 | Args: 29 | perceptual_encoder: DictConfig for perceptual encoder. 30 | plan_proposal: DictConfig for plan proposal network. 31 | plan_recognition: DictConfig for plan recognition network. 32 | visual_goal: DictConfig for visual goal encoder. 33 | action_decoder: DictConfig for action decoder network. 34 | distribution: DictConfig for plan distribution (continuous or discrete). 35 | """ 36 | plan_proposal.perceptual_features = perceptual_encoder.latent_size 37 | plan_recognition.in_features = perceptual_encoder.latent_size 38 | visual_goal.in_features = perceptual_encoder.latent_size 39 | action_decoder.perceptual_features = perceptual_encoder.latent_size 40 | 41 | if distribution.dist == "discrete": 42 | plan_proposal.plan_features = distribution.class_size * distribution.category_size 43 | plan_recognition.plan_features = distribution.class_size * distribution.category_size 44 | action_decoder.plan_features = 0 45 | elif distribution.dist == "continuous": 46 | plan_proposal.plan_features = distribution.plan_features 47 | plan_recognition.plan_features = distribution.plan_features 48 | action_decoder.plan_features = 0 49 | 50 | def training_step(self, batch: Dict[str, Dict], batch_idx: int) -> torch.Tensor: # type: ignore 51 | """ 52 | Compute and return the training loss. 53 | 54 | Args: 55 | batch (dict): 56 | - 'vis' (dict): 57 | - 'rgb_obs' (dict): 58 | - 'rgb_static' (Tensor): RGB camera image of static camera 59 | - ... 60 | - 'depth_obs' (dict): 61 | - 'depth_static' (Tensor): Depth camera image of depth camera 62 | - ... 63 | - 'robot_obs' (Tensor): Proprioceptive state observation. 64 | - 'actions' (Tensor): Ground truth actions. 65 | - 'state_info' (dict): 66 | - 'robot_obs' (Tensor): Unnormalized robot states. 67 | - 'scene_obs' (Tensor): Unnormalized scene states. 68 | - 'idx' (LongTensor): Episode indices. 69 | - 'lang' (dict): 70 | Like 'vis' but with additional keys: 71 | - 'language' (Tensor): Embedded Language labels. 72 | - 'use_for_aux_lang_loss' (BoolTensor): Mask of which sequences in the batch to consider for 73 | auxiliary loss. 74 | batch_idx (int): Integer displaying index of this batch. 
75 | 76 | 77 | Returns: 78 | loss tensor 79 | """ 80 | action_loss, proprio_loss, lang_pred_loss, lang_contrastive_loss, lang_clip_loss, total_loss = ( 81 | torch.tensor(0.0).to(self.device), 82 | torch.tensor(0.0).to(self.device), 83 | torch.tensor(0.0).to(self.device), 84 | torch.tensor(0.0).to(self.device), 85 | torch.tensor(0.0).to(self.device), 86 | torch.tensor(0.0).to(self.device), 87 | ) 88 | 89 | batch_size: Dict[str, int] = {} 90 | total_bs = 0 91 | for self.modality_scope, dataset_batch in batch.items(): 92 | perceptual_emb = self.perceptual_encoder( 93 | dataset_batch["rgb_obs"], dataset_batch["depth_obs"], dataset_batch["robot_obs"] 94 | ) 95 | if self.state_recons: 96 | proprio_loss += self.perceptual_encoder.state_reconstruction_loss() 97 | if "lang" in self.modality_scope: 98 | latent_goal = self.language_goal(dataset_batch["lang"]) 99 | else: 100 | latent_goal = self.visual_goal(perceptual_emb[:, -1]) 101 | 102 | robot_obs = dataset_batch["state_info"]["robot_obs"] 103 | actions = dataset_batch["actions"] 104 | empty_plan = torch.empty((dataset_batch["actions"].shape[0]), 0).to(self.device) 105 | act_loss = self.action_decoder.loss(empty_plan, perceptual_emb, latent_goal, actions, robot_obs) 106 | _, seq_feat = self.plan_recognition(perceptual_emb) 107 | 108 | if "lang" in self.modality_scope: 109 | if not torch.any(dataset_batch["use_for_aux_lang_loss"]): 110 | batch_size["aux_lang"] = 1 111 | else: 112 | batch_size["aux_lang"] = torch.sum(dataset_batch["use_for_aux_lang_loss"]).detach() # type:ignore 113 | if self.use_bc_z_auxiliary_loss: 114 | lang_pred_loss += self.bc_z_auxiliary_loss( 115 | seq_feat, dataset_batch["lang"], dataset_batch["use_for_aux_lang_loss"] 116 | ) 117 | if self.use_clip_auxiliary_loss: 118 | lang_clip_loss += self.clip_auxiliary_loss( 119 | seq_feat, latent_goal, dataset_batch["use_for_aux_lang_loss"] 120 | ) 121 | if self.use_mia_auxiliary_loss: 122 | lang_contrastive_loss += self.mia_auxiliary_loss( 123 | seq_feat, latent_goal, dataset_batch["use_for_aux_lang_loss"] 124 | ) 125 | action_loss += act_loss 126 | total_loss += act_loss 127 | batch_size[self.modality_scope] = dataset_batch["actions"].shape[0] 128 | total_bs += dataset_batch["actions"].shape[0] 129 | 130 | self.log( 131 | f"train/action_loss_{self.modality_scope}", 132 | act_loss, 133 | on_step=False, 134 | on_epoch=True, 135 | batch_size=batch_size[self.modality_scope], 136 | ) 137 | total_loss = total_loss / len(batch) # divide accumulated gradients by number of datasets 138 | action_loss = action_loss / len(batch) 139 | if self.state_recons: 140 | proprio_loss = proprio_loss / len(batch) 141 | total_loss = total_loss + self.st_recon_beta * proprio_loss 142 | self.log( 143 | "train/pred_proprio", 144 | self.st_recon_beta * proprio_loss, 145 | on_step=False, 146 | on_epoch=True, 147 | batch_size=total_bs, 148 | ) 149 | if self.use_bc_z_auxiliary_loss: 150 | total_loss = total_loss + self.bc_z_auxiliary_loss_beta * lang_pred_loss 151 | self.log( 152 | "train/pred_lang", 153 | self.bc_z_auxiliary_loss_beta * lang_pred_loss, 154 | on_step=False, 155 | on_epoch=True, 156 | batch_size=batch_size["aux_lang"], 157 | sync_dist=True, 158 | ) 159 | if self.use_mia_auxiliary_loss: 160 | total_loss = total_loss + self.mia_auxiliary_loss_beta * lang_contrastive_loss 161 | self.log( 162 | "train/lang_contrastive", 163 | self.mia_auxiliary_loss_beta * lang_contrastive_loss, 164 | on_step=False, 165 | on_epoch=True, 166 | batch_size=batch_size["aux_lang"], 167 | sync_dist=True, 168 | ) 169 | if 
self.use_clip_auxiliary_loss: 170 | total_loss = total_loss + self.clip_auxiliary_loss_beta * lang_clip_loss 171 | self.log( 172 | "train/lang_clip_loss", 173 | self.clip_auxiliary_loss_beta * lang_clip_loss, 174 | on_step=False, 175 | on_epoch=True, 176 | batch_size=batch_size["aux_lang"], 177 | sync_dist=True, 178 | ) 179 | self.log("train/action_loss", action_loss, on_step=False, on_epoch=True, batch_size=total_bs) 180 | self.log("train/total_loss", total_loss, on_step=False, on_epoch=True, batch_size=total_bs) 181 | return total_loss 182 | 183 | def validation_step(self, batch: Dict[str, Dict], batch_idx: int) -> Dict[str, torch.Tensor]: # type: ignore 184 | """ 185 | Compute and log the validation losses and additional metrics. 186 | 187 | Args: 188 | batch (dict): 189 | - 'vis' (dict): 190 | - 'rgb_obs' (dict): 191 | - 'rgb_static' (Tensor): RGB camera image of static camera 192 | - ... 193 | - 'depth_obs' (dict): 194 | - 'depth_static' (Tensor): Depth camera image of depth camera 195 | - ... 196 | - 'robot_obs' (Tensor): Proprioceptive state observation. 197 | - 'actions' (Tensor): Ground truth actions. 198 | - 'state_info' (dict): 199 | - 'robot_obs' (Tensor): Unnormalized robot states. 200 | - 'scene_obs' (Tensor): Unnormalized scene states. 201 | - 'idx' (LongTensor): Episode indices. 202 | - 'lang' (dict): 203 | Like 'vis' but with additional keys: 204 | - 'language' (Tensor): Embedded Language labels. 205 | - 'use_for_aux_lang_loss' (BoolTensor): Mask of which sequences in the batch to consider for 206 | auxiliary loss. 207 | batch_idx (int): Integer displaying index of this batch. 208 | 209 | Returns: 210 | Dictionary containing the sampled plans of plan recognition and plan proposal networks, as well as the 211 | episode indices. 212 | """ 213 | output = {} 214 | val_total_act_loss = torch.tensor(0.0).to(self.device) 215 | for self.modality_scope, dataset_batch in batch.items(): 216 | perceptual_emb = self.perceptual_encoder( 217 | dataset_batch["rgb_obs"], dataset_batch["depth_obs"], dataset_batch["robot_obs"] 218 | ) 219 | if self.state_recons: 220 | state_recon_loss = self.perceptual_encoder.state_reconstruction_loss() 221 | self.log(f"val/proprio_loss_{self.modality_scope}", state_recon_loss, sync_dist=True) 222 | if "lang" in self.modality_scope: 223 | latent_goal = self.language_goal(dataset_batch["lang"]) 224 | else: 225 | latent_goal = self.visual_goal(perceptual_emb[:, -1]) 226 | 227 | robot_obs = dataset_batch["state_info"]["robot_obs"] 228 | actions = dataset_batch["actions"] 229 | empty_plan = torch.empty((dataset_batch["actions"].shape[0]), 0).to(self.device) 230 | action_loss, sample_act = self.action_decoder.loss_and_act( # type: ignore 231 | empty_plan, perceptual_emb, latent_goal, actions, robot_obs 232 | ) 233 | mae = torch.nn.functional.l1_loss( 234 | sample_act[..., :-1], actions[..., :-1], reduction="none" 235 | ) # (batch, seq, 6) 236 | mae = torch.mean(mae, 1) # (batch, 6) 237 | # gripper action 238 | gripper_discrete = sample_act[..., -1] 239 | gt_gripper_act = actions[..., -1] 240 | m = gripper_discrete > 0 241 | gripper_discrete[m] = 1 242 | gripper_discrete[~m] = -1 243 | gripper_sr = torch.mean((gt_gripper_act == gripper_discrete).float()) 244 | _, seq_feat = self.plan_recognition(perceptual_emb) 245 | 246 | if "lang" in self.modality_scope: 247 | if self.use_bc_z_auxiliary_loss: 248 | val_pred_lang_loss = self.bc_z_auxiliary_loss( 249 | seq_feat, dataset_batch["lang"], dataset_batch["use_for_aux_lang_loss"] 250 | ) 251 | 
self.log("val/lang_pred_loss", val_pred_lang_loss, sync_dist=True) 252 | if self.use_clip_auxiliary_loss: 253 | val_pred_clip_loss = self.clip_auxiliary_loss( 254 | seq_feat, latent_goal, dataset_batch["use_for_aux_lang_loss"] 255 | ) 256 | self.log("val/val_pred_clip_loss", val_pred_clip_loss, sync_dist=True) 257 | self.clip_groundtruth(seq_feat, dataset_batch["idx"], dataset_batch["use_for_aux_lang_loss"]) 258 | if self.use_mia_auxiliary_loss: 259 | val_pred_contrastive_loss = self.mia_auxiliary_loss( 260 | seq_feat, latent_goal, dataset_batch["use_for_aux_lang_loss"] 261 | ) 262 | self.log("val/lang_contrastive_loss", val_pred_contrastive_loss, sync_dist=True) 263 | val_total_act_loss += action_loss 264 | mae_mean = mae.mean() 265 | pos_mae = mae[..., :3].mean() 266 | orn_mae = mae[..., 3:6].mean() 267 | self.log(f"val_total_mae/{self.modality_scope}_total_mae", mae_mean, sync_dist=True) 268 | self.log(f"val_pos_mae/{self.modality_scope}_pos_mae", pos_mae, sync_dist=True) 269 | self.log(f"val_orn_mae/{self.modality_scope}_orn_mae", orn_mae, sync_dist=True) 270 | self.log(f"val_act/{self.modality_scope}_act_loss", action_loss, sync_dist=True) 271 | self.log(f"val_grip/{self.modality_scope}_grip_sr", gripper_sr, sync_dist=True) 272 | self.log( 273 | "val_act/action_loss", 274 | val_total_act_loss / len(self.trainer.datamodule.modalities), # type:ignore 275 | sync_dist=True, 276 | ) 277 | output[f"idx_{self.modality_scope}"] = dataset_batch["idx"] 278 | 279 | return output 280 | 281 | def reset(self): 282 | """ 283 | Call this at the beginning of a new rollout when doing inference. 284 | """ 285 | self.latent_goal = None 286 | 287 | def step(self, obs, goal): 288 | """ 289 | Do one step of inference with the model. 290 | 291 | Args: 292 | obs (dict): Observation from environment. 293 | goal (dict): Goal as visual observation or embedded language instruction. 294 | 295 | Returns: 296 | Predicted action. 
297 | """ 298 | with torch.no_grad(): 299 | if self.latent_goal is None: 300 | if isinstance(goal, str): 301 | embedded_lang = torch.from_numpy(self.lang_embeddings[goal]).to(self.device).squeeze(0).float() 302 | self.latent_goal = self.language_goal(embedded_lang) 303 | else: 304 | imgs = { 305 | k: torch.cat([v, goal["rgb_obs"][k]], dim=1) for k, v in obs["rgb_obs"].items() 306 | } # (1, 2, C, H, W) 307 | depth_imgs = {k: torch.cat([v, goal["depth_obs"][k]], dim=1) for k, v in obs["depth_obs"].items()} 308 | state = torch.cat([obs["robot_obs"], goal["robot_obs"]], dim=1) 309 | perceptual_emb = self.perceptual_encoder(imgs, depth_imgs, state) 310 | self.latent_goal = self.visual_goal(perceptual_emb[:, -1]) 311 | 312 | perceptual_emb = self.perceptual_encoder(obs["rgb_obs"], obs["depth_obs"], obs["robot_obs"]) 313 | empty_plan = torch.empty(1, 0).to(self.device) 314 | action = self.action_decoder.act( 315 | empty_plan, perceptual_emb, self.latent_goal, obs["robot_obs_raw"] 316 | ) # type: ignore 317 | return action 318 | -------------------------------------------------------------------------------- /hulc/models/perceptual_encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/perceptual_encoders/__init__.py -------------------------------------------------------------------------------- /hulc/models/perceptual_encoders/concat_encoders.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | import hydra 4 | from omegaconf import DictConfig 5 | import torch 6 | import torch.nn as nn 7 | from torch.nn.functional import mse_loss 8 | 9 | 10 | class ConcatEncoders(nn.Module): 11 | def __init__( 12 | self, 13 | rgb_static: DictConfig, 14 | proprio: DictConfig, 15 | device: torch.device, 16 | depth_static: Optional[DictConfig] = None, 17 | rgb_gripper: Optional[DictConfig] = None, 18 | depth_gripper: Optional[DictConfig] = None, 19 | tactile: Optional[DictConfig] = None, 20 | state_decoder: Optional[DictConfig] = None, 21 | ): 22 | super().__init__() 23 | self._latent_size = rgb_static.visual_features 24 | if rgb_gripper: 25 | self._latent_size += rgb_gripper.visual_features 26 | if depth_static: 27 | self._latent_size += depth_static.visual_features 28 | if depth_gripper: 29 | self._latent_size += depth_gripper.visual_features 30 | if tactile: 31 | self._latent_size += tactile.visual_features 32 | visual_features = self._latent_size 33 | # super ugly, fix this clip ddp thing in a better way 34 | if "clip" in rgb_static["_target_"]: 35 | self.rgb_static_encoder = hydra.utils.instantiate(rgb_static, device=device) 36 | else: 37 | self.rgb_static_encoder = hydra.utils.instantiate(rgb_static) 38 | self.depth_static_encoder = hydra.utils.instantiate(depth_static) if depth_static else None 39 | self.rgb_gripper_encoder = hydra.utils.instantiate(rgb_gripper) if rgb_gripper else None 40 | self.depth_gripper_encoder = hydra.utils.instantiate(depth_gripper) if depth_gripper else None 41 | self.tactile_encoder = hydra.utils.instantiate(tactile) 42 | self.proprio_encoder = hydra.utils.instantiate(proprio) 43 | if self.proprio_encoder: 44 | self._latent_size += self.proprio_encoder.out_features 45 | 46 | self.state_decoder = None 47 | if state_decoder: 48 | state_decoder.visual_features = visual_features 49 | state_decoder.n_state_obs = self.proprio_encoder.out_features 50 | 
self.state_decoder = hydra.utils.instantiate(state_decoder) 51 | 52 | self.current_visual_embedding = None 53 | self.current_state_obs = None 54 | 55 | @property 56 | def latent_size(self): 57 | return self._latent_size 58 | 59 | def forward( 60 | self, imgs: Dict[str, torch.Tensor], depth_imgs: Dict[str, torch.Tensor], state_obs: torch.Tensor 61 | ) -> torch.Tensor: 62 | rgb_static = imgs["rgb_static"] 63 | rgb_gripper = imgs["rgb_gripper"] if "rgb_gripper" in imgs else None 64 | rgb_tactile = imgs["rgb_tactile"] if "rgb_tactile" in imgs else None 65 | depth_static = depth_imgs["depth_static"] if "depth_static" in depth_imgs else None 66 | depth_gripper = depth_imgs["depth_gripper"] if "depth_gripper" in depth_imgs else None 67 | 68 | b, s, c, h, w = rgb_static.shape 69 | rgb_static = rgb_static.reshape(-1, c, h, w) # (batch_size * sequence_length, 3, 200, 200) 70 | # ------------ Vision Network ------------ # 71 | encoded_imgs = self.rgb_static_encoder(rgb_static) # (batch*seq_len, 64) 72 | encoded_imgs = encoded_imgs.reshape(b, s, -1) # (batch, seq, 64) 73 | 74 | if depth_static is not None: 75 | depth_static = torch.unsqueeze(depth_static, 2) 76 | depth_static = depth_static.reshape(-1, 1, h, w) # (batch_size * sequence_length, 3, 200, 200) 77 | encoded_depth_static = self.depth_static_encoder(depth_static) # (batch*seq_len, 64) 78 | encoded_depth_static = encoded_depth_static.reshape(b, s, -1) # (batch, seq, 64) 79 | encoded_imgs = torch.cat([encoded_imgs, encoded_depth_static], dim=-1) 80 | 81 | if rgb_gripper is not None: 82 | b, s, c, h, w = rgb_gripper.shape 83 | rgb_gripper = rgb_gripper.reshape(-1, c, h, w) # (batch_size * sequence_length, 3, 84, 84) 84 | encoded_imgs_gripper = self.rgb_gripper_encoder(rgb_gripper) # (batch*seq_len, 64) 85 | encoded_imgs_gripper = encoded_imgs_gripper.reshape(b, s, -1) # (batch, seq, 64) 86 | encoded_imgs = torch.cat([encoded_imgs, encoded_imgs_gripper], dim=-1) 87 | if depth_gripper is not None: 88 | depth_gripper = torch.unsqueeze(depth_gripper, 2) 89 | depth_gripper = depth_gripper.reshape(-1, 1, h, w) # (batch_size * sequence_length, 1, 84, 84) 90 | encoded_depth_gripper = self.depth_gripper_encoder(depth_gripper) 91 | encoded_depth_gripper = encoded_depth_gripper.reshape(b, s, -1) # (batch, seq, 64) 92 | encoded_imgs = torch.cat([encoded_imgs, encoded_depth_gripper], dim=-1) 93 | 94 | if rgb_tactile is not None: 95 | b, s, c, h, w = rgb_tactile.shape 96 | rgb_tactile = rgb_tactile.reshape(-1, c, h, w) # (batch_size * sequence_length, 3, 84, 84) 97 | encoded_tactile = self.tactile_encoder(rgb_tactile) 98 | encoded_tactile = encoded_tactile.reshape(b, s, -1) 99 | encoded_imgs = torch.cat([encoded_imgs, encoded_tactile], dim=-1) 100 | 101 | self.current_visual_embedding = encoded_imgs 102 | self.current_state_obs = state_obs # type: ignore 103 | if self.proprio_encoder: 104 | state_obs_out = self.proprio_encoder(state_obs) 105 | perceptual_emb = torch.cat([encoded_imgs, state_obs_out], dim=-1) 106 | else: 107 | perceptual_emb = encoded_imgs 108 | 109 | return perceptual_emb 110 | 111 | def state_reconstruction_loss(self): 112 | assert self.state_decoder is not None 113 | proprio_pred = self.state_decoder(self.current_visual_embedding) 114 | return mse_loss(self.current_state_obs, proprio_pred) 115 | -------------------------------------------------------------------------------- /hulc/models/perceptual_encoders/proprio_encoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from 
torch import nn 3 | 4 | 5 | class IdentityEncoder(nn.Module): 6 | def __init__(self, proprioception_dims): 7 | super(IdentityEncoder, self).__init__() 8 | # remove a dimension if we convert robot orientation quaternion to euler angles 9 | self.n_state_obs = int(np.sum(np.diff([list(x) for x in [list(y) for y in proprioception_dims.keep_indices]]))) 10 | self.identity = nn.Identity() 11 | 12 | @property 13 | def out_features(self): 14 | return self.n_state_obs 15 | 16 | def forward(self, x): 17 | return self.identity(x) 18 | -------------------------------------------------------------------------------- /hulc/models/perceptual_encoders/tactile_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torchvision.models as models 5 | 6 | 7 | class TactileEncoder(nn.Module): 8 | def __init__(self, visual_features: int, freeze_tactile_backbone: bool = True): 9 | super(TactileEncoder, self).__init__() 10 | # Load pre-trained resnet-18 11 | net = models.resnet18(pretrained=True) 12 | # Remove the last fc layer, and rebuild 13 | modules = list(net.children())[:-1] 14 | self.net = nn.Sequential(*modules) 15 | if freeze_tactile_backbone: 16 | for param in self.net.parameters(): 17 | param.requires_grad = False 18 | self.fc1 = nn.Linear(1024, 512) 19 | self.fc2 = nn.Linear(512, visual_features) 20 | 21 | def forward(self, x: torch.Tensor) -> torch.Tensor: 22 | x_l = self.net(x[:, :3, :, :]).squeeze() 23 | x_r = self.net(x[:, 3:, :, :]).squeeze() 24 | x = torch.cat((x_l, x_r), dim=-1) 25 | # Add fc layer for final prediction 26 | output = F.relu(self.fc1(x)) # batch, 512 27 | output = self.fc2(output) # batch, 64 28 | return output 29 | -------------------------------------------------------------------------------- /hulc/models/perceptual_encoders/vision_clip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from hulc.models.perceptual_encoders.clip import load_clip 6 | 7 | 8 | class VisionClip(nn.Module): 9 | def __init__( 10 | self, device: torch.device, visual_features: int, freeze_backbone: bool = True, model_name: str = "RN50" 11 | ): 12 | super(VisionClip, self).__init__() 13 | # Load CLIP model 14 | print(f"loading vision CLIP model with backbone: {model_name}") 15 | self.clip_model, _ = load_clip(model_name, device=device) 16 | if freeze_backbone: 17 | for param in self.clip_model.parameters(): 18 | param.requires_grad = False 19 | if "RN50" in model_name: 20 | self.fc1 = nn.Linear(1024, 512) 21 | self.fc2 = nn.Linear(512, visual_features) 22 | elif "ViT-B/32" in model_name: 23 | self.fc1 = nn.Linear(512, 256) 24 | self.fc2 = nn.Linear(256, visual_features) 25 | 26 | def forward(self, x: torch.Tensor) -> torch.Tensor: 27 | x = self.clip_model.encode_image(x) # type:ignore 28 | output = F.relu(self.fc1(x)) # batch, 512 29 | output = self.fc2(output) # batch, 64 30 | return output 31 | -------------------------------------------------------------------------------- /hulc/models/perceptual_encoders/vision_network.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from typing import Optional, Tuple 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn.parameter import Parameter 9 | 10 | 11 | class VisionNetwork(nn.Module): 12 | # 
reference: https://arxiv.org/pdf/2005.07648.pdf 13 | def __init__( 14 | self, 15 | input_width: int, 16 | input_height: int, 17 | activation_function: str, 18 | dropout_vis_fc: float, 19 | l2_normalize_output: bool, 20 | visual_features: int, 21 | num_c: int, 22 | use_sinusoid: bool, 23 | spatial_softmax_temp: float, 24 | ): 25 | super(VisionNetwork, self).__init__() 26 | self.l2_normalize_output = l2_normalize_output 27 | self.act_fn = getattr(nn, activation_function)() 28 | # w,h,kernel_size,padding,stride 29 | w, h = self.calc_out_size(input_width, input_height, 8, 0, 4) 30 | w, h = self.calc_out_size(w, h, 4, 0, 2) 31 | w, h = self.calc_out_size(w, h, 3, 0, 1) 32 | self.use_sinusoid = use_sinusoid 33 | temp = None if not isinstance(spatial_softmax_temp, float) else spatial_softmax_temp 34 | self.spatial_softmax = SpatialSoftmax(num_rows=w, num_cols=h, temperature=temp) # shape: [N, 128] 35 | # model 36 | self.conv_model = nn.Sequential( 37 | # input shape: [N, 3, 200, 200] 38 | nn.Conv2d(in_channels=num_c, out_channels=32, kernel_size=8, stride=4), # shape: [N, 32, 49, 49] 39 | # nn.BatchNorm2d(32), 40 | self.act_fn, 41 | nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2), # shape: [N, 64, 23, 23] 42 | # nn.BatchNorm2d(64), 43 | self.act_fn, 44 | nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1), # shape: [N, 64, 21, 21] 45 | # nn.BatchNorm2d(64), 46 | self.act_fn, 47 | ) 48 | k = 3 if self.use_sinusoid else 1 49 | self.fc1 = nn.Sequential( 50 | nn.Linear(in_features=128 * k, out_features=512), self.act_fn, nn.Dropout(dropout_vis_fc) 51 | ) # shape: [N, 512] 52 | self.fc2 = nn.Linear(in_features=512, out_features=visual_features) # shape: [N, 64] 53 | self.ln = nn.LayerNorm(visual_features) 54 | 55 | def forward(self, x: torch.Tensor) -> torch.Tensor: 56 | x = self.conv_model(x) 57 | x = self.spatial_softmax(x) 58 | if self.use_sinusoid: 59 | x = torch.cat([x, torch.sin(x), torch.cos(x)], 1) 60 | x = self.fc1(x) 61 | x = self.fc2(x) 62 | if self.l2_normalize_output: 63 | x = F.normalize(x, p=2, dim=1) 64 | x = self.ln(x) 65 | return x # shape: [N, 64] 66 | 67 | @staticmethod 68 | def calc_out_size(w: int, h: int, kernel_size: int, padding: int, stride: int) -> Tuple[int, int]: 69 | width = (w - kernel_size + 2 * padding) // stride + 1 70 | height = (h - kernel_size + 2 * padding) // stride + 1 71 | return width, height 72 | 73 | 74 | class SpatialSoftmax(nn.Module): 75 | def __init__(self, num_rows: int, num_cols: int, temperature: Optional[float] = None): 76 | """ 77 | Computes the spatial softmax of a convolutional feature map. 78 | Read more here: 79 | "Learning visual feature spaces for robotic manipulation with 80 | deep spatial autoencoders." Finn et al., http://arxiv.org/abs/1509.06113. 81 | :param num_rows: size related to original image width 82 | :param num_cols: size related to original image height 83 | :param temperature: Softmax temperature (optional). If None, a learnable temperature is created. 
84 | """ 85 | super(SpatialSoftmax, self).__init__() 86 | self.num_rows = num_rows 87 | self.num_cols = num_cols 88 | grid_x, grid_y = torch.meshgrid( 89 | torch.linspace(-1.0, 1.0, num_cols), torch.linspace(-1.0, 1.0, num_rows), indexing="ij" 90 | ) 91 | x_map = grid_x.reshape(-1) 92 | y_map = grid_y.reshape(-1) 93 | self.register_buffer("x_map", x_map) 94 | self.register_buffer("y_map", y_map) 95 | if temperature: 96 | self.register_buffer("temperature", torch.ones(1) * temperature) 97 | else: 98 | self.temperature = Parameter(torch.ones(1)) 99 | 100 | def forward(self, x: torch.Tensor) -> torch.Tensor: 101 | n, c, h, w = x.shape 102 | x = x.contiguous().view(-1, h * w) # batch, C, W*H 103 | softmax_attention = F.softmax(x / self.temperature, dim=1) # batch, C, W*H 104 | expected_x = torch.sum(self.x_map * softmax_attention, dim=1, keepdim=True) 105 | expected_y = torch.sum(self.y_map * softmax_attention, dim=1, keepdim=True) 106 | expected_xy = torch.cat((expected_x, expected_y), 1) 107 | self.coords = expected_xy.view(-1, c * 2) 108 | return self.coords # batch, C*2 109 | -------------------------------------------------------------------------------- /hulc/models/perceptual_encoders/vision_network_gripper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from typing import Tuple 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | 10 | def nature_cnn(act_fn, num_c): 11 | return nn.Sequential( 12 | nn.Conv2d(num_c, 32, 8, stride=4), 13 | act_fn, 14 | nn.Conv2d(32, 64, 4, stride=2), 15 | act_fn, 16 | nn.Conv2d(64, 64, 3, stride=1), 17 | act_fn, 18 | nn.Flatten(start_dim=1), 19 | nn.Linear(64 * 7 * 7, 128), 20 | act_fn, 21 | ) 22 | 23 | 24 | class VisionNetwork(nn.Module): 25 | def __init__( 26 | self, 27 | input_width: int, 28 | input_height: int, 29 | conv_encoder: str, 30 | activation_function: str, 31 | dropout_vis_fc: float, 32 | l2_normalize_output: bool, 33 | visual_features: int, 34 | num_c: int, 35 | ): 36 | super(VisionNetwork, self).__init__() 37 | self.l2_normalize_output = l2_normalize_output 38 | self.act_fn = getattr(nn, activation_function)() 39 | # model 40 | # this calls the method with the name conv_encoder 41 | self.conv_model = eval(conv_encoder) 42 | self.conv_model = self.conv_model(self.act_fn, num_c) 43 | self.fc1 = nn.Sequential( 44 | nn.Linear(in_features=128, out_features=512), self.act_fn, nn.Dropout(dropout_vis_fc) 45 | ) # shape: [N, 512] 46 | self.fc2 = nn.Linear(in_features=512, out_features=visual_features) # shape: [N, 64] 47 | self.ln = nn.LayerNorm(visual_features) 48 | 49 | def forward(self, x: torch.Tensor) -> torch.Tensor: 50 | x = self.conv_model(x) 51 | x = self.fc1(x) 52 | x = self.fc2(x) 53 | if self.l2_normalize_output: 54 | x = F.normalize(x, p=2, dim=1) 55 | x = self.ln(x) 56 | return x # shape: [N, 64] 57 | 58 | @staticmethod 59 | def calc_out_size(w: int, h: int, kernel_size: int, padding: int, stride: int) -> Tuple[int, int]: 60 | width = (w - kernel_size + 2 * padding) // stride + 1 61 | height = (h - kernel_size + 2 * padding) // stride + 1 62 | return width, height 63 | -------------------------------------------------------------------------------- /hulc/models/plan_encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/models/plan_encoders/__init__.py 
-------------------------------------------------------------------------------- /hulc/models/plan_encoders/plan_proposal_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import torch 3 | import torch.nn as nn 4 | 5 | from hulc.utils.distributions import Distribution, State 6 | 7 | 8 | class PlanProposalNetwork(nn.Module): 9 | def __init__( 10 | self, 11 | perceptual_features: int, 12 | latent_goal_features: int, 13 | plan_features: int, 14 | activation_function: str, 15 | hidden_size: int, 16 | dist: Distribution, 17 | ): 18 | super(PlanProposalNetwork, self).__init__() 19 | self.perceptual_features = perceptual_features 20 | self.latent_goal_features = latent_goal_features 21 | self.plan_features = plan_features 22 | self.hidden_size = hidden_size 23 | self.in_features = self.perceptual_features + self.latent_goal_features 24 | self.act_fn = getattr(nn, activation_function)() 25 | self.dist = dist 26 | self.fc_model = nn.Sequential( 27 | nn.Linear(in_features=self.in_features, out_features=hidden_size), # shape: [N, 136] 28 | # nn.BatchNorm1d(hidden_size), 29 | self.act_fn, 30 | nn.Linear(in_features=hidden_size, out_features=hidden_size), 31 | # nn.BatchNorm1d(hidden_size), 32 | self.act_fn, 33 | nn.Linear(in_features=hidden_size, out_features=hidden_size), 34 | # nn.BatchNorm1d(hidden_size), 35 | self.act_fn, 36 | nn.Linear(in_features=hidden_size, out_features=hidden_size), 37 | # nn.BatchNorm1d(hidden_size), 38 | self.act_fn, 39 | ) 40 | self.fc_state = self.dist.build_state(self.hidden_size, self.plan_features) 41 | 42 | def forward(self, initial_percep_emb: torch.Tensor, latent_goal: torch.Tensor) -> State: 43 | x = torch.cat([initial_percep_emb, latent_goal], dim=-1) 44 | x = self.fc_model(x) 45 | my_state = self.fc_state(x) 46 | state = self.dist.forward_dist(my_state) 47 | return state 48 | -------------------------------------------------------------------------------- /hulc/models/plan_encoders/plan_recognition_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import math 4 | from typing import Tuple 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from hulc.utils.distributions import Distribution, State 10 | 11 | 12 | class PlanRecognitionBiRNNNetwork(nn.Module): 13 | def __init__( 14 | self, 15 | in_features: int, 16 | plan_features: int, 17 | action_space: int, 18 | birnn_dropout_p: float, 19 | dist: Distribution, 20 | rnn_type: str, 21 | ): 22 | super(PlanRecognitionBiRNNNetwork, self).__init__() 23 | self.plan_features = plan_features 24 | self.action_space = action_space 25 | self.in_features = in_features 26 | self.dist = dist 27 | self.birnn_model = eval(rnn_type)( 28 | input_size=self.in_features, 29 | hidden_size=2048, 30 | num_layers=2, 31 | bidirectional=True, 32 | batch_first=True, 33 | dropout=birnn_dropout_p, 34 | ) # shape: [N, seq_len, feat] 35 | self.fc_state = self.dist.build_state(4096, self.plan_features) 36 | 37 | def forward(self, perceptual_emb: torch.Tensor) -> Tuple[State, torch.Tensor]: 38 | x, hn = self.birnn_model(perceptual_emb) 39 | x = x[:, -1] # we just need only last unit output 40 | my_state = self.fc_state(x) 41 | state = self.dist.forward_dist(my_state) 42 | return state, x 43 | 44 | 45 | class PlanRecognitionTransformersNetwork(nn.Module): 46 | def __init__( 47 | self, 48 | num_heads: int, 49 | num_layers: int, 50 | encoder_hidden_size: int, 51 | fc_hidden_size: int, 52 | plan_features: int, 
53 | in_features: int, 54 | action_space: int, 55 | encoder_normalize: bool, 56 | positional_normalize: bool, 57 | position_embedding: bool, 58 | max_position_embeddings: int, 59 | dropout_p: bool, 60 | dist: Distribution, 61 | ): 62 | 63 | super().__init__() 64 | self.in_features = in_features 65 | self.plan_features = plan_features 66 | self.action_space = action_space 67 | self.padding = False 68 | self.dist = dist 69 | self.hidden_size = fc_hidden_size 70 | self.position_embedding = position_embedding 71 | self.encoder_normalize = encoder_normalize 72 | self.positional_normalize = positional_normalize 73 | mod = self.in_features % num_heads 74 | if mod != 0: 75 | print(f"Padding for Num of Heads : {num_heads}") 76 | self.padding = True 77 | self.pad = num_heads - mod 78 | self.in_features += self.pad 79 | if position_embedding: 80 | self.position_embeddings = nn.Embedding(max_position_embeddings, self.in_features) 81 | else: 82 | self.positional_encoder = PositionalEncoding(self.in_features) # TODO: with max window_size 83 | encoder_layer = nn.TransformerEncoderLayer( 84 | self.in_features, num_heads, dim_feedforward=encoder_hidden_size, dropout=dropout_p 85 | ) 86 | encoder_norm = nn.LayerNorm(self.in_features) if encoder_normalize else None 87 | if self.positional_normalize: 88 | self.layernorm = nn.LayerNorm(self.in_features) 89 | self.dropout = nn.Dropout(p=dropout_p) 90 | self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers, norm=encoder_norm) 91 | self.fc = nn.Linear(in_features=self.in_features, out_features=fc_hidden_size) 92 | self.fc_state = self.dist.build_state(fc_hidden_size, self.plan_features) 93 | 94 | def forward(self, perceptual_emb: torch.Tensor) -> Tuple[State, torch.Tensor]: 95 | batch_size, seq_len = perceptual_emb.shape[0], perceptual_emb.shape[1] 96 | perceptual_emb = ( 97 | torch.cat([perceptual_emb, torch.zeros((batch_size, seq_len, self.pad)).to(perceptual_emb.device)], dim=-1) 98 | if self.padding 99 | else perceptual_emb 100 | ) 101 | if self.position_embedding: 102 | position_ids = torch.arange(seq_len, dtype=torch.long, device=perceptual_emb.device).unsqueeze(0) 103 | position_embeddings = self.position_embeddings(position_ids) 104 | x = perceptual_emb + position_embeddings 105 | x = x.permute(1, 0, 2) 106 | else: 107 | # padd the perceptual embeddig 108 | x = self.positional_encoder(perceptual_emb.permute(1, 0, 2)) # [s, b, emb] 109 | if self.positional_normalize: 110 | x = self.layernorm(x) 111 | x = self.dropout(x) 112 | x = self.transformer_encoder(x) 113 | x = self.fc(x.permute(1, 0, 2)) 114 | x = torch.mean(x, dim=1) # gather all the sequence info 115 | my_state = self.fc_state(x) 116 | state = self.dist.forward_dist(my_state) 117 | return state, x 118 | 119 | 120 | class PositionalEncoding(nn.Module): 121 | """Implementation from: https://pytorch.org/tutorials/beginner/transformer_tutorial.html""" 122 | 123 | def __init__(self, d_model, max_len=5000): 124 | super(PositionalEncoding, self).__init__() 125 | 126 | pe = torch.zeros(max_len, d_model) 127 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 128 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 129 | pe[:, 0::2] = torch.sin(position * div_term) 130 | pe[:, 1::2] = torch.cos(position * div_term) if d_model % 2 == 0 else torch.cos(position * div_term[:-1]) 131 | pe = pe.unsqueeze(0).transpose(0, 1) 132 | self.register_buffer("pe", pe) 133 | 134 | def forward(self, x): 135 | x = x + self.pe[: 
x.size(0), :] 136 | return x 137 | -------------------------------------------------------------------------------- /hulc/training.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | import logging 3 | from pathlib import Path 4 | import sys 5 | from typing import List, Union 6 | 7 | from lightning_lite.accelerators.cuda import num_cuda_devices 8 | from pytorch_lightning.strategies import DDPStrategy 9 | 10 | # This is for using the locally installed repo clone when using slurm 11 | sys.path.insert(0, Path(__file__).absolute().parents[1].as_posix()) 12 | from calvin_agent.utils.utils import get_git_commit_hash, get_last_checkpoint, print_system_env_info 13 | import hydra 14 | from omegaconf import DictConfig, ListConfig, OmegaConf 15 | from pytorch_lightning import Callback, LightningModule, seed_everything, Trainer 16 | from pytorch_lightning.callbacks import LearningRateMonitor 17 | from pytorch_lightning.loggers import Logger 18 | from pytorch_lightning.utilities import rank_zero_only 19 | 20 | import hulc 21 | import hulc.models.hulc as models_m 22 | from hulc.utils.utils import initialize_pretrained_weights 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | @hydra.main(config_path="../conf", config_name="config") 28 | def train(cfg: DictConfig) -> None: 29 | """ 30 | This is called to start a training. 31 | 32 | Args: 33 | cfg: hydra config 34 | """ 35 | # sets seeds for numpy, torch, python.random and PYTHONHASHSEED. 36 | seed_everything(cfg.seed, workers=True) # type: ignore 37 | datamodule = hydra.utils.instantiate(cfg.datamodule, training_repo_root=Path(hulc.__file__).parents[1]) 38 | chk = get_last_checkpoint(Path.cwd()) 39 | 40 | # Load Model 41 | if chk is not None: 42 | model = getattr(models_m, cfg.model["_target_"].split(".")[-1]).load_from_checkpoint(chk.as_posix()) 43 | else: 44 | model = hydra.utils.instantiate(cfg.model) 45 | if "pretrain_chk" in cfg: 46 | initialize_pretrained_weights(model, cfg) 47 | 48 | log_rank_0(f"Training with the following config:\n{OmegaConf.to_yaml(cfg)}") 49 | log_rank_0("Repo commit hash: {}".format(get_git_commit_hash(Path(hydra.utils.to_absolute_path(__file__))))) 50 | log_rank_0(print_system_env_info()) 51 | 52 | train_logger = setup_logger(cfg, model) 53 | callbacks = setup_callbacks(cfg.callbacks) 54 | lr_logger = LearningRateMonitor(logging_interval="step") 55 | callbacks.append(lr_logger) 56 | 57 | trainer_args = { 58 | **cfg.trainer, 59 | "logger": train_logger, 60 | "callbacks": callbacks, 61 | "benchmark": False, 62 | } 63 | 64 | # Configure multi-GPU training 65 | if is_multi_gpu_training(trainer_args["devices"]): 66 | # increase default timeout for loading data into shared memory 67 | trainer_args["strategy"] = DDPStrategy(find_unused_parameters=False, timeout=timedelta(seconds=3600)) 68 | if not cfg.slurm: 69 | modify_argv_hydra() 70 | 71 | trainer = Trainer(**trainer_args) 72 | 73 | # Start training 74 | trainer.fit(model, datamodule=datamodule, ckpt_path=chk) # type: ignore 75 | 76 | 77 | def setup_callbacks(callbacks_cfg: DictConfig) -> List[Callback]: 78 | """ 79 | Instantiate all training callbacks. 80 | 81 | Args: 82 | callbacks_cfg: DictConfig with all callback params 83 | 84 | Returns: 85 | List of instantiated callbacks. 
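Example (illustrative): a callbacks config with entries such as `checkpoint`, `rollout` and `kl_schedule` yields one instantiated Callback per entry, in the order the entries appear in the config.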
86 | """ 87 | callbacks = [hydra.utils.instantiate(cb) for cb in callbacks_cfg.values()] 88 | return callbacks 89 | 90 | 91 | def setup_logger(cfg: DictConfig, model: LightningModule) -> Logger: 92 | """ 93 | Set up the logger (tensorboard or wandb) from hydra config. 94 | 95 | Args: 96 | cfg: Hydra config 97 | model: LightningModule 98 | 99 | Returns: 100 | logger 101 | """ 102 | pathlib_cwd = Path.cwd() 103 | if "group" in cfg.logger: 104 | cfg.logger.group = pathlib_cwd.parent.name 105 | cfg.logger.name = pathlib_cwd.parent.name + "/" + pathlib_cwd.name 106 | cfg.logger.id = cfg.logger.name.replace("/", "_") 107 | train_logger = hydra.utils.instantiate(cfg.logger) 108 | # train_logger.watch(model) 109 | else: 110 | train_logger = hydra.utils.instantiate(cfg.logger) 111 | return train_logger 112 | 113 | 114 | def modify_argv_hydra() -> None: 115 | """ 116 | To make hydra work with pytorch-lightning and ddp, we modify sys.argv for the child processes spawned with ddp. 117 | This is only used when NOT using slurm. 118 | """ 119 | cwd = Path.cwd().as_posix() 120 | cwd = f'"{cwd}"' 121 | sys.argv = sys.argv[:1] 122 | sys.argv.extend( 123 | [ 124 | f"hydra.run.dir={cwd}", 125 | "hydra/hydra_logging=disabled", 126 | "hydra/job_logging=disabled", 127 | ] 128 | ) 129 | overrides = OmegaConf.load(".hydra/overrides.yaml") 130 | for o in overrides: 131 | if "hydra/sweeper" in o: # type: ignore 132 | continue 133 | 134 | if "hydra/launcher" in o: # type: ignore 135 | continue 136 | 137 | sys.argv.append(o) # type: ignore 138 | 139 | 140 | def is_multi_gpu_training(devices: Union[int, str, ListConfig]) -> bool: 141 | """ 142 | Check if training on multiple GPUs. 143 | See https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html#devices 144 | 145 | Args: 146 | devices: int, str or ListConfig specifying devices 147 | 148 | Returns: 149 | True if multi-gpu training (ddp), False otherwise. 
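Example (illustrative): devices=4 -> True; devices=1 -> False; devices="auto" -> True only when more than one CUDA device is visible; a two-element ListConfig such as [0, 1] (as parsed from the Hydra config) -> True.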
150 | """ 151 | num_gpu_available = num_cuda_devices() 152 | if isinstance(devices, int): 153 | return devices > 1 or (devices == -1 and num_gpu_available > 1) 154 | elif isinstance(devices, str) and devices == "auto": 155 | return num_gpu_available > 1 156 | elif isinstance(devices, str): 157 | return len(devices) > 1 158 | elif isinstance(devices, ListConfig): 159 | return len(devices) > 1 160 | else: 161 | raise ValueError 162 | 163 | 164 | @rank_zero_only 165 | def log_rank_0(*args, **kwargs): 166 | # when using ddp, only log with rank 0 process 167 | logger.info(*args, **kwargs) 168 | 169 | 170 | if __name__ == "__main__": 171 | train() 172 | -------------------------------------------------------------------------------- /hulc/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/utils/__init__.py -------------------------------------------------------------------------------- /hulc/utils/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/hulc/utils/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /hulc/utils/clip_tokenizer.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | import gzip 3 | import html 4 | import os 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode(): 17 | """ 18 | Returns list of utf-8 byte and a corresponding list of unicode strings. 19 | The reversible bpe codes work on unicode strings. 20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 22 | This is a signficant percentage of your normal, say, 32K bpe vocab. 23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 24 | And avoids mapping to whitespace/control characters the bpe code barfs on. 25 | """ 26 | bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) 27 | cs = bs[:] 28 | n = 0 29 | for b in range(2 ** 8): 30 | if b not in bs: 31 | bs.append(b) 32 | cs.append(2 ** 8 + n) 33 | n += 1 34 | cs = [chr(n) for n in cs] 35 | return dict(zip(bs, cs)) 36 | 37 | 38 | def get_pairs(word): 39 | """Return set of symbol pairs in a word. 40 | Word is represented as tuple of symbols (symbols being variable-length strings). 
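Example (illustrative): get_pairs(("l", "o", "w", "e", "r")) returns {("l", "o"), ("o", "w"), ("w", "e"), ("e", "r")}.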
41 | """ 42 | pairs = set() 43 | prev_char = word[0] 44 | for char in word[1:]: 45 | pairs.add((prev_char, char)) 46 | prev_char = char 47 | return pairs 48 | 49 | 50 | def basic_clean(text): 51 | text = ftfy.fix_text(text) 52 | text = html.unescape(html.unescape(text)) 53 | return text.strip() 54 | 55 | 56 | def whitespace_clean(text): 57 | text = re.sub(r"\s+", " ", text) 58 | text = text.strip() 59 | return text 60 | 61 | 62 | class SimpleTokenizer(object): 63 | def __init__(self, bpe_path: str = default_bpe()): 64 | self.byte_encoder = bytes_to_unicode() 65 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 66 | merges = gzip.open(bpe_path).read().decode("utf-8").split("\n") 67 | merges = merges[1 : 49152 - 256 - 2 + 1] 68 | merges = [tuple(merge.split()) for merge in merges] # type:ignore 69 | vocab = list(bytes_to_unicode().values()) 70 | vocab = vocab + [v + "" for v in vocab] 71 | for merge in merges: 72 | vocab.append("".join(merge)) 73 | vocab.extend(["<|startoftext|>", "<|endoftext|>"]) 74 | self.encoder = dict(zip(vocab, range(len(vocab)))) 75 | self.decoder = {v: k for k, v in self.encoder.items()} 76 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 77 | self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"} 78 | self.pat = re.compile( 79 | r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", 80 | re.IGNORECASE, 81 | ) 82 | 83 | def bpe(self, token): 84 | if token in self.cache: 85 | return self.cache[token] 86 | word = tuple(token[:-1]) + (token[-1] + "",) 87 | pairs = get_pairs(word) 88 | 89 | if not pairs: 90 | return token + "" 91 | 92 | while True: 93 | bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) 94 | if bigram not in self.bpe_ranks: 95 | break 96 | first, second = bigram 97 | new_word = [] 98 | i = 0 99 | while i < len(word): 100 | try: 101 | j = word.index(first, i) 102 | new_word.extend(word[i:j]) 103 | i = j 104 | except Exception as ex: 105 | new_word.extend(word[i:]) 106 | print(ex.message, ex.args) 107 | break 108 | 109 | if word[i] == first and i < len(word) - 1 and word[i + 1] == second: 110 | new_word.append(first + second) 111 | i += 2 112 | else: 113 | new_word.append(word[i]) 114 | i += 1 115 | new_word = tuple(new_word) 116 | word = new_word 117 | if len(word) == 1: 118 | break 119 | else: 120 | pairs = get_pairs(word) 121 | word = " ".join(word) 122 | self.cache[token] = word 123 | return word 124 | 125 | def encode(self, text): 126 | bpe_tokens = [] 127 | text = whitespace_clean(basic_clean(text)).lower() 128 | for token in re.findall(self.pat, text): 129 | token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) 130 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")) 131 | return bpe_tokens 132 | 133 | def decode(self, tokens): 134 | text = "".join([self.decoder[token] for token in tokens]) 135 | text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors="replace").replace("", " ") 136 | return text 137 | -------------------------------------------------------------------------------- /hulc/utils/distributions.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from typing import Union 3 | 4 | import torch 5 | from torch.distributions import Independent, Normal, OneHotCategoricalStraightThrough # type: ignore 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | 
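# A minimal usage sketch for the Distribution helper defined below (the sizes here
# are hypothetical, not the repository defaults):
#
#   dist = Distribution(dist="discrete", category_size=32, class_size=8)
#   fc_state = dist.build_state(hidden_size=2048, plan_features=256)  # 256 = 32 * 8
#   state = dist.forward_dist(fc_state(torch.randn(4, 2048)))         # DiscState with logits of shape [4, 256]
#   plan = dist.sample_latent_plan(dist.get_dist(state))              # flattened straight-through one-hot sample, shape [4, 256]
#
# With dist="continuous", build_state outputs 2 * plan_features values, which
# forward_dist chunks into the mean and softplus-std of a diagonal Gaussian.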
DiscState = namedtuple("DiscState", ["logit"]) 10 | ContState = namedtuple("ContState", ["mean", "std"]) 11 | 12 | State = Union[DiscState, ContState] 13 | 14 | 15 | class Distribution: 16 | def __init__(self, **kwargs): 17 | self.dist = kwargs.get("dist") 18 | assert self.dist == "discrete" or self.dist == "continuous" 19 | if self.dist == "discrete": 20 | self.category_size = kwargs.get("category_size") 21 | self.class_size = kwargs.get("class_size") 22 | 23 | def get_dist(self, state): 24 | if self.dist == "discrete": 25 | shape = state.logit.shape 26 | logits = torch.reshape(state.logit, shape=(*shape[:-1], self.category_size, self.class_size)) 27 | return Independent(OneHotCategoricalStraightThrough(logits=logits), 1) 28 | elif self.dist == "continuous": 29 | return Independent(Normal(state.mean, state.std), 1) 30 | 31 | def detach_state(self, state): 32 | if self.dist == "discrete": 33 | return DiscState(state.logit.detach()) 34 | elif self.dist == "continuous": 35 | return ContState(state.mean.detach(), state.std.detach()) 36 | 37 | def sample_latent_plan(self, distribution): 38 | sampled_plan = distribution.sample() 39 | if self.dist == "discrete": 40 | sampled_plan = torch.flatten(sampled_plan, start_dim=-2, end_dim=-1) 41 | return sampled_plan 42 | 43 | def build_state(self, hidden_size, plan_features): 44 | fc_state = [] 45 | if self.dist == "discrete": 46 | fc_state += [nn.Linear(hidden_size, plan_features)] 47 | elif self.dist == "continuous": 48 | fc_state += [nn.Linear(hidden_size, 2 * plan_features)] 49 | return nn.Sequential(*fc_state) 50 | 51 | def forward_dist(self, x): 52 | if self.dist == "discrete": 53 | prior_logit = x 54 | state = DiscState(prior_logit) # type: State 55 | elif self.dist == "continuous": 56 | mean, var = torch.chunk(x, 2, dim=-1) 57 | min_std = 0.0001 58 | std = F.softplus(var) + min_std 59 | state = ContState(mean, std) 60 | return state 61 | -------------------------------------------------------------------------------- /hulc/utils/kl_callbacks.py: -------------------------------------------------------------------------------- 1 | from pytorch_lightning import Callback, LightningModule, Trainer 2 | import torch 3 | 4 | 5 | def sigmoid(scale: float, shift: float, x: int) -> float: 6 | return torch.sigmoid(torch.Tensor([(x - shift) / (scale / 12)])).item() 7 | 8 | 9 | class KLSchedule(Callback): 10 | """ 11 | Base class for KL Annealing 12 | """ 13 | 14 | def __init__(self, start_epoch: int, end_epoch: int, max_kl_beta: float): 15 | self.start_epoch = start_epoch 16 | self.end_epoch = end_epoch 17 | self.max_kl_beta = max_kl_beta 18 | 19 | def on_train_epoch_start(self, trainer: Trainer, pl_module: LightningModule) -> None: 20 | epoch = pl_module.current_epoch 21 | kl_beta = self._anneal_fn(epoch) 22 | pl_module.set_kl_beta(kl_beta) # type: ignore 23 | 24 | def _anneal_fn(self, epoch): 25 | raise NotImplementedError 26 | 27 | 28 | class KLConstantSchedule(KLSchedule): 29 | def __init__(self): 30 | pass 31 | 32 | def on_train_epoch_start(self, trainer: Trainer, pl_module: LightningModule) -> None: 33 | pass 34 | 35 | def _anneal_fn(self, epoch: int) -> None: 36 | pass 37 | 38 | 39 | class KLSigmoidSchedule(KLSchedule): 40 | def _anneal_fn(self, epoch: int) -> float: 41 | if epoch < self.start_epoch: 42 | kl_beta = 0.0 43 | elif epoch > self.end_epoch: 44 | kl_beta = self.max_kl_beta 45 | else: 46 | scale = self.end_epoch - self.start_epoch 47 | shift = (self.end_epoch + self.start_epoch) / 2 48 | kl_beta = sigmoid(scale=scale, shift=shift, x=epoch) 
* self.max_kl_beta 49 | return kl_beta 50 | 51 | 52 | class KLLinearSchedule(KLSchedule): 53 | def _anneal_fn(self, epoch: int) -> float: 54 | if epoch < self.start_epoch: 55 | kl_beta = 0.0 56 | elif epoch > self.end_epoch: 57 | kl_beta = self.max_kl_beta 58 | else: 59 | kl_beta = self.max_kl_beta * (epoch - self.start_epoch) / (self.end_epoch - self.start_epoch) 60 | return kl_beta 61 | 62 | 63 | if __name__ == "__main__": 64 | import matplotlib 65 | import matplotlib.pyplot as plt 66 | 67 | matplotlib.use("TkAgg") 68 | import numpy as np 69 | 70 | kl = KLLinearSchedule(10, 50, 0.1) 71 | x = np.arange(200) 72 | y = [kl._anneal_fn(i) for i in x] 73 | plt.plot(x, y) 74 | 75 | kl2 = KLSigmoidSchedule(10, 50, 0.1) 76 | x = np.arange(200) 77 | y = [kl2._anneal_fn(i) for i in x] 78 | plt.plot(x, y) 79 | 80 | plt.show() 81 | -------------------------------------------------------------------------------- /hulc/utils/transforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | # source: https://github.com/facebookresearch/drqv2/blob/main/drqv2.py 8 | class RandomShiftsAug(nn.Module): 9 | def __init__(self, pad): 10 | super().__init__() 11 | self.pad = pad 12 | 13 | def forward(self, x): 14 | x = x.float() 15 | n, c, h, w = x.size() 16 | assert h == w 17 | padding = tuple([self.pad] * 4) 18 | x = F.pad(x, padding, "replicate") 19 | eps = 1.0 / (h + 2 * self.pad) 20 | arange = torch.linspace(-1.0 + eps, 1.0 - eps, h + 2 * self.pad, device=x.device, dtype=x.dtype)[:h] 21 | arange = arange.unsqueeze(0).repeat(h, 1).unsqueeze(2) 22 | base_grid = torch.cat([arange, arange.transpose(1, 0)], dim=2) 23 | base_grid = base_grid.unsqueeze(0).repeat(n, 1, 1, 1) 24 | 25 | shift = torch.randint(0, 2 * self.pad + 1, size=(n, 1, 1, 2), device=x.device, dtype=x.dtype) 26 | shift *= 2.0 / (h + 2 * self.pad) 27 | 28 | grid = base_grid + shift 29 | return F.grid_sample(x, grid, padding_mode="zeros", align_corners=False) 30 | 31 | 32 | class RelativeActions(object): 33 | """Transform absolute actions to relative""" 34 | 35 | def __init__(self, max_pos, max_orn): 36 | self.max_pos = max_pos 37 | self.max_orn = max_orn 38 | 39 | @staticmethod 40 | def batch_angle_between(a, b): 41 | diff = b - a 42 | return (diff + np.pi) % (2 * np.pi) - np.pi 43 | 44 | def __call__(self, action_and_obs): 45 | actions, robot_obs = action_and_obs 46 | assert isinstance(actions, np.ndarray) 47 | assert isinstance(robot_obs, np.ndarray) 48 | 49 | rel_pos = actions[:, :3] - robot_obs[:, :3] 50 | rel_pos = np.clip(rel_pos, -self.max_pos, self.max_pos) / self.max_pos 51 | 52 | rel_orn = self.batch_angle_between(robot_obs[:, 3:6], actions[:, 3:6]) 53 | rel_orn = np.clip(rel_orn, -self.max_orn, self.max_orn) / self.max_orn 54 | 55 | gripper = actions[:, -1:] 56 | return np.concatenate([rel_pos, rel_orn, gripper], axis=1) 57 | 58 | def __repr__(self): 59 | return self.__class__.__name__ + f"(max_pos={self.max_pos}, max_orn={self.max_orn})" 60 | -------------------------------------------------------------------------------- /hulc/utils/utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from calvin_agent.utils.utils import format_sftp_path 4 | from pytorch_lightning.utilities.cloud_io import load as pl_load 5 | 6 | 7 | def initialize_pretrained_weights(model, cfg): 8 | pretrain_chk = 
pl_load(format_sftp_path(Path(cfg.pretrain_chk)), map_location=lambda storage, loc: storage) 9 | batch_size = model.plan_recognition.position_embeddings.weight.shape[0] 10 | weight = "plan_recognition.position_embeddings.weight" 11 | pretrain_chk["state_dict"][weight] = pretrain_chk["state_dict"][weight][:batch_size] 12 | if "pretrain_exclude_pr" in cfg and cfg.pretrain_exclude_pr: 13 | for key in list(pretrain_chk["state_dict"].keys()): 14 | if key.startswith("plan_recognition"): 15 | del pretrain_chk["state_dict"][key] 16 | model.load_state_dict(pretrain_chk["state_dict"], strict=False) 17 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd calvin_env/tacto 4 | pip install -e . 5 | cd .. 6 | pip install -e . 7 | cd .. 8 | pip install -e . 9 | -------------------------------------------------------------------------------- /media/hulc_rollout.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lukashermann/hulc/7fdb09f5f1ff831ec8df29cfc61a41db573e9a06/media/hulc_rollout.gif -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | # https://github.com/psf/black 3 | line-length = 120 4 | target-version = ["py38"] 5 | exclude = "(.eggs|.git|.hg|.mypy_cache|.nox|.tox|.venv|.svn|_build|buck-out|build|dist)" 6 | 7 | [tool.isort] 8 | profile = "black" 9 | line_length = 120 10 | force_sort_within_sections = "True" 11 | order_by_type = "False" 12 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | black 2 | flake8 3 | isort 4 | pre-commit 5 | mypy 6 | pytest 7 | pytest-cov 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cmake 2 | wheel 3 | numpy>1.2 4 | hydra-core==1.1.1 5 | hydra-colorlog 6 | matplotlib 7 | opencv-python 8 | omegaconf 9 | plotly 10 | ftfy 11 | pytorch-lightning==1.8.6 12 | lightning_lite 13 | torch==1.13.1 14 | torchvision 15 | MulticoreTSNE 16 | gitpython 17 | scipy 18 | sentence-transformers 19 | gym 20 | moviepy 21 | tqdm 22 | termcolor 23 | wandb 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Setup hulc installation.""" 4 | 5 | from os import path as op 6 | import re 7 | 8 | from setuptools import find_packages, setup 9 | 10 | 11 | def _read(f): 12 | return open(op.join(op.dirname(__file__), f)).read() if op.exists(f) else "" 13 | 14 | 15 | _meta = _read("hulc/__init__.py") 16 | 17 | 18 | def find_meta(_meta, string): 19 | l_match = re.search(r"^" + string + r'\s*=\s*"(.*)"', _meta, re.M) 20 | if l_match: 21 | return l_match.group(1) 22 | raise RuntimeError(f"Unable to find {string} string.") 23 | 24 | 25 | install_requires = [ 26 | l for l in _read("requirements.txt").split("\n") if l and not l.startswith("#") and not l.startswith("-") 27 | ] 28 | 29 | meta = dict( 30 | name=find_meta(_meta, "__project__"), 31 | version=find_meta(_meta, "__version__"), 32 | 
license=find_meta(_meta, "__license__"), 33 | description="Hierarchical Universal Language Conditioned Policies", 34 | platforms=("Any"), 35 | zip_safe=False, 36 | keywords="pytorch hulc".split(), 37 | author=find_meta(_meta, "__author__"), 38 | author_email=find_meta(_meta, "__email__"), 39 | url=" https://github.com/mees/hulc", 40 | packages=find_packages(exclude=["tests"]), 41 | install_requires=install_requires, 42 | ) 43 | 44 | if __name__ == "__main__": 45 | print("find_package", find_packages(exclude=["tests"])) 46 | setup(**meta) 47 | -------------------------------------------------------------------------------- /setup_local.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Setup hulc installation.""" 4 | 5 | from os import path as op 6 | import re 7 | 8 | from setuptools import find_packages, setup 9 | 10 | 11 | def _read(f): 12 | return open(op.join(op.dirname(__file__), f)).read() if op.exists(f) else "" 13 | 14 | 15 | _meta = _read("hulc/__init__.py") 16 | 17 | 18 | def find_meta(_meta, string): 19 | l_match = re.search(r"^" + string + r'\s*=\s*"(.*)"', _meta, re.M) 20 | if l_match: 21 | return l_match.group(1) 22 | raise RuntimeError(f"Unable to find {string} string.") 23 | 24 | 25 | meta = dict( 26 | name=find_meta(_meta, "__project__"), 27 | version=find_meta(_meta, "__version__"), 28 | license=find_meta(_meta, "__license__"), 29 | description="Hierarchical Universal Language Conditioned Policies", 30 | platforms=("Any"), 31 | zip_safe=False, 32 | keywords="pytorch Lfp".split(), 33 | author=find_meta(_meta, "__author__"), 34 | author_email=find_meta(_meta, "__email__"), 35 | url=" https://github.com/mees/hulc", 36 | packages=find_packages(exclude=["tests"]), 37 | ) 38 | 39 | if __name__ == "__main__": 40 | print("find_package", find_packages(exclude=["tests"])) 41 | setup(**meta) 42 | -------------------------------------------------------------------------------- /slurm_scripts/README.md: -------------------------------------------------------------------------------- 1 | ## Training CALVIN on a Slurm Cluster 2 | ### Starting a training 3 | ```bash 4 | $ cd $HULC_ROOT/slurm_scripts 5 | $ python slurm_training.py --venv hulc_venv datamodule.root_data_dir=/path/to/dataset/ 6 | ``` 7 | This assumes that `--venv hulc_venv` specifies a conda environment. 8 | To use virtualenv instead, change line 18 of sbatch_lfp.sh accordingly. 9 | 10 | All hydra arguments can be used as in the normal training. 11 | 12 | Use the following optional command line arguments for slurm: 13 | - `--log_dir`: slurm log directory 14 | - `--job_name`: slurm job name 15 | - `--gpus`: number of gpus 16 | - `--mem`: memory 17 | - `--cpus`: number of cpus 18 | - `--days`: time limit in days 19 | - `--partition`: name of slurm partition 20 | 21 | The script will create a new folder in the specified log dir with a date tag and the job name. 22 | This is done *before* the job is submitted to the slurm queue. 23 | In order to ensure reproducibility, the current state of the calvin repository 24 | is copied to the log directory at *submit time* and is 25 | locally installed, such that you can schedule multiple trainings and there is no interference with 26 | future changes to the repository. 27 | 28 | ### Resuming a training 29 | Every job submission creates a `resume_training.sh` script in the log folder. To resume a training, 30 | call `$ sh /resume_training.sh`. By default, the model loads the latest saved checkpoint. 
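For example (hypothetical log path, following the naming scheme described above):
```bash
$ sh /home/$USER/logs/2022-03-01/10-30-00_play_training/resume_training.sh
```
The generated script simply re-runs the original `sbatch` command with the same Hydra overrides, so the job continues in the same log directory.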
31 | 32 | ### Evaluating a model 33 | To evaluate a trained model via slurm, run `$ sh /evaluate.sh`, which will automatically place a job on the 34 | same partition as it was trained on. Note that this script is also autogenerated. 35 | -------------------------------------------------------------------------------- /slurm_scripts/sbatch_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Print some information about the job to STDOUT 3 | echo "Workingdir: $PWD"; 4 | echo "Started at $(date)"; 5 | echo "Running job $SLURM_JOB_NAME"; 6 | echo "cpus per node: $SLURM_JOB_CPUS_PER_NODE"; 7 | echo "gres: $SLURM_GRES"; 8 | echo "mem: $SLURM_MEM_PER_NODE"; 9 | echo "ntasks: $SLURM_NTASKS"; 10 | echo "JID $SLURM_JOB_ID on queue $SLURM_JOB_PARTITION"; 11 | 12 | export NCCL_DEBUG=INFO 13 | export PYTHONFAULTHANDLER=1 14 | export HYDRA_FULL_ERROR=1 15 | 16 | # Job to perform 17 | source ~/.bashrc 18 | conda activate $1 19 | srun python ${@:2} 20 | 21 | # Print some Information about the end-time to STDOUT 22 | echo "DONE"; 23 | echo "Finished at $(date)"; 24 | -------------------------------------------------------------------------------- /slurm_scripts/sbatch_lfp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Print some information about the job to STDOUT 3 | echo "Workingdir: $PWD"; 4 | echo "Started at $(date)"; 5 | echo "Running job $SLURM_JOB_NAME"; 6 | echo "cpus per node: $SLURM_JOB_CPUS_PER_NODE"; 7 | echo "gres: $SLURM_GRES"; 8 | echo "mem: $SLURM_MEM_PER_NODE"; 9 | echo "ntasks: $SLURM_NTASKS"; 10 | echo "JID $SLURM_JOB_ID on queue $SLURM_JOB_PARTITION"; 11 | 12 | export NCCL_DEBUG=INFO 13 | export PYTHONFAULTHANDLER=1 14 | export HYDRA_FULL_ERROR=1 15 | 16 | # Job to perform 17 | source ~/.bashrc 18 | conda activate $1 19 | timeout 23.8h srun python $3 slurm=true hydra.run.dir=$4 trainer.devices=$5 ${@:6} 20 | 21 | if [[ $? -eq 124 ]]; then 22 | echo "Time limit exceeded. 
Resubmit job."; 23 | ssh ${USER}@$2 < 2 else np.inf 16 | 17 | checkpoints = get_all_checkpoints(training_dir) 18 | epochs = [str(e) for chk in checkpoints if (e := int(chk.stem.split("=")[1])) <= max_epoch] 19 | split_epochs = np.array_split(epochs, 8) 20 | epoch_args = [",".join(arr) for arr in split_epochs if len(arr)] 21 | for epoch_arg in epoch_args: 22 | cmd = [(training_dir / "evaluate.sh").as_posix(), "--checkpoints", epoch_arg, "--eval_log_dir", eval_log_dir] 23 | output = subprocess.check_output(cmd) 24 | print(output.decode("utf-8")) 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /slurm_scripts/slurm_training.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import os 4 | from pathlib import Path 5 | import stat 6 | import subprocess 7 | 8 | from git import Repo 9 | import numpy as np 10 | from setuptools import sandbox 11 | 12 | default_log_dir = f"/home/{os.environ['USER']}/logs" if "USER" in os.environ else "/tmp" 13 | if default_log_dir == "/tmp": 14 | print("CAUTION: logging to /tmp") 15 | parser = argparse.ArgumentParser(description="Parse slurm parameters and hydra config overrides") 16 | 17 | parser.add_argument("--script", type=str, default="./sbatch_lfp.sh") 18 | parser.add_argument("--train_file", type=str, default="../hulc/training.py") 19 | parser.add_argument("-l", "--log_dir", type=str, default=default_log_dir) 20 | parser.add_argument("-j", "--job_name", type=str, default="play_training") 21 | parser.add_argument("-g", "--gpus", type=int, default=1) 22 | parser.add_argument("--mem", type=int, default=0) # 0 means no memory limit 23 | parser.add_argument("--cpus", type=int, default=8) 24 | parser.add_argument("--days", type=int, default=1) 25 | parser.add_argument("-v", "--venv", type=str) 26 | parser.add_argument("-p", "--partition", type=str, default="alldlc_gpu-rtx2080") 27 | parser.add_argument("--login_node", type=str, default="kis3bat1") 28 | parser.add_argument("-x", "--exclude", type=str) 29 | parser.add_argument("--no_clone", action="store_true") 30 | args, unknownargs = parser.parse_known_args() 31 | 32 | 33 | assert np.all(["gpu" not in arg for arg in unknownargs]) 34 | assert np.all(["hydra.run.dir" not in arg for arg in unknownargs]) 35 | assert np.all(["log_dir" not in arg for arg in unknownargs]) 36 | assert np.all(["hydra.sweep.dir" not in arg for arg in unknownargs]) 37 | 38 | log_dir = Path(args.log_dir).absolute() / f'{datetime.datetime.now().strftime("%Y-%m-%d/%H-%M-%S")}_{args.job_name}' 39 | os.makedirs(log_dir) 40 | args.script = Path(args.script).absolute() 41 | args.train_file = Path(args.train_file).absolute() 42 | 43 | 44 | def create_git_copy(repo_src_dir, repo_target_dir): 45 | repo = Repo(repo_src_dir) 46 | repo.clone(repo_target_dir) 47 | orig_cwd = os.getcwd() 48 | os.chdir(repo_target_dir) 49 | os.environ["PYTHONPATH"] = os.getcwd() + ":" + os.environ.get("PYTHONPATH", "") 50 | sandbox.run_setup("setup_local.py", ["develop", "--install-dir", "."]) 51 | os.chdir(orig_cwd) 52 | 53 | 54 | if not args.no_clone: 55 | repo_src_dir = Path(__file__).absolute().parents[1] 56 | repo_target_dir = log_dir / "hulc" 57 | create_git_copy(repo_src_dir, repo_target_dir) 58 | 59 | args.script = repo_target_dir / os.path.relpath(args.script, repo_src_dir) 60 | args.train_file = repo_target_dir / os.path.relpath(args.train_file, repo_src_dir) 61 | 62 | if args.partition == "test": 63 
| args.partition = "testdlc_gpu-rtx2080" 64 | 65 | args.time = f"{args.days}-00:00" 66 | if args.partition == "testdlc_gpu-rtx2080": 67 | args.time = "01:00:00" 68 | 69 | job_opts = { 70 | "script": f"{args.script.as_posix()} {args.venv} {args.login_node} {args.train_file.as_posix()} {log_dir.as_posix()} {args.gpus} {' '.join(unknownargs)}", 71 | "partition": args.partition, 72 | "mem": args.mem, 73 | "ntasks-per-node": args.gpus, 74 | "cpus-per-task": args.cpus, 75 | "gres": f"gpu:{args.gpus}", 76 | "output": os.path.join(log_dir, "%x.%N.%j.out"), 77 | "error": os.path.join(log_dir, "%x.%N.%j.err"), 78 | "job-name": args.job_name, 79 | "mail-type": "END,FAIL", 80 | "time": args.time, 81 | } 82 | 83 | if args.exclude is not None: 84 | job_opts["exclude"] = ",".join(map(lambda x: f"dlcgpu{int(x):02d}", args.exclude.split(","))) 85 | 86 | 87 | def submit_job(job_info): 88 | # Construct sbatch command 89 | slurm_cmd = ["sbatch"] 90 | for key, value in job_info.items(): 91 | # Check for special case keys 92 | if key == "script": 93 | continue 94 | slurm_cmd.append(f"--{key}={value}") 95 | slurm_cmd.append(job_info["script"]) 96 | print("Generated slurm batch command: '%s'" % slurm_cmd) 97 | 98 | # Run sbatch command as subprocess. 99 | try: 100 | sbatch_output = subprocess.check_output(slurm_cmd) 101 | create_resume_script(slurm_cmd) 102 | except subprocess.CalledProcessError as e: 103 | # Print error message from sbatch for easier debugging, then pass on exception 104 | if sbatch_output is not None: 105 | print("ERROR: Subprocess call output: %s" % sbatch_output) 106 | raise e 107 | 108 | print(sbatch_output.decode("utf-8")) 109 | 110 | 111 | def create_resume_script(slurm_cmd): 112 | file_path = os.path.join(log_dir, "resume_training.sh") 113 | with open(file_path, "w") as file: 114 | file.write("#!/bin/bash\n") 115 | file.write(" ".join(slurm_cmd)) 116 | st = os.stat(file_path) 117 | os.chmod(file_path, st.st_mode | stat.S_IEXEC) 118 | 119 | 120 | def create_eval_script(): 121 | # Construct sbatch command 122 | eval_log_dir = log_dir / "evaluation" 123 | os.makedirs(eval_log_dir, exist_ok=True) 124 | eval_sbatch_script = Path("./sbatch_eval.sh").absolute() 125 | eval_file = args.train_file.parent / "evaluation/evaluate_policy.py" 126 | 127 | dataset_path = next(filter(lambda x: x.split("=")[0] == "datamodule.root_data_dir", unknownargs)).split("=")[1] 128 | 129 | eval_cmd = ["sbatch"] 130 | eval_job_opts = { 131 | "partition": args.partition, 132 | "mem": args.mem, 133 | "ntasks-per-node": 1, 134 | "cpus-per-task": 8, 135 | "gres": "gpu:1", 136 | "output": os.path.join(eval_log_dir, "%x.%N.%j.out"), 137 | "error": os.path.join(eval_log_dir, "%x.%N.%j.err"), 138 | "job-name": f"{args.job_name}_eval", 139 | "mail-type": "END,FAIL", 140 | "time": "1-00:00", 141 | } 142 | for key, value in eval_job_opts.items(): 143 | eval_cmd.append(f"--{key}={value}") 144 | eval_args = f"{eval_sbatch_script.as_posix()} {args.venv} {eval_file.as_posix()}" 145 | eval_args += f" --dataset_path {dataset_path}" 146 | eval_args += f" --train_folder {log_dir}" 147 | eval_args += " ${@:1}" 148 | eval_cmd.append(eval_args) 149 | 150 | file_path = os.path.join(log_dir, "evaluate.sh") 151 | with open(file_path, "w") as file: 152 | file.write("#!/bin/bash\n") 153 | file.write(" ".join(eval_cmd)) 154 | st = os.stat(file_path) 155 | os.chmod(file_path, st.st_mode | stat.S_IEXEC) 156 | 157 | 158 | submit_job(job_opts) 159 | create_eval_script() 160 | 
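# Example invocation (illustrative; the venv name, dataset path and partition are
# placeholders for your own cluster setup):
#
#   python slurm_training.py --venv hulc_venv -g 4 -p alldlc_gpu-rtx2080 \
#       datamodule.root_data_dir=/path/to/dataset
#
# This clones the repository into the dated log directory, submits the training job
# via sbatch, and writes resume_training.sh and evaluate.sh next to the logs.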
--------------------------------------------------------------------------------