├── ovon ├── models │ ├── pointnav.py │ ├── encoders │ │ ├── habitat_resnet.py │ │ ├── vc1_encoder.py │ │ ├── siglip_encoder.py │ │ ├── dinov2_encoder.py │ │ ├── make_encoder.py │ │ ├── cma_xattn.py │ │ ├── depth_encoder.py │ │ ├── visual_encoder.py │ │ ├── cross_attention.py │ │ ├── vit.py │ │ └── visual_encoder_v2.py │ ├── visual_encoders.py │ ├── transformer_policy.py │ └── transforms.py ├── task │ ├── __init__.py │ ├── simulator.py │ └── rewards.py ├── measurements │ ├── __init__.py │ ├── collision_penalty.py │ ├── sum_reward.py │ └── imagenav.py ├── utils │ ├── analysis │ │ ├── My_HM3D_Missing_Tags_Taxonomy.csv │ │ ├── fixes_to_provided_mappings.ods │ │ ├── fixes_to_provided_mappings.csv │ │ ├── postprocess_meta.py │ │ └── Issues_with_provided_mappings.csv │ ├── test_dataset.py │ ├── plot_tsne.py │ ├── rollout_storage_no_2d.py │ ├── visualize_policy_embeddings.py │ ├── visualize │ │ ├── statistics.py │ │ ├── viz.py │ │ └── semantic_nav_analysis.py │ ├── dagger_environment_worker.py │ ├── shuffle_episodes.py │ ├── utils.py │ ├── cache_clip_embeddings.py │ ├── lr_scheduler.py │ ├── visualize_trajectories.py │ └── sample_episodes.py ├── dataset │ ├── __init__.py │ └── ovon_dataset.py ├── trainers │ ├── envs.py │ ├── inference_worker_with_kv.py │ ├── ppo_trainer_no_2d.py │ ├── ver_rollout_storage_with_kv.py │ └── dagger_trainer.py ├── __init__.py ├── test.py └── obs_transformers │ ├── relabel_imagegoal.py │ ├── relabel_teacher_actions.py │ └── resize.py ├── setup.py ├── docs └── ovon_task.jpg ├── .gitmodules ├── .gitignore ├── config ├── tasks │ ├── objectnav_locobot_hm3d.yaml │ ├── objectnav_stretch_hm3d.yaml │ └── imagenav_stretch_hm3d.yaml └── experiments │ ├── rnn_rl.yaml │ ├── transformer_rl.yaml │ ├── rnn_rl_finetune.yaml │ ├── rnn_dagger.yaml │ ├── transformer_rl_finetune.yaml │ └── transformer_dagger.yaml ├── scripts ├── dataset │ ├── generate_objectnav_dataset.sh │ ├── submit_language_annotation_jobs.sh │ ├── clean_full_dataset.sh │ ├── clean_scene_dataset.sh │ ├── submit_data_gen_jobs.sh │ ├── generate_val_dataset.sh │ ├── generate_dataset.sh │ ├── generate_languagenav_dataset.sh │ └── generate_val_episode_counts.py ├── prepare_dataset.sh ├── train_ddppo.sh ├── train_ver.sh ├── train │ ├── 2-imagenav-ver.sh │ ├── 1-ovon-ver.sh │ ├── 3-ovon-ver-scratch.sh │ └── 4-objectnav-transformer.sh └── eval │ ├── 4-objectnav-transformer.sh │ └── 3-ovon-ver-eval-analysis.sh ├── setup.sh ├── .pre-commit-config.yaml ├── pyproject.toml └── README.md /ovon/models/pointnav.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ovon/task/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ovon/measurements/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() 4 | -------------------------------------------------------------------------------- /ovon/utils/analysis/My_HM3D_Missing_Tags_Taxonomy.csv: -------------------------------------------------------------------------------- 1 | unknown" 2 | -------------------------------------------------------------------------------- /docs/ovon_task.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/naokiyokoyama/ovon/HEAD/docs/ovon_task.jpg -------------------------------------------------------------------------------- /ovon/utils/analysis/fixes_to_provided_mappings.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naokiyokoyama/ovon/HEAD/ovon/utils/analysis/fixes_to_provided_mappings.ods -------------------------------------------------------------------------------- /ovon/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from ovon.dataset import ( 2 | generate_viewpoints, 3 | objectnav_generator, 4 | pose_sampler, 5 | semantic_utils, 6 | visualization, 7 | ) 8 | from ovon.dataset.ovon_dataset import OVONDatasetV1 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "distributed_dagger"] 2 | path = distributed_dagger 3 | url = git@github.com:naokiyokoyama/distributed_dagger 4 | [submodule "frontier_exploration"] 5 | path = frontier_exploration 6 | url = git@github.com:naokiyokoyama/frontier_exploration 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # python 2 | __pycache__/ 3 | *.egg-info/ 4 | 5 | # environment 6 | variables.sh 7 | habitat-lab 8 | 9 | # development 10 | .idea 11 | .vscode 12 | 13 | # data 14 | data/ 15 | 16 | # logs 17 | tb/ 18 | train.log 19 | slurm_logs/ 20 | 21 | # scripts 22 | scripts/ 23 | -------------------------------------------------------------------------------- /ovon/models/encoders/habitat_resnet.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | from habitat_baselines.rl.ddppo.policy.resnet_policy import ResNetEncoder 7 | 8 | 9 | class HabitatResNetEncoder(nn.Module): 10 | def __init__(self, *args, **kwargs): 11 | super().__init__() 12 | self.backbone = ResNetEncoder(**kwargs) 13 | self.output_size = np.prod(self.backbone.output_shape) 14 | self.output_shape = self.backbone.output_shape 15 | 16 | def forward(self, observations: Dict, *args, **kwargs) -> torch.Tensor: 17 | return self.backbone(observations) 18 | -------------------------------------------------------------------------------- /config/tasks/objectnav_locobot_hm3d.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - /habitat: habitat_config_base 5 | - /habitat/task: objectnav 6 | - /habitat/simulator/agents@habitat.simulator.agents.main_agent: rgb_agent 7 | - /habitat/dataset/objectnav: hm3d 8 | - _self_ 9 | 10 | habitat: 11 | environment: 12 | max_episode_steps: 500 13 | 14 | simulator: 15 | turn_angle: 30 16 | tilt_angle: 30 17 | action_space_config: "v1" 18 | agents: 19 | main_agent: 20 | sim_sensors: 21 | rgb_sensor: 22 | width: 640 23 | height: 480 24 | hfov: 79 25 | position: [0, 0.88, 0] 26 | height: 0.88 27 | radius: 0.18 28 | habitat_sim_v0: 29 | gpu_device_id: 0 30 | allow_sliding: False 31 | -------------------------------------------------------------------------------- /config/tasks/objectnav_stretch_hm3d.yaml: 
-------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - /habitat: habitat_config_base 5 | - /habitat/task: objectnav 6 | - /habitat/simulator/agents@habitat.simulator.agents.main_agent: rgb_agent 7 | - /habitat/dataset/objectnav: hm3d 8 | - _self_ 9 | 10 | habitat: 11 | environment: 12 | max_episode_steps: 500 13 | 14 | simulator: 15 | turn_angle: 30 16 | tilt_angle: 30 17 | action_space_config: "v1" 18 | agents: 19 | main_agent: 20 | sim_sensors: 21 | rgb_sensor: 22 | width: 360 23 | height: 640 24 | hfov: 42 25 | position: [0, 1.31, 0] 26 | height: 1.41 27 | radius: 0.17 28 | habitat_sim_v0: 29 | gpu_device_id: 0 30 | allow_sliding: False 31 | -------------------------------------------------------------------------------- /scripts/dataset/generate_objectnav_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=ovon 3 | #SBATCH --output=slurm_logs/dataset-%j.out 4 | #SBATCH --error=slurm_logs/dataset-%j.err 5 | #SBATCH --gpus 4 6 | #SBATCH --nodes 1 7 | #SBATCH --cpus-per-task 10 8 | #SBATCH --ntasks-per-node 1 9 | #SBATCH --constraint=a40 10 | #SBATCH --partition=short 11 | #SBATCH --exclude=conroy 12 | #SBATCH --signal=USR1@100 13 | #SBATCH --requeue 14 | 15 | export GLOG_minloglevel=2 16 | export HABITAT_SIM_LOG=quiet 17 | export MAGNUM_LOG=quiet 18 | 19 | MAIN_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1) 20 | export MAIN_ADDR 21 | 22 | srun python ovon/dataset/objectnav_generator.py \ 23 | --split train \ 24 | --tasks-per-gpu 12 \ 25 | --start-poses-per-object 8000 \ 26 | --use-v1-scenes \ 27 | --multiprocessing 28 | -------------------------------------------------------------------------------- /scripts/dataset/submit_language_annotation_jobs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | path=$1 4 | split=$2 5 | model=$3 6 | 7 | export OPENAI_APIKEY= 8 | 9 | count=0 10 | prompt_meta_files=`ls ${path}/${split}/content/*_meta.json` 11 | for i in ${prompt_meta_files[@]} 12 | do 13 | scene_id=`basename $i` 14 | base=${scene_id%.*} # remove .gz 15 | base=${base%.*} # remove .json 16 | 17 | if [ -f "${path}/${split}/content/${base}_annotated.json" ]; then 18 | echo "Skipping ${base}" 19 | continue 20 | fi 21 | 22 | meta_path="${path}/${split}/content/${base}.json" 23 | meta_output_path="${path}/${split}/content/${base}_annotated.json" 24 | 25 | echo "Submitting ${base} - ${count}" 26 | python ovon/dataset/generate_captions.py --path $meta_path --output-path $meta_output_path --model $model 27 | count=$((count + 1)) 28 | done 29 | -------------------------------------------------------------------------------- /scripts/dataset/clean_full_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | split=$1 4 | content_dir="data/datasets/ovon/hm3d/v3_shuffled/${split}/content" 5 | output_path="data/datasets/ovon/hm3d/v3_shuffled_cleaned/${split}/content" 6 | 7 | gz_files=`ls ${content_dir}/*json.gz` 8 | for i in ${gz_files[@]} 9 | do 10 | scene_id=`basename $i` 11 | base=${scene_id%.*} # remove .gz 12 | base=${base%.*} # remove .json 13 | 14 | if [ -f "${output_path}/${base}.json.gz" ]; then 15 | echo "Skipping ${base}" 16 | continue 17 | fi 18 | echo "Submitting ${base}" 19 | 20 | sbatch --job-name=clean-${split}-${base} \ 21 | --output=slurm_logs/dataset/clean-${split}-${base}.out \ 22 | 
--error=slurm_logs/dataset/clean-${split}-${base}.err \ 23 | --export=ALL,scene_path=$i,output_path=$output_path \ 24 | scripts/dataset/clean_scene_dataset.sh 25 | done 26 | 27 | -------------------------------------------------------------------------------- /scripts/dataset/clean_scene_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --gpus 1 3 | #SBATCH --nodes 1 4 | #SBATCH --cpus-per-task 7 5 | #SBATCH --ntasks-per-node 1 6 | #SBATCH --signal=USR1@100 7 | #SBATCH --requeue 8 | #SBATCH --constraint="a40|rtx_6000|2080_ti" 9 | #SBATCH --partition=short 10 | #SBATCH --exclude calculon,alexa,cortana,bmo,c3po,ripl-s1,t1000,hal,irona,fiona 11 | 12 | export GLOG_minloglevel=2 13 | export HABITAT_SIM_LOG=quiet 14 | export MAGNUM_LOG=quiet 15 | 16 | MAIN_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1) 17 | export MAIN_ADDR 18 | 19 | source /srv/flash1/rramrakhya3/miniconda3/etc/profile.d/conda.sh 20 | conda deactivate 21 | conda activate ovon 22 | 23 | echo "\n" 24 | echo $scene_path 25 | echo $(which python) 26 | echo "ola" 27 | 28 | srun python ovon/dataset/clean_episodes.py --path $scene_path --output-path $output_path -------------------------------------------------------------------------------- /config/tasks/imagenav_stretch_hm3d.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - /habitat: habitat_config_base 5 | - /habitat/task: imagenav 6 | - /habitat/simulator/agents@habitat.simulator.agents.main_agent: rgb_agent 7 | - _self_ 8 | 9 | habitat: 10 | environment: 11 | max_episode_steps: 500 12 | 13 | simulator: 14 | turn_angle: 30 15 | tilt_angle: 30 16 | action_space_config: "v1" 17 | agents: 18 | main_agent: 19 | sim_sensors: 20 | rgb_sensor: 21 | width: 360 22 | height: 640 23 | hfov: 42 24 | position: [0, 1.31, 0] 25 | height: 1.41 26 | radius: 0.17 27 | habitat_sim_v0: 28 | gpu_device_id: 0 29 | allow_sliding: False 30 | 31 | dataset: 32 | type: PointNav-v1 33 | split: train 34 | data_path: data/datasets/imagenav/hm3d/v1_stretch/{split}/{split}.json.gz 35 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | # Record of how the environment was set up 2 | # Create conda environment. Mamba is recommended for faster installation. 3 | conda_env_name=ovon 4 | mamba create -n $conda_env_name python=3.7 cmake=3.14.0 -y 5 | mamba install -n $conda_env_name \ 6 | habitat-sim=0.2.3 headless pytorch=1.12.1 cudatoolkit=11.3 \ 7 | -c pytorch -c nvidia -c conda-forge -c aihabitat -y 8 | 9 | # Install this repo as a package 10 | mamba activate $conda_env_name 11 | pip install -e . 12 | 13 | # Install frontier_exploration 14 | cd frontier_exploration && pip install -e . && cd .. 
15 | 16 | # Install habitat-lab 17 | git clone --branch v0.2.3 git@github.com:facebookresearch/habitat-lab.git 18 | cd habitat-lab 19 | pip install -e habitat-lab 20 | pip install -e habitat-baselines 21 | 22 | pip install ftfy regex tqdm GPUtil trimesh seaborn timm scikit-learn einops transformers 23 | pip install git+https://github.com/openai/CLIP.git 24 | -------------------------------------------------------------------------------- /ovon/trainers/envs.py: -------------------------------------------------------------------------------- 1 | # from typing import Optional 2 | 3 | # import gym 4 | # import habitat 5 | # import numpy as np 6 | # from habitat import Dataset 7 | # from habitat.core.environments import RLTaskEnv 8 | # from habitat.gym.gym_wrapper import HabGymWrapper 9 | 10 | 11 | # class CustomRLTaskEnv(RLTaskEnv): 12 | # def after_update(self): 13 | # self._env.episode_iterator.after_update() 14 | 15 | 16 | # @habitat.registry.register_env(name="CustomGymHabitatEnv") 17 | # class CustomGymHabitatEnv(gym.Wrapper): 18 | # """ 19 | # A registered environment that wraps a RLTaskEnv with the HabGymWrapper 20 | # to use the default gym API. 21 | # """ 22 | 23 | # def __init__( 24 | # self, config: "DictConfig", dataset: Optional[Dataset] = None 25 | # ): 26 | # base_env = CustomRLTaskEnv(config=config, dataset=dataset) 27 | # env = HabGymWrapper(env=base_env) 28 | # super().__init__(env) 29 | -------------------------------------------------------------------------------- /scripts/dataset/submit_data_gen_jobs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | scenes_dir=$1 4 | split=$2 5 | output_path=$3 6 | start_poses_per_object=2000 7 | episodes_per_object=50 8 | 9 | glb_files=`ls ${scenes_dir}/*/*.semantic.glb` 10 | for i in ${glb_files[@]} 11 | do 12 | scene_id=`basename $i` 13 | base=${scene_id%.*} # remove .gz 14 | base=${base%.*} # remove .json 15 | 16 | if [ -f "${output_path}/${split}/content/${base}.json.gz" ]; then 17 | echo "Skipping ${base}" 18 | continue 19 | fi 20 | 21 | echo "Submitting ${base}" 22 | sbatch --job-name=$split-${base} \ 23 | --output=slurm_logs/dataset/lnav-$split-${base}.out \ 24 | --error=slurm_logs/dataset/lnav-$split-${base}.err \ 25 | --gpus 1 \ 26 | --cpus-per-task 6 \ 27 | --export=ALL,scene=$i,num_tasks=1,split=$split,num_scenes=1,output_path=$output_path,start_poses_per_object=$start_poses_per_object,episodes_per_object=$episodes_per_object \ 28 | scripts/dataset/generate_languagenav_dataset.sh 29 | done 30 | 31 | -------------------------------------------------------------------------------- /scripts/prepare_dataset.sh: -------------------------------------------------------------------------------- 1 | valid=true 2 | 3 | # Check for two args 4 | if [ -z "$2" ] 5 | then 6 | echo "Please provide two paths to hm3d-train-glb-v0.2 and hm3d-val-glb-v0.2 as args" 7 | valid=false 8 | fi 9 | 10 | # Check for valid paths 11 | if [ ! -d "$1" ] 12 | then 13 | echo "$1 is not a valid directory!" 14 | echo "Please provide a valid path for hm3d-train-glb-v0.2" 15 | valid=false 16 | fi 17 | 18 | if [ ! -d "$2" ] 19 | then 20 | echo "$2 is not a valid directory!" 
21 | echo "Please provide a valid path for hm3d-val-glb-v0.2" 22 | valid=false 23 | fi 24 | 25 | if [ $valid = true ] 26 | then 27 | mkdir -p habitat-lab/data/scene_datasets/hm3d && 28 | echo "Creating symlink from $1 to habitat-lab/data/scene_datasets/hm3d" && 29 | ln -s `realpath $1` habitat-lab/data/scene_datasets/hm3d/train && 30 | echo "Creating symlink from $2 to habitat-lab/data/scene_datasets/hm3d" && 31 | ln -s `realpath $2` habitat-lab/data/scene_datasets/hm3d/val && 32 | echo "Done" 33 | fi 34 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/charliermarsh/ruff-pre-commit 3 | rev: 'v0.0.263' 4 | hooks: 5 | - id: ruff 6 | args: ['--fix', '--config', 'pyproject.toml'] 7 | 8 | - repo: https://github.com/pre-commit/pre-commit-hooks 9 | rev: v4.4.0 10 | hooks: 11 | - id: end-of-file-fixer 12 | - id: trailing-whitespace 13 | - id: check-yaml 14 | - id: check-added-large-files 15 | - id: check-toml 16 | 17 | - repo: https://github.com/psf/black 18 | rev: 23.3.0 19 | hooks: 20 | - id: black 21 | language_version: python3.7 22 | args: ['--config', 'pyproject.toml'] 23 | verbose: true 24 | 25 | # - repo: https://github.com/pre-commit/mirrors-mypy 26 | # rev: v1.2.0 27 | # hooks: 28 | # - id: mypy 29 | # pass_filenames: false 30 | # additional_dependencies: 31 | # - types-protobuf 32 | # - types-requests 33 | # - types-simplejson 34 | # - types-ujson 35 | # - types-PyYAML 36 | # - types-toml 37 | # - types-six 38 | -------------------------------------------------------------------------------- /scripts/dataset/generate_val_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=ovon-dgen 3 | #SBATCH --output=slurm_logs/dataset-%j.out 4 | #SBATCH --error=slurm_logs/dataset-%j.err 5 | #SBATCH --gpus 1 6 | #SBATCH --nodes 1 7 | #SBATCH --cpus-per-task 16 8 | #SBATCH --ntasks-per-node 1 9 | #SBATCH --partition=short 10 | #SBATCH --exclude=conroy,ig-88 11 | #SBATCH --signal=USR1@100 12 | #SBATCH --requeue 13 | 14 | export GLOG_minloglevel=2 15 | export HABITAT_SIM_LOG=quiet 16 | export MAGNUM_LOG=quiet 17 | 18 | MAIN_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1) 19 | export MAIN_ADDR 20 | 21 | source /srv/flash1/rramrakhya3/miniconda3/etc/profile.d/conda.sh 22 | conda deactivate 23 | conda activate ovon 24 | 25 | SPLIT="val" 26 | NUM_TASKS=$1 27 | OUTPUT_PATH=$2 28 | NUM_SCENES=-1 29 | 30 | srun python ovon/dataset/objectnav_generator.py \ 31 | --split $SPLIT \ 32 | --num-scenes $NUM_SCENES \ 33 | --tasks-per-gpu $NUM_TASKS \ 34 | --start-poses-per-object 200 \ 35 | --episodes-per-object 10 \ 36 | --output-path $OUTPUT_PATH \ 37 | --multiprocessing \ 38 | --disable-euc-geo-ratio-check 39 | -------------------------------------------------------------------------------- /scripts/train_ddppo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=ovon 3 | #SBATCH --output=slurm_logs/ovon-ddppo-%j.out 4 | #SBATCH --error=slurm_logs/ovon-ddppo-%j.err 5 | #SBATCH --gpus 4 6 | #SBATCH --nodes 1 7 | #SBATCH --cpus-per-task 10 8 | #SBATCH --ntasks-per-node 4 9 | #SBATCH --constraint=a40 10 | #SBATCH --partition=short 11 | #SBATCH --signal=USR1@100 12 | #SBATCH --requeue 13 | 14 | 15 | export GLOG_minloglevel=2 16 | export HABITAT_SIM_LOG=quiet 17 | export 
MAGNUM_LOG=quiet 18 | 19 | MAIN_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1) 20 | export MAIN_ADDR 21 | 22 | TENSORBOARD_DIR="tb/objectnav/ddppo/resnetclip/seed_1" 23 | CHECKPOINT_DIR="data/new_checkpoints/objectnav/ddppo/resnetclip/seed_1" 24 | 25 | srun python -um ovon.run \ 26 | --run-type train \ 27 | --exp-config config/experiments/ver_objectnav.yaml \ 28 | habitat_baselines.rl.policy.name=PointNavResNetCLIPPolicy \ 29 | habitat_baselines.rl.ddppo.train_encoder=False \ 30 | habitat_baselines.rl.ddppo.backbone=resnet50_clip_avgpool \ 31 | habitat_baselines.tensorboard_dir=${TENSORBOARD_DIR} \ 32 | habitat_baselines.checkpoint_folder=${CHECKPOINT_DIR} \ 33 | -------------------------------------------------------------------------------- /scripts/dataset/generate_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=ovon-dgen 3 | #SBATCH --output=slurm_logs/dataset-%j.out 4 | #SBATCH --error=slurm_logs/dataset-%j.err 5 | #SBATCH --gpus 1 6 | #SBATCH --nodes 1 7 | #SBATCH --cpus-per-task 6 8 | #SBATCH --ntasks-per-node 1 9 | #SBATCH --constraint="a40|rtx_6000|2080_ti" 10 | #SBATCH --partition=short 11 | #SBATCH --exclude calculon,alexa,cortana,bmo,c3po,ripl-s1,t1000,hal,irona,fiona 12 | #SBATCH --signal=USR1@100 13 | #SBATCH --requeue 14 | 15 | export GLOG_minloglevel=2 16 | export HABITAT_SIM_LOG=quiet 17 | export MAGNUM_LOG=quiet 18 | 19 | MAIN_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1) 20 | export MAIN_ADDR 21 | 22 | source /srv/flash1/rramrakhya3/miniconda3/etc/profile.d/conda.sh 23 | conda deactivate 24 | conda activate ovon 25 | 26 | # SPLIT=$1 27 | # NUM_TASKS=$2 28 | # OUTPUT_PATH=$3 29 | # NUM_SCENES=-1 30 | 31 | srun python ovon/dataset/objectnav_generator.py \ 32 | --scene $scene \ 33 | --split $split \ 34 | --num-scenes $num_scenes \ 35 | --tasks-per-gpu $num_tasks \ 36 | --output-path $output_path \ 37 | --start-poses-per-object $start_poses_per_object \ 38 | --episodes-per-object $episodes_per_object \ 39 | --disable-euc-geo-ratio-check 40 | -------------------------------------------------------------------------------- /ovon/__init__.py: -------------------------------------------------------------------------------- 1 | from ovon import config 2 | from ovon.dataset import ovon_dataset 3 | from ovon.measurements import collision_penalty, nav, sum_reward 4 | from ovon.models import ( 5 | clip_policy, 6 | objaverse_clip_policy, 7 | ovrl_policy, 8 | transformer_policy, 9 | ) 10 | from ovon.obs_transformers import ( 11 | image_goal_encoder, 12 | relabel_imagegoal, 13 | relabel_teacher_actions, 14 | resize, 15 | ) 16 | from ovon.task import rewards, sensors, simulator 17 | from ovon.trainers import dagger_trainer, ppo_trainer_no_2d, ver_trainer 18 | from ovon.utils import visualize_trajectories 19 | 20 | try: 21 | import frontier_exploration 22 | except ModuleNotFoundError as e: 23 | # If the error was due to the frontier_exploration package not being installed, then 24 | # pass, but warn. Do not pass if it was due to another package being missing. 25 | if e.name != "frontier_exploration": 26 | raise e 27 | else: 28 | print( 29 | "Warning: frontier_exploration package not installed. Things may not work. " 30 | "To install:\n" 31 | "git clone git@github.com:naokiyokoyama/frontier_exploration.git &&\n" 32 | "cd frontier_exploration && pip install -e ." 
33 | ) 34 | -------------------------------------------------------------------------------- /ovon/test.py: -------------------------------------------------------------------------------- 1 | import habitat_sim 2 | 3 | GOAL_CATEGORIES = [ 4 | "chair", 5 | "bed", 6 | "plant", 7 | "toilet", 8 | "tv_monitor", 9 | "sofa", 10 | ] 11 | 12 | backend_cfg = habitat_sim.SimulatorConfiguration() 13 | backend_cfg.scene_id = ( 14 | "data/scene_datasets/hm3d/minival/00800-TEEsavR23oF/TEEsavR23oF.basis.glb" 15 | ) 16 | backend_cfg.scene_dataset_config_file = ( 17 | "data/scene_datasets/hm3d/hm3d_annotated_basis.scene_dataset_config.json" 18 | ) 19 | 20 | rgb_cfg = habitat_sim.CameraSensorSpec() 21 | rgb_cfg.uuid = "rgb" 22 | rgb_cfg.sensor_type = habitat_sim.SensorType.COLOR 23 | 24 | 25 | sem_cfg = habitat_sim.CameraSensorSpec() 26 | sem_cfg.uuid = "semantic" 27 | sem_cfg.sensor_type = habitat_sim.SensorType.SEMANTIC 28 | 29 | agent_cfg = habitat_sim.agent.AgentConfiguration() 30 | agent_cfg.sensor_specifications = [rgb_cfg, sem_cfg] 31 | 32 | sim_cfg = habitat_sim.Configuration(backend_cfg, [agent_cfg]) 33 | sim = habitat_sim.Simulator(sim_cfg) 34 | 35 | total_objects = len(sim.semantic_scene.objects) 36 | print("Total objects - {}".format(total_objects)) 37 | 38 | for obj in sim.semantic_scene.objects: 39 | if obj.category.name("") in GOAL_CATEGORIES: 40 | print(obj.category.name(""), obj.obb.center, obj.aabb.center, obj.obb.sizes) 41 | -------------------------------------------------------------------------------- /scripts/train_ver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=ovon 3 | #SBATCH --output=slurm_logs/ovon-ver-%j.out 4 | #SBATCH --error=slurm_logs/ovon-ver-%j.err 5 | #SBATCH --gpus 2 6 | #SBATCH --nodes 1 7 | #SBATCH --cpus-per-task 10 8 | #SBATCH --ntasks-per-node 2 9 | #SBATCH --constraint=a40 10 | #SBATCH --partition=short 11 | #SBATCH --signal=USR1@100 12 | #SBATCH --requeue 13 | 14 | export GLOG_minloglevel=2 15 | export HABITAT_SIM_LOG=quiet 16 | export MAGNUM_LOG=quiet 17 | 18 | MAIN_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1) 19 | export MAIN_ADDR 20 | 21 | TENSORBOARD_DIR="tb/objectnav/ver/resnetclip/hm3d_v0.2_22_cat/seed_1" 22 | CHECKPOINT_DIR="data/new_checkpoints/objectnav/ver/resnetclip/hm3d_v0.2_22_cat/seed_1" 23 | DATA_PATH="data/datasets/objectnav/hm3d_semantic_v0.2/v1" 24 | 25 | srun python -um ovon.run \ 26 | --run-type train \ 27 | --exp-config config/experiments/ver_objectnav.yaml \ 28 | habitat_baselines.trainer_name="ver" \ 29 | habitat_baselines.rl.policy.name=PointNavResNetCLIPPolicy \ 30 | habitat_baselines.rl.ddppo.train_encoder=False \ 31 | habitat_baselines.rl.ddppo.backbone=resnet50_clip_avgpool \ 32 | habitat_baselines.tensorboard_dir=${TENSORBOARD_DIR} \ 33 | habitat_baselines.checkpoint_folder=${CHECKPOINT_DIR} \ 34 | habitat.dataset.data_path=${DATA_PATH}/train/train.json.gz 35 | -------------------------------------------------------------------------------- /scripts/dataset/generate_languagenav_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=ovon-dgen 3 | #SBATCH --output=slurm_logs/dataset-%j.out 4 | #SBATCH --error=slurm_logs/dataset-%j.err 5 | #SBATCH --gpus 1 6 | #SBATCH --nodes 1 7 | #SBATCH --cpus-per-task 6 8 | #SBATCH --ntasks-per-node 1 9 | #SBATCH --constraint="a40|rtx_6000|2080_ti" 10 | #SBATCH --partition=short 11 | #SBATCH --exclude 
calculon,alexa,cortana,bmo,c3po,ripl-s1,t1000,hal,irona,fiona 12 | #SBATCH --signal=USR1@100 13 | #SBATCH --requeue 14 | 15 | export GLOG_minloglevel=2 16 | export HABITAT_SIM_LOG=quiet 17 | export MAGNUM_LOG=quiet 18 | 19 | MAIN_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1) 20 | export MAIN_ADDR 21 | 22 | source /srv/flash1/rramrakhya3/miniconda3/etc/profile.d/conda.sh 23 | conda deactivate 24 | conda activate ovon 25 | 26 | # SPLIT=$1 27 | # NUM_TASKS=$2 28 | # OUTPUT_PATH=$3 29 | # NUM_SCENES=-1 30 | export PYTHONPATH=/srv/flash1/rramrakhya3/spring_2023/habitat-sim/src_python/ 31 | export HOME=/srv/flash1/rramrakhya3/summer_2023/ 32 | 33 | srun python ovon/dataset/languagenav_generator.py \ 34 | --scene $scene \ 35 | --split $split \ 36 | --num-scenes $num_scenes \ 37 | --tasks-per-gpu $num_tasks \ 38 | --output-path $output_path \ 39 | --start-poses-per-object $start_poses_per_object \ 40 | --episodes-per-object $episodes_per_object \ 41 | --with-start-poses 42 | -------------------------------------------------------------------------------- /ovon/models/encoders/vc1_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from habitat_baselines.common.tensor_dict import TensorDict 4 | 5 | from ovon.obs_transformers.resize import image_resize 6 | 7 | 8 | class VC1Encoder(nn.Module): 9 | def __init__(self): 10 | from vc_models.models.vit import model_utils 11 | 12 | super().__init__() 13 | ( 14 | self.model, 15 | self.output_size, 16 | self.model_transforms, 17 | model_info, 18 | ) = model_utils.load_model(model_utils.VC1_BASE_NAME) 19 | self.output_shape = (self.output_size,) 20 | 21 | def forward(self, observations: "TensorDict", *args, **kwargs) -> torch.Tensor: 22 | rgb = observations["rgb"] 23 | if rgb.dtype == torch.uint8: 24 | rgb = rgb.float() / 255.0 25 | 26 | if rgb.shape[1] != 224 and rgb.shape[2] != 224: 27 | # The img loaded should be Bx3x250x250 28 | rgb = image_resize(rgb, size=(250, 250), channels_last=True) 29 | 30 | assert rgb.shape[1] == 224 and rgb.shape[2] == 224 31 | # Change the channels to be first 32 | rgb = rgb.permute(0, 3, 1, 2) 33 | with torch.inference_mode(): 34 | # Output will be of size Bx3x224x224 35 | x = self.model_transforms(rgb) 36 | # Embedding will be 1x768 37 | x = self.model(x) 38 | 39 | return x 40 | -------------------------------------------------------------------------------- /ovon/utils/test_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | 5 | from ovon.utils.utils import count_episodes, load_dataset 6 | 7 | 8 | def test_dataset(path): 9 | print(path) 10 | files = glob.glob(os.path.join(path, "*.json.gz")) 11 | dataset = load_dataset(files[0]) 12 | 13 | print("Total # of episodes: {}".format(count_episodes(dataset))) 14 | 15 | for ep in dataset["episodes"]: 16 | if len(ep["children_object_categories"]) > 0: 17 | print("Found episode with children object categories") 18 | break 19 | 20 | print("Object goal: {}".format(ep["object_category"])) 21 | for children in ep["children_object_categories"]: 22 | print(children) 23 | scene_id = ep["scene_id"].split("/")[-1] 24 | goal_key = f"{scene_id}_{children}" 25 | 26 | # Ignore if there are no valid viewpoints for goal 27 | if goal_key not in dataset["goals_by_category"]: 28 | print("No valid viewpoints for child: {}".format(children)) 29 | continue 30 | print( 31 | "Viewpoints: {} for child: 
{}".format( 32 | len(dataset["goals_by_category"][goal_key]), children 33 | ) 34 | ) 35 | 36 | 37 | if __name__ == "__main__": 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument("--path", type=str, default="data/ovon_dataset.json") 40 | args = parser.parse_args() 41 | 42 | test_dataset(args.path) 43 | -------------------------------------------------------------------------------- /ovon/trainers/inference_worker_with_kv.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.context import BaseContext 2 | 3 | from habitat_baselines.common.tensor_dict import TensorDict 4 | from habitat_baselines.rl.ver.inference_worker import ( 5 | InferenceWorker, 6 | InferenceWorkerProcess, 7 | ) 8 | from habitat_baselines.rl.ver.worker_common import WorkerBase 9 | 10 | 11 | class InferenceWorkerWithKVProcess(InferenceWorkerProcess): 12 | def _update_storage_no_ver( 13 | self, prev_step: TensorDict, current_step: TensorDict, current_steps 14 | ): 15 | current_step.pop("recurrent_hidden_states") 16 | super()._update_storage_no_ver(prev_step, current_step, current_steps) 17 | 18 | def _update_storage_ver( 19 | self, prev_step: TensorDict, current_step: TensorDict, my_slice 20 | ): 21 | current_step.pop("recurrent_hidden_states") 22 | super()._update_storage_ver(prev_step, current_step, my_slice) 23 | 24 | 25 | class InferenceWorkerWithKV(InferenceWorker): 26 | def __init__(self, mp_ctx: BaseContext, use_kv: bool = True, *args, **kwargs): 27 | if use_kv: 28 | self.setup_queue = mp_ctx.SimpleQueue() 29 | WorkerBase.__init__( 30 | self, 31 | mp_ctx, 32 | InferenceWorkerWithKVProcess, 33 | self.setup_queue, 34 | *args, 35 | **kwargs, 36 | ) 37 | else: 38 | super().__init__(mp_ctx, *args, **kwargs) 39 | -------------------------------------------------------------------------------- /scripts/train/2-imagenav-ver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=imagenav 3 | #SBATCH --output=slurm_logs/imagenav-ver-%j.out 4 | #SBATCH --error=slurm_logs/imagenav-ver-%j.err 5 | #SBATCH --gpus 4 6 | #SBATCH --nodes 1 7 | #SBATCH --cpus-per-task 10 8 | #SBATCH --ntasks-per-node 4 9 | #SBATCH --constraint=a40 10 | #SBATCH --partition=short 11 | #SBATCH --exclude=cheetah,samantha,xaea-12,kitt 12 | #SBATCH --signal=USR1@100 13 | #SBATCH --requeue 14 | 15 | export GLOG_minloglevel=2 16 | export HABITAT_SIM_LOG=quiet 17 | export MAGNUM_LOG=quiet 18 | 19 | MAIN_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1) 20 | export MAIN_ADDR 21 | 22 | source /srv/flash1/rramrakhya3/miniconda3/etc/profile.d/conda.sh 23 | conda deactivate 24 | conda activate ovon 25 | 26 | export PYTHONPATH=/srv/flash1/rramrakhya3/spring_2023/habitat-sim/src_python/ 27 | 28 | TENSORBOARD_DIR="tb/imagenav/ver/resnetclip_avgattnpool/seed_1/" 29 | CHECKPOINT_DIR="data/new_checkpoints/imagenav/ver/resnetclip_avgattnpool/seed_1/" 30 | DATA_PATH="data/datasets/imagenav/hm3d/v1_stretch" 31 | 32 | srun python -um ovon.run \ 33 | --run-type train \ 34 | --exp-config config/experiments/ver_imagenav.yaml \ 35 | habitat_baselines.num_environments=32 \ 36 | habitat_baselines.tensorboard_dir=${TENSORBOARD_DIR} \ 37 | habitat_baselines.checkpoint_folder=${CHECKPOINT_DIR} \ 38 | habitat.dataset.data_path=${DATA_PATH}/train/train.json.gz \ 39 | habitat.simulator.type="OVONSim-v0" \ 40 | habitat_baselines.log_interval=20 \ 41 | -------------------------------------------------------------------------------- 
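[Editor's note — illustrative sketch, not a file from this repository.] The VC1Encoder shown above, and the SigLIP and DINOv2 encoders that follow, all use the same frozen-backbone pattern: read the "rgb" observation as a uint8 NHWC batch, convert to float, permute to NCHW, apply the model's preprocessing transforms, and run the backbone under torch.inference_mode(). The sketch below reproduces that pattern in a self-contained way; DummyBackbone, encode_rgb, and the 0.5/0.5 normalization constants are assumptions made for illustration and do not correspond to any pretrained model or function in this repository.

import torch
import torch.nn as nn
from torchvision import transforms


class DummyBackbone(nn.Module):
    """Stand-in for a frozen pretrained visual backbone; returns a (B, 768) embedding."""

    def __init__(self, output_size: int = 768):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.proj = nn.Linear(3, output_size)
        self.output_shape = (output_size,)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (B, 3, H, W) -> global average pool -> (B, 3) -> (B, output_size)
        return self.proj(self.pool(x).flatten(1))


def encode_rgb(observations: dict, backbone: nn.Module) -> torch.Tensor:
    rgb = observations["rgb"]          # (B, H, W, 3) uint8, as in the encoders above
    if rgb.dtype == torch.uint8:
        rgb = rgb.float() / 255.0      # scale to [0, 1]
    rgb = rgb.permute(0, 3, 1, 2)      # NHWC -> NCHW
    preprocess = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    with torch.inference_mode():       # backbone is frozen; no gradients needed
        return backbone(preprocess(rgb))


if __name__ == "__main__":
    obs = {"rgb": torch.randint(0, 256, (2, 224, 224, 3), dtype=torch.uint8)}
    feats = encode_rgb(obs, DummyBackbone())
    print(feats.shape)  # torch.Size([2, 768])
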
/ovon/models/encoders/siglip_encoder.py: -------------------------------------------------------------------------------- 1 | import timm 2 | import torch 3 | import torch.nn as nn 4 | from habitat_baselines.common.tensor_dict import TensorDict 5 | from torchvision import transforms 6 | 7 | 8 | class SigLIPEncoder(nn.Module): 9 | def __init__(self): 10 | super().__init__() 11 | self.model = timm.create_model( 12 | model_name="vit_base_patch16_siglip_256", 13 | pretrained=True, 14 | num_classes=0, 15 | ) 16 | self.model = self.model.eval() 17 | self.transforms = transforms.Compose( 18 | [ 19 | transforms.Resize( 20 | size=(256, 256), interpolation=transforms.InterpolationMode.BICUBIC 21 | ), 22 | transforms.Normalize( 23 | mean=torch.tensor([0.5000, 0.5000, 0.5000]), 24 | std=torch.tensor([0.5000, 0.5000, 0.5000]), 25 | ), 26 | ] 27 | ) 28 | self.output_size = 768 29 | self.output_shape = (self.output_size,) 30 | 31 | def forward(self, observations: TensorDict, *args, **kwargs) -> torch.Tensor: 32 | rgb = observations["rgb"] 33 | rgb = rgb.permute(0, 3, 1, 2) # NHWC -> NCHW 34 | if rgb.dtype == torch.uint8: 35 | rgb = rgb.float() / 255.0 36 | else: 37 | assert (rgb >= 0.0).all() and (rgb <= 1.0).all() 38 | with torch.inference_mode(): 39 | # Output will be of size Bx3x224x224 40 | x = self.transforms(rgb) 41 | # Embedding will be 1x768 42 | x = self.model(x) 43 | 44 | return x 45 | -------------------------------------------------------------------------------- /ovon/models/encoders/dinov2_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.hub 3 | import torch.nn as nn 4 | from torchvision import transforms 5 | 6 | 7 | class DINOV2Encoder(nn.Module): 8 | def __init__(self, backbone_name="dinov2_vitb14", output_size=768): 9 | super().__init__() 10 | self.model = torch.hub.load( 11 | repo_or_dir="facebookresearch/dinov2", model=backbone_name 12 | ) 13 | self.model.eval() 14 | self.model_transforms = self.make_depth_transform() 15 | self.output_size = output_size 16 | self.output_shape = (self.output_size,) 17 | 18 | def make_depth_transform(self): 19 | return transforms.Compose( 20 | [ 21 | transforms.Normalize( 22 | mean=(123.675, 116.28, 103.53), 23 | std=(58.395, 57.12, 57.375), 24 | ), 25 | ] 26 | ) 27 | 28 | def forward(self, observations: "TensorDict", *args, **kwargs) -> torch.Tensor: 29 | # rgb is a tensor of shape (batch_size, height, width, channels) 30 | rgb = observations["rgb"] 31 | 32 | # Assert that the rgb images are of type uint8 33 | assert rgb.dtype == torch.uint8 34 | 35 | # Assert that the height and width are both 224 36 | assert rgb.shape[1] == 224 and rgb.shape[2] == 224 37 | 38 | rgb = rgb.float() 39 | 40 | # PyTorch models expect the input in (batch, channels, height, width) format 41 | rgb = rgb.permute(0, 3, 1, 2) 42 | 43 | with torch.inference_mode(): 44 | x = self.model(self.model_transforms(rgb)) 45 | 46 | return x 47 | -------------------------------------------------------------------------------- /ovon/task/simulator.py: -------------------------------------------------------------------------------- 1 | import habitat_sim 2 | from habitat.core.registry import registry 3 | from habitat.sims.habitat_simulator.habitat_simulator import HabitatSim 4 | from omegaconf import DictConfig 5 | 6 | 7 | @registry.register_simulator(name="OVONSim-v0") 8 | class OVONSim(HabitatSim): 9 | def __init__(self, config: DictConfig) -> None: 10 | super().__init__(config) 11 | self.navmesh_settings = 
self.load_navmesh_settings() 12 | self.recompute_navmesh( 13 | self.pathfinder, 14 | self.navmesh_settings, 15 | include_static_objects=False, 16 | ) 17 | self.curr_scene_goals = {} 18 | 19 | def load_navmesh_settings(self): 20 | agent_cfg = self.habitat_config.agents.main_agent 21 | navmesh_settings = habitat_sim.NavMeshSettings() 22 | navmesh_settings.set_defaults() 23 | navmesh_settings.agent_height = agent_cfg.height 24 | navmesh_settings.agent_radius = agent_cfg.radius 25 | navmesh_settings.agent_max_climb = ( 26 | self.habitat_config.navmesh_settings.agent_max_climb 27 | ) 28 | navmesh_settings.cell_height = self.habitat_config.navmesh_settings.cell_height 29 | return navmesh_settings 30 | 31 | def reconfigure( 32 | self, 33 | habitat_config: DictConfig, 34 | should_close_on_new_scene: bool = True, 35 | ): 36 | is_same_scene = habitat_config.scene == self._current_scene 37 | super().reconfigure(habitat_config, should_close_on_new_scene) 38 | if not is_same_scene: 39 | self.recompute_navmesh( 40 | self.pathfinder, 41 | self.navmesh_settings, 42 | include_static_objects=False, 43 | ) 44 | self.curr_scene_goals = {} 45 | -------------------------------------------------------------------------------- /ovon/models/encoders/make_encoder.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from gym import spaces 4 | 5 | from ovon.models.encoders.clip_encoder import ResNetCLIPEncoder 6 | from ovon.models.encoders.dinov2_encoder import DINOV2Encoder 7 | from ovon.models.encoders.siglip_encoder import SigLIPEncoder 8 | from ovon.models.encoders.vc1_encoder import VC1Encoder 9 | 10 | POSSIBLE_ENCODERS = [ 11 | "clip_attnpool", 12 | "clip_avgpool", 13 | "clip_avgattnpool", 14 | "vc1", 15 | "dinov2", 16 | "resnet", 17 | "siglip", 18 | ] 19 | 20 | 21 | def make_encoder(backbone: str, observation_space: spaces.Dict) -> Any: 22 | if backbone == "resnet50_clip_avgpool": 23 | backbone = "clip_avgpool" 24 | print("WARNING: resnet50_clip_avgpool is deprecated. Use clip_avgpool instead.") 25 | 26 | assert ( 27 | backbone in POSSIBLE_ENCODERS 28 | ), f"Backbone {backbone} not found. 
Possible encoders: {POSSIBLE_ENCODERS}" 29 | 30 | if "clip" in backbone: 31 | backbone_type = backbone.split("_")[1] 32 | return ResNetCLIPEncoder( 33 | observation_space, 34 | backbone_type=backbone_type, 35 | clip_model="RN50", 36 | ) 37 | elif backbone == "vc1": 38 | return VC1Encoder() 39 | elif backbone == "dinov2": 40 | return DINOV2Encoder() 41 | elif backbone == "siglip": 42 | return SigLIPEncoder() 43 | elif backbone == "resnet": 44 | resnet_baseplanes = 32 45 | from habitat_baselines.rl.ddppo.policy import resnet 46 | from habitat_baselines.rl.ddppo.policy.resnet_policy import ResNetEncoder 47 | 48 | return ResNetEncoder( 49 | observation_space=observation_space, 50 | baseplanes=resnet_baseplanes, 51 | ngroups=resnet_baseplanes // 2, 52 | make_backbone=resnet.resnet50, 53 | ) 54 | -------------------------------------------------------------------------------- /ovon/utils/plot_tsne.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import pandas as pd 6 | import seaborn as sns 7 | from sklearn.manifold import TSNE 8 | 9 | from ovon.utils.utils import load_pickle 10 | 11 | 12 | def plot_tsne(embeddings, output_path): 13 | features = list(embeddings.values()) 14 | categories = list(embeddings.keys()) 15 | 16 | tsne = TSNE(n_components=2, verbose=1, perplexity=20, n_iter=500) 17 | tsne_results = tsne.fit_transform(features) 18 | 19 | df = pd.DataFrame( 20 | { 21 | "tsne-2d-one": tsne_results[:, 0], 22 | "tsne-2d-two": tsne_results[:, 1], 23 | "labels": categories, 24 | } 25 | ) 26 | 27 | pallete_size = np.unique(categories).shape[0] 28 | 29 | colors = sns.color_palette("hls", pallete_size) 30 | color_map = {} 31 | for i in range(len(categories)): 32 | color_map[categories[i]] = colors[i] 33 | 34 | print(color_map) 35 | 36 | plt.figure(figsize=(16, 10)) 37 | sns.scatterplot( 38 | data=df, 39 | x="tsne-2d-one", 40 | y="tsne-2d-two", 41 | hue="labels", 42 | palette=color_map, 43 | legend="full", 44 | s=120, 45 | ) 46 | 47 | plt.savefig(output_path, bbox_inches="tight", dpi=300) 48 | return tsne_results 49 | 50 | 51 | def plot_embeddings(path, output_path): 52 | clip_embeddings = load_pickle(path) 53 | plot_tsne(clip_embeddings, output_path) 54 | 55 | 56 | def main(): 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument("--path", type=str, required=True, help="path to the dataset") 59 | parser.add_argument( 60 | "--output-path", type=str, required=True, help="path to the dataset" 61 | ) 62 | 63 | args = parser.parse_args() 64 | 65 | plot_embeddings(args.path, args.output_path) 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /ovon/measurements/collision_penalty.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any 3 | 4 | from habitat import EmbodiedTask, Measure, Simulator, registry 5 | from habitat.config.default_structured_configs import MeasurementConfig 6 | from hydra.core.config_store import ConfigStore 7 | from omegaconf import DictConfig 8 | 9 | 10 | @registry.register_measure 11 | class CollisionPenalty(Measure): 12 | """ 13 | Returns a penalty value if the robot has collided. 
14 | """ 15 | 16 | cls_uuid: str = "collision_penalty" 17 | 18 | def __init__(self, sim: Simulator, config: "DictConfig", *args: Any, **kwargs: Any): 19 | self._sim = sim 20 | self._config = config 21 | self._collision_penalty = config.collision_penalty 22 | super().__init__() 23 | 24 | def _get_uuid(self, *args: Any, **kwargs: Any) -> str: 25 | return self.cls_uuid 26 | 27 | def reset_metric(self, episode, task, *args: Any, **kwargs: Any): 28 | task.measurements.check_measure_dependencies(self.uuid, ["collisions"]) 29 | self.update_metric(episode=episode, task=task, *args, **kwargs) # type: ignore 30 | 31 | def update_metric(self, episode, task: EmbodiedTask, *args: Any, **kwargs: Any): 32 | collisions = task.measurements.measures["collisions"].get_metric() 33 | collided = collisions is not None and collisions["is_collision"] 34 | if collided: 35 | self._metric = -self._collision_penalty 36 | else: 37 | self._metric = 0 38 | 39 | 40 | @dataclass 41 | class CollisionPenaltyMeasurementConfig(MeasurementConfig): 42 | type: str = CollisionPenalty.__name__ 43 | collision_penalty: float = 0.003 44 | 45 | 46 | cs = ConfigStore.instance() 47 | cs.store( 48 | package=f"habitat.task.measurements.{CollisionPenalty.cls_uuid}", 49 | group="habitat/task/measurements", 50 | name=f"{CollisionPenalty.cls_uuid}", 51 | node=CollisionPenaltyMeasurementConfig, 52 | ) 53 | -------------------------------------------------------------------------------- /ovon/obs_transformers/relabel_imagegoal.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from dataclasses import dataclass 3 | from typing import Dict 4 | 5 | import torch 6 | from gym import spaces 7 | from habitat_baselines.common.baseline_registry import baseline_registry 8 | from habitat_baselines.common.obs_transformers import ObservationTransformer 9 | from habitat_baselines.config.default_structured_configs import ( 10 | ObsTransformConfig, 11 | ) 12 | from hydra.core.config_store import ConfigStore 13 | from omegaconf import DictConfig 14 | 15 | from ovon.task.sensors import ClipImageGoalSensor, ImageGoalRotationSensor 16 | 17 | 18 | @baseline_registry.register_obs_transformer() 19 | class RelabelImageGoal(ObservationTransformer): 20 | """Renames ImageGoalRotationSensor to ClipImageGoalSensor""" 21 | 22 | def transform_observation_space(self, observation_space: spaces.Dict, **kwargs): 23 | assert ImageGoalRotationSensor.cls_uuid in observation_space.spaces 24 | observation_space = copy.deepcopy(observation_space) 25 | observation_space.spaces[ClipImageGoalSensor.cls_uuid] = ( 26 | observation_space.spaces.pop(ImageGoalRotationSensor.cls_uuid) 27 | ) 28 | return observation_space 29 | 30 | @classmethod 31 | def from_config(cls, config: DictConfig): 32 | return cls() 33 | 34 | def forward(self, observations: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: 35 | observations[ClipImageGoalSensor.cls_uuid] = observations.pop( 36 | ImageGoalRotationSensor.cls_uuid 37 | ) 38 | return observations 39 | 40 | 41 | @dataclass 42 | class RelabelImageGoalConfig(ObsTransformConfig): 43 | type: str = RelabelImageGoal.__name__ 44 | 45 | 46 | cs = ConfigStore.instance() 47 | 48 | cs.store( 49 | package="habitat_baselines.rl.policy.obs_transforms.relabel_image_goal", 50 | group="habitat_baselines/rl/policy/obs_transforms", 51 | name="relabel_image_goal", 52 | node=RelabelImageGoalConfig, 53 | ) 54 | -------------------------------------------------------------------------------- 
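[Editor's note — illustrative sketch, not a file from this repository.] RelabelImageGoal above is a thin key-renaming transform: it moves the tensor stored under ImageGoalRotationSensor's uuid to ClipImageGoalSensor's uuid, in both the observation space and each observation batch, so a policy expecting the CLIP image-goal key can consume the rotation-augmented sensor unchanged. The stripped-down sketch below shows the same rename pattern without the habitat-baselines ObservationTransformer machinery; the key names "image_goal_rotation" and "clip_imagegoal" are hypothetical stand-ins for the actual sensor uuids.

import copy
from typing import Dict

import numpy as np
import torch
from gym import spaces


def relabel(observations: Dict[str, torch.Tensor], src: str, dst: str) -> Dict[str, torch.Tensor]:
    # Same trick as RelabelImageGoal.forward(): pop the old key, store under the new one.
    observations[dst] = observations.pop(src)
    return observations


def relabel_space(observation_space: spaces.Dict, src: str, dst: str) -> spaces.Dict:
    # Same trick as transform_observation_space(): deep-copy, then move the sub-space.
    observation_space = copy.deepcopy(observation_space)
    observation_space.spaces[dst] = observation_space.spaces.pop(src)
    return observation_space


if __name__ == "__main__":
    space = spaces.Dict(
        {"image_goal_rotation": spaces.Box(low=0, high=255, shape=(224, 224, 3), dtype=np.uint8)}
    )
    obs = {"image_goal_rotation": torch.zeros(1, 224, 224, 3)}
    print(relabel_space(space, "image_goal_rotation", "clip_imagegoal").spaces.keys())
    print(relabel(obs, "image_goal_rotation", "clip_imagegoal").keys())
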
/ovon/trainers/ppo_trainer_no_2d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from gym import spaces 4 | from habitat_baselines import PPOTrainer 5 | from habitat_baselines.common.baseline_registry import baseline_registry 6 | from habitat_baselines.rl.ddppo.policy import PointNavResNetNet 7 | 8 | from ovon.utils.rollout_storage_no_2d import RolloutStorageNo2D 9 | 10 | 11 | @baseline_registry.register_trainer(name="ddppo_no_2d") 12 | @baseline_registry.register_trainer(name="ppo_no_2d") 13 | class PPONo2DTrainer(PPOTrainer): 14 | def _init_train(self, *args, **kwargs): 15 | super()._init_train(*args, **kwargs) 16 | # Hacky overwriting of existing RolloutStorage with a new one 17 | ppo_cfg = self.config.habitat_baselines.rl.ppo 18 | action_shape = self.rollouts.buffers["actions"].shape[2:] 19 | discrete_actions = self.rollouts.buffers["actions"].dtype == torch.long 20 | batch = self.rollouts.buffers["observations"][0] 21 | 22 | obs_space = spaces.Dict( 23 | { 24 | PointNavResNetNet.PRETRAINED_VISUAL_FEATURES_KEY: spaces.Box( 25 | low=np.finfo(np.float32).min, 26 | high=np.finfo(np.float32).max, 27 | shape=self._encoder.output_shape, 28 | dtype=np.float32, 29 | ), 30 | **self.obs_space.spaces, 31 | } 32 | ) 33 | 34 | self.rollouts = RolloutStorageNo2D( 35 | self.actor_critic.net.visual_encoder, 36 | batch, 37 | ppo_cfg.num_steps, 38 | self.envs.num_envs, 39 | obs_space, 40 | self.policy_action_space, 41 | ppo_cfg.hidden_size, 42 | num_recurrent_layers=self.actor_critic.net.num_recurrent_layers, 43 | is_double_buffered=ppo_cfg.use_double_buffered_sampler, 44 | action_shape=action_shape, 45 | discrete_actions=discrete_actions, 46 | ) 47 | self.rollouts.to(self.device) 48 | -------------------------------------------------------------------------------- /ovon/models/visual_encoders.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from vc_models.models.vit import model_utils 4 | 5 | 6 | class Vc1Wrapper(nn.Module): 7 | """ 8 | Wrapper for the VC1 visual encoder. This will automatically download the model if it's not already. 
9 | """ 10 | 11 | def __init__(self, im_obs_space, model_id=None): 12 | super().__init__() 13 | if model_id is None: 14 | model_id = model_utils.VC1_BASE_NAME 15 | print(f"loading {model_id}.") 16 | ( 17 | self.net, 18 | self.embd_size, 19 | self.model_transforms, 20 | model_info, 21 | ) = model_utils.load_model(model_id) 22 | self._image_obs_keys = im_obs_space.spaces.keys() 23 | 24 | # Count total # of channels 25 | self._n_input_channels = sum( 26 | im_obs_space.spaces[k].shape[2] for k in self._image_obs_keys 27 | ) 28 | 29 | @property 30 | def is_blind(self): 31 | return self._n_input_channels == 0 32 | 33 | @torch.autocast("cuda") 34 | def forward(self, obs): 35 | # Extract tensors that are shape [batch_size, img_width, img_height, img_channels] 36 | feats = [] 37 | imgs = [v for k, v in obs.items() if k in self._image_obs_keys] 38 | for img in imgs: 39 | if img.shape[-1] != 3: 40 | img = torch.concat([img] * 3, dim=-1) 41 | scale_factor = 1.0 42 | else: 43 | scale_factor = 255.0 44 | 45 | img = self.model_transforms( 46 | img.permute(0, 3, 1, 2).contiguous() / scale_factor 47 | ) 48 | 49 | feats.append(self.net(img)) 50 | 51 | if len(feats) == 2: 52 | # feats = (feats[0] + feats[1])/2 53 | feats = torch.concat(feats, dim=-1) 54 | else: 55 | feats = feats[0] 56 | return feats 57 | 58 | @property 59 | def output_shape(self): 60 | return (self.embd_size * len(self._image_obs_keys),) 61 | -------------------------------------------------------------------------------- /scripts/train/1-ovon-ver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=ovon 3 | #SBATCH --output=slurm_logs/ovon-ver-%j.out 4 | #SBATCH --error=slurm_logs/ovon-ver-%j.err 5 | #SBATCH --gpus 4 6 | #SBATCH --nodes 1 7 | #SBATCH --cpus-per-task 10 8 | #SBATCH --ntasks-per-node 4 9 | #SBATCH --constraint=a40 10 | #SBATCH --partition=short 11 | #SBATCH --exclude=cheetah,samantha,xaea-12,kitt 12 | #SBATCH --signal=USR1@100 13 | #SBATCH --requeue 14 | 15 | export GLOG_minloglevel=2 16 | export HABITAT_SIM_LOG=quiet 17 | export MAGNUM_LOG=quiet 18 | 19 | MAIN_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1) 20 | export MAIN_ADDR 21 | 22 | source /srv/flash1/rramrakhya3/miniconda3/etc/profile.d/conda.sh 23 | conda deactivate 24 | conda activate ovon 25 | 26 | export PYTHONPATH=/srv/flash1/rramrakhya3/spring_2023/habitat-sim/src_python/ 27 | 28 | TENSORBOARD_DIR="tb/ovon/ver/resnetclip_rgb_text/seed_2/" 29 | CHECKPOINT_DIR="data/new_checkpoints/ovon/ver/resnetclip_rgb_text/seed_2/" 30 | DATA_PATH="data/datasets/ovon/hm3d/v2" 31 | 32 | srun python -um ovon.run \ 33 | --run-type train \ 34 | --exp-config config/experiments/ver_objectnav.yaml \ 35 | habitat_baselines.trainer_name="ver" \ 36 | habitat_baselines.num_environments=32 \ 37 | habitat_baselines.rl.policy.name=PointNavResNetCLIPPolicy \ 38 | habitat_baselines.rl.ddppo.train_encoder=False \ 39 | habitat_baselines.rl.ddppo.backbone=resnet50_clip_avgpool \ 40 | habitat_baselines.tensorboard_dir=${TENSORBOARD_DIR} \ 41 | habitat_baselines.checkpoint_folder=${CHECKPOINT_DIR} \ 42 | habitat.dataset.data_path=${DATA_PATH}/train/train.json.gz \ 43 | +habitat/task/lab_sensors@habitat.task.lab_sensors.clip_objectgoal_sensor=clip_objectgoal_sensor \ 44 | ~habitat.task.lab_sensors.objectgoal_sensor \ 45 | habitat.task.lab_sensors.clip_objectgoal_sensor.cache=data/clip_embeddings/ovon_stretch_final_cache.pkl \ 46 | habitat.task.measurements.success.success_distance=0.25 \ 47 | 
habitat.dataset.type="OVON-v1" \ 48 | habitat.task.measurements.distance_to_goal.type=OVONDistanceToGoal \ 49 | habitat.simulator.type="OVONSim-v0" 50 | -------------------------------------------------------------------------------- /ovon/measurements/sum_reward.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Any, List 3 | 4 | from habitat import EmbodiedTask, Measure, registry 5 | from habitat.config.default_structured_configs import MeasurementConfig 6 | from hydra.core.config_store import ConfigStore 7 | from omegaconf import DictConfig 8 | 9 | 10 | @registry.register_measure 11 | class SumReward(Measure): 12 | """ 13 | Sums various reward measures. 14 | """ 15 | 16 | cls_uuid: str = "sum_reward" 17 | 18 | def __init__(self, config: "DictConfig", *args: Any, **kwargs: Any): 19 | self._config = config 20 | self._reward_terms = config.reward_terms 21 | self._reward_coefficients = [float(i) for i in config.reward_coefficients] 22 | super().__init__() 23 | 24 | def _get_uuid(self, *args: Any, **kwargs: Any) -> str: 25 | return self.cls_uuid 26 | 27 | def reset_metric(self, episode, task, *args: Any, **kwargs: Any): 28 | task.measurements.check_measure_dependencies(self.uuid, self._reward_terms) 29 | self.update_metric(episode=episode, task=task, *args, **kwargs) # type: ignore 30 | 31 | def update_metric(self, episode, task: EmbodiedTask, *args: Any, **kwargs: Any): 32 | self._metric = 0 33 | for term, coefficient in zip(self._reward_terms, self._reward_coefficients): 34 | self._metric += coefficient * task.measurements.measures[term].get_metric() 35 | 36 | 37 | @dataclass 38 | class SumRewardMeasurementConfig(MeasurementConfig): 39 | type: str = SumReward.__name__ 40 | reward_terms: List[str] = field( 41 | # available options are "disk" and "tensorboard" 42 | default_factory=list 43 | ) 44 | reward_coefficients: List[str] = field( 45 | # available options are "disk" and "tensorboard" 46 | default_factory=list 47 | ) 48 | 49 | 50 | cs = ConfigStore.instance() 51 | cs.store( 52 | package=f"habitat.task.measurements.{SumReward.cls_uuid}", 53 | group="habitat/task/measurements", 54 | name=f"{SumReward.cls_uuid}", 55 | node=SumRewardMeasurementConfig, 56 | ) 57 | -------------------------------------------------------------------------------- /scripts/train/3-ovon-ver-scratch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=ovon 3 | #SBATCH --output=slurm_logs/ovon-ver-%j.out 4 | #SBATCH --error=slurm_logs/ovon-ver-%j.err 5 | #SBATCH --gpus 1 6 | #SBATCH --nodes 1 7 | #SBATCH --cpus-per-task 10 8 | #SBATCH --ntasks-per-node 1 9 | #SBATCH --constraint=a40 10 | #SBATCH --partition=short 11 | #SBATCH --exclude=cheetah,samantha,xaea-12,kitt 12 | #SBATCH --signal=USR1@100 13 | #SBATCH --requeue 14 | 15 | export GLOG_minloglevel=2 16 | export HABITAT_SIM_LOG=quiet 17 | export MAGNUM_LOG=quiet 18 | 19 | MAIN_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1) 20 | export MAIN_ADDR 21 | 22 | source /srv/flash1/rramrakhya3/miniconda3/etc/profile.d/conda.sh 23 | conda deactivate 24 | conda activate ovon 25 | 26 | export PYTHONPATH=/srv/flash1/rramrakhya3/spring_2023/habitat-sim/src_python/ 27 | 28 | TENSORBOARD_DIR="tb/ovon/ver/resnet_scratch_clip_goal/seed_1/" 29 | CHECKPOINT_DIR="data/new_checkpoints/ovon/ver/resnet_scratch_clip_goal/seed_1/" 30 | DATA_PATH="data/datasets/ovon/hm3d/v5_final" 31 | 32 | srun 
python -um ovon.run \ 33 | --run-type train \ 34 | --exp-config config/experiments/ver_objectnav.yaml \ 35 | habitat_baselines.trainer_name="ver" \ 36 | habitat_baselines.num_environments=32 \ 37 | habitat_baselines.rl.policy.name=OVRLPolicy \ 38 | habitat_baselines.rl.ddppo.train_encoder=True \ 39 | habitat_baselines.rl.policy.backbone=resnet50 \ 40 | habitat_baselines.rl.policy.freeze_backbone=False \ 41 | habitat_baselines.tensorboard_dir=${TENSORBOARD_DIR} \ 42 | habitat_baselines.checkpoint_folder=${CHECKPOINT_DIR} \ 43 | habitat.dataset.data_path=${DATA_PATH}/train/train.json.gz \ 44 | +habitat/task/lab_sensors@habitat.task.lab_sensors.clip_objectgoal_sensor=clip_objectgoal_sensor \ 45 | ~habitat.task.lab_sensors.objectgoal_sensor \ 46 | habitat.task.lab_sensors.clip_objectgoal_sensor.cache=data/clip_embeddings/ovon_stretch_final_cache.pkl \ 47 | habitat.task.measurements.success.success_distance=0.25 \ 48 | habitat.dataset.type="OVON-v1" \ 49 | habitat.task.measurements.distance_to_goal.type=OVONDistanceToGoal \ 50 | habitat.simulator.type="OVONSim-v0" 51 | -------------------------------------------------------------------------------- /scripts/train/4-objectnav-transformer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=ovon-tf 3 | #SBATCH --output=slurm_logs/%x-%j.out 4 | #SBATCH --error=slurm_logs/%x-%j.err 5 | #SBATCH --gpus a40:4 6 | #SBATCH --nodes 1 7 | #SBATCH --cpus-per-task 16 8 | #SBATCH --ntasks-per-node 4 9 | #SBATCH --time=48:00:00 10 | #SBATCH --signal=USR1@90 11 | #SBATCH --exclude=crushinator,major,chappie,deebot,xaea-12 12 | #SBATCH --requeue 13 | #SBATCH --partition=cvmlp-lab,overcap 14 | #SBATCH --qos=short 15 | 16 | # Copyright (c) Meta Platforms, Inc. and its affiliates. 17 | # This source code is licensed under the MIT license found in the 18 | # LICENSE file in the root directory of this source tree. 
19 | 20 | export GLOG_minloglevel=2 21 | export MAGNUM_LOG=quiet 22 | export HABITAT_SIM_LOG=quiet 23 | 24 | MAIN_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1) 25 | export MAIN_ADDR 26 | 27 | source /srv/flash1/rramrakhya3/summer_2023/miniconda3/etc/profile.d/conda.sh 28 | conda deactivate 29 | conda activate ovon-v2 30 | 31 | cd /srv/flash1/rramrakhya3/spring_2023/ovon 32 | 33 | export PYTHONPATH= 34 | 35 | TENSORBOARD_DIR="tb/objectnav/ddppo/vc1_llama/seed_5/" 36 | CHECKPOINT_DIR="data/new_checkpoints/objectnav/ddppo/vc1_llama/seed_5/" 37 | DATA_PATH="data/datasets/objectnav/hm3d/v2" 38 | 39 | srun python ovon/run.py --exp-config config/experiments/rl_transformer_hm3d.yaml \ 40 | --run-type train \ 41 | habitat_baselines.checkpoint_folder=$CHECKPOINT_DIR/ \ 42 | habitat_baselines.tensorboard_dir=$TENSORBOARD_DIR \ 43 | habitat_baselines.num_environments=24 \ 44 | habitat_baselines.rl.policy.transformer_config.inter_episodes_attention=False \ 45 | habitat_baselines.rl.policy.transformer_config.add_sequence_idx_embed=False \ 46 | habitat_baselines.rl.policy.transformer_config.reset_position_index=False \ 47 | habitat_baselines.rl.policy.transformer_config.max_position_embeddings=2000 \ 48 | habitat_baselines.rl.policy.transformer_config.n_hidden=1024 \ 49 | habitat.environment.max_episode_steps=500 \ 50 | habitat.dataset.data_path=${DATA_PATH}/train/train.json.gz \ 51 | habitat_baselines.rl.ppo.training_precision="float32" 52 | -------------------------------------------------------------------------------- /ovon/obs_transformers/relabel_teacher_actions.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from dataclasses import dataclass 3 | from typing import Dict 4 | 5 | import torch 6 | from gym import spaces 7 | from habitat_baselines.common.baseline_registry import baseline_registry 8 | from habitat_baselines.common.obs_transformers import ObservationTransformer 9 | from habitat_baselines.config.default_structured_configs import ObsTransformConfig 10 | from hydra.core.config_store import ConfigStore 11 | from omegaconf import DictConfig 12 | 13 | 14 | @baseline_registry.register_obs_transformer() 15 | class RelabelTeacherActions(ObservationTransformer): 16 | """Renames the entry corresponding to the given key string within the observations 17 | dict to 'teacher_label'""" 18 | 19 | TEACHER_LABEL: str = "teacher_label" 20 | 21 | def __init__(self, teacher_label: str): 22 | super().__init__() 23 | self.teacher_label = teacher_label 24 | 25 | def transform_observation_space(self, observation_space: spaces.Dict, **kwargs): 26 | assert ( 27 | self.teacher_label in observation_space.spaces 28 | ), f"Teacher action key {self.teacher_label} not in observation space!" 
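        # Work on a deep copy so the caller's observation space is not mutated, then
        # move the teacher's entry to the fixed TEACHER_LABEL key.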
29 | observation_space = copy.deepcopy(observation_space) 30 | observation_space.spaces[self.TEACHER_LABEL] = observation_space.spaces.pop( 31 | self.teacher_label 32 | ) 33 | return observation_space 34 | 35 | @classmethod 36 | def from_config(cls, config: DictConfig): 37 | return cls(config.teacher_label) 38 | 39 | def forward(self, observations: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: 40 | observations[self.TEACHER_LABEL] = observations.pop(self.teacher_label) 41 | return observations 42 | 43 | 44 | @dataclass 45 | class RelabelTeacherActionsConfig(ObsTransformConfig): 46 | type: str = RelabelTeacherActions.__name__ 47 | teacher_label: str = "" 48 | 49 | 50 | cs = ConfigStore.instance() 51 | 52 | cs.store( 53 | package="habitat_baselines.rl.policy.obs_transforms.relabel_teacher_actions", 54 | group="habitat_baselines/rl/policy/obs_transforms", 55 | name="relabel_teacher_actions", 56 | node=RelabelTeacherActionsConfig, 57 | ) 58 | -------------------------------------------------------------------------------- /scripts/eval/4-objectnav-transformer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=ovon 3 | #SBATCH --output=slurm_logs/eval/ovon-tf-%j.out 4 | #SBATCH --error=slurm_logs/eval/ovon-tf-%j.err 5 | #SBATCH --gpus a40:1 6 | #SBATCH --nodes 1 7 | #SBATCH --cpus-per-task 6 8 | #SBATCH --ntasks-per-node 1 9 | #SBATCH --partition=cvmlp-lab,overcap 10 | #SBATCH --qos=short 11 | #SBATCH --signal=USR1@100 12 | 13 | export GLOG_minloglevel=2 14 | export HABITAT_SIM_LOG=quiet 15 | export MAGNUM_LOG=quiet 16 | 17 | MAIN_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1) 18 | export MAIN_ADDR 19 | 20 | source /srv/flash1/rramrakhya3/summer_2023/miniconda3/etc/profile.d/conda.sh 21 | conda deactivate 22 | conda activate ovon-v2 23 | 24 | export PYTHONPATH= #/srv/flash1/rramrakhya3/spring_2023/habitat-sim/src_python/ 25 | 26 | DATA_PATH="data/datasets/objectnav/hm3d/v2/" 27 | eval_ckpt_path_dir="data/new_checkpoints/objectnav/ddppo/vc1_llama/seed_3/ckpt.0.pth" 28 | tensorboard_dir="tb/objectnav/ddppo/vc1_llama/seed_2/eval_debug/" 29 | split="val" 30 | 31 | echo "Evaluating ckpt: ${eval_ckpt_path_dir}" 32 | echo "Data path: ${DATA_PATH}/${split}/${split}.json.gz" 33 | 34 | python -um ovon.run \ 35 | --run-type eval \ 36 | --exp-config config/experiments/rl_transformer_hm3d.yaml \ 37 | -cvt \ 38 | habitat_baselines.trainer_name=transformer_ddppo \ 39 | habitat_baselines.tensorboard_dir=$tensorboard_dir \ 40 | habitat_baselines.eval_ckpt_path_dir=$eval_ckpt_path_dir \ 41 | habitat_baselines.checkpoint_folder=$eval_ckpt_path_dir \ 42 | habitat.dataset.data_path="${DATA_PATH}/${split}/${split}.json.gz" \ 43 | habitat_baselines.num_environments=2 \ 44 | habitat_baselines.rl.ppo.training_precision="float32" \ 45 | habitat_baselines.rl.policy.transformer_config.inter_episodes_attention=False \ 46 | habitat_baselines.rl.policy.transformer_config.add_sequence_idx_embed=False \ 47 | habitat_baselines.rl.policy.transformer_config.reset_position_index=False \ 48 | habitat_baselines.rl.policy.transformer_config.max_position_embeddings=2000 \ 49 | habitat.environment.max_episode_steps=500 \ 50 | habitat.dataset.data_path="data/datasets/objectnav/hm3d/v2/val/val.json.gz" \ 51 | habitat.dataset.split=val \ 52 | habitat_baselines.eval.use_ckpt_config=False \ 53 | habitat_baselines.load_resume_state_config=False \ 54 | habitat.simulator.habitat_sim_v0.allow_sliding=False \ 55 | 
habitat_baselines.eval.split=$split 56 | -------------------------------------------------------------------------------- /ovon/utils/rollout_storage_no_2d.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from habitat_baselines import RolloutStorage 4 | from habitat_baselines.common.tensor_dict import TensorDict 5 | from habitat_baselines.rl.ddppo.policy import PointNavResNetNet 6 | from torch import nn 7 | 8 | 9 | class RolloutStorageNo2D(RolloutStorage): 10 | """RolloutStorage variant that will store visual features extracted using a given 11 | visual encoder instead of raw images to save space.""" 12 | 13 | buffers: TensorDict 14 | visual_encoder: nn.Module 15 | 16 | def __init__( 17 | self, visual_encoder, initial_obs: Optional[TensorDict] = None, *args, **kwargs 18 | ): 19 | super().__init__(*args, **kwargs) 20 | # Remove any 2D observations from the rollout storage buffer 21 | delete_keys = [] 22 | for sensor in self.buffers["observations"]: 23 | if self.buffers["observations"][sensor].dim() >= 4: # NCHW -> 4 dims 24 | delete_keys.append(sensor) 25 | for k in delete_keys: 26 | del self.buffers["observations"][k] 27 | self.visual_encoder = visual_encoder 28 | if initial_obs is not None: 29 | self.buffers["observations"][0] = self.filter_obs(initial_obs) 30 | 31 | def filter_obs(self, obs: TensorDict) -> TensorDict: 32 | filtered_obs = TensorDict() 33 | for sensor in obs: 34 | # Filter out 2D observations 35 | if obs[sensor].dim() < 4: 36 | filtered_obs[sensor] = obs[sensor] 37 | # Extract visual features from 2D observations 38 | filtered_obs[PointNavResNetNet.PRETRAINED_VISUAL_FEATURES_KEY] = ( 39 | self.visual_encoder(obs) 40 | ) 41 | return filtered_obs 42 | 43 | def insert( 44 | self, 45 | next_observations=None, 46 | next_recurrent_hidden_states=None, 47 | actions=None, 48 | action_log_probs=None, 49 | value_preds=None, 50 | rewards=None, 51 | next_masks=None, 52 | buffer_index: int = 0, 53 | ): 54 | if next_observations is not None: 55 | filtered_next_observations = self.filter_obs(next_observations) 56 | else: 57 | filtered_next_observations = None 58 | super().insert( # noqa 59 | filtered_next_observations, 60 | next_recurrent_hidden_states, 61 | actions, 62 | action_log_probs, 63 | value_preds, 64 | rewards, 65 | next_masks, 66 | buffer_index, 67 | ) 68 | -------------------------------------------------------------------------------- /ovon/models/encoders/cma_xattn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | 6 | 7 | class CrossModalAttention(nn.Module): 8 | def __init__( 9 | self, text_embedding_dim: int, rgb_embedding_dim: int, hidden_size: int = 512 10 | ) -> None: 11 | super().__init__() 12 | 13 | # Linear transformation for the text query and RGB key-value pairs 14 | self.text_q = nn.Linear(text_embedding_dim, hidden_size // 2) 15 | self.rgb_kv = nn.Conv1d(rgb_embedding_dim, hidden_size, 1) 16 | 17 | # Scale for attention 18 | self.register_buffer("_scale", torch.tensor(1.0 / ((hidden_size // 2) ** 0.5))) 19 | 20 | self._hidden_size = hidden_size 21 | 22 | @property 23 | def output_size(self) -> int: 24 | return self._hidden_size // 2 25 | 26 | def _attn(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor: 27 | logits = torch.einsum("nc, nci -> ni", q, k) 28 | attn = F.softmax(logits * self._scale, dim=1) 29 | return torch.einsum("ni, nci -> nc", 
attn, v) 30 | 31 | def forward(self, text_embedding: Tensor, rgb_embedding: Tensor) -> Tensor: 32 | """ 33 | Args: 34 | text_embedding: [batch_size, text_embedding_dim] tensor (language) 35 | rgb_embedding: [batch_size, rgb_embedding_dim] tensor (visual) 36 | 37 | Returns: 38 | [batch_size, embed_dim] tensor 39 | """ 40 | # Reshape rgb_embedding tensor to [batch_size, rgb_embedding_dim, 1] 41 | rgb_embedding_reshaped = rgb_embedding.unsqueeze(2) 42 | rgb_kv = self.rgb_kv(rgb_embedding_reshaped) 43 | rgb_k, rgb_v = torch.split(rgb_kv, self.text_q.out_features, dim=1) 44 | text_q = self.text_q(text_embedding) 45 | rgb_embedding = self._attn(text_q, rgb_k, rgb_v) 46 | 47 | return rgb_embedding 48 | 49 | 50 | if __name__ == "__main__": 51 | # Define embedding dimensions 52 | text_embedding_dim = 1024 53 | rgb_embedding_dim = 1024 54 | 55 | # Instantiate the model 56 | model = CrossModalAttention(text_embedding_dim, rgb_embedding_dim, hidden_size=512) 57 | print(f"Model: \n{model}\n") 58 | 59 | # Generate random embeddings 60 | batch_size = 1 61 | text_embedding = torch.rand(batch_size, text_embedding_dim) 62 | rgb_embedding = torch.rand(batch_size, rgb_embedding_dim) 63 | 64 | # Pass embeddings through the model 65 | output = model(text_embedding, rgb_embedding) 66 | 67 | # Print output 68 | print(f"Output Shape: {output.shape}") 69 | -------------------------------------------------------------------------------- /ovon/utils/visualize_policy_embeddings.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | import torch 5 | from gym import spaces 6 | 7 | from ovon.models.clip_policy import PointNavResNetCLIPPolicy 8 | from ovon.utils.plot_tsne import plot_tsne 9 | from ovon.utils.utils import save_pickle 10 | 11 | 12 | def visualize_policy_embeddings(checkpoint_path, output_path): 13 | checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu")) 14 | print(checkpoint.keys()) 15 | 16 | OBJECT_MAPPING = { 17 | "chair": 0, 18 | "bed": 1, 19 | "plant": 2, 20 | "toilet": 3, 21 | "tv_monitor": 4, 22 | "sofa": 5, 23 | } 24 | 25 | h, w = ( 26 | 480, 27 | 640, 28 | ) 29 | 30 | observation_space = { 31 | "compass": spaces.Box(low=-np.pi, high=np.pi, shape=(1,), dtype=np.float32), 32 | "gps": spaces.Box( 33 | low=np.finfo(np.float32).min, 34 | high=np.finfo(np.float32).max, 35 | shape=(2,), 36 | dtype=np.float32, 37 | ), 38 | "rgb": spaces.Box( 39 | low=0, 40 | high=255, 41 | shape=(h, w, 3), 42 | dtype=np.uint8, 43 | ), 44 | "objectgoal": spaces.Box( 45 | low=0, high=len(OBJECT_MAPPING) - 1, shape=(1,), dtype=np.int64 46 | ), 47 | } 48 | 49 | observation_space = spaces.Dict(observation_space) 50 | 51 | action_space = spaces.Discrete(6) 52 | 53 | policy = PointNavResNetCLIPPolicy.from_config( 54 | checkpoint["config"], observation_space, action_space 55 | ) 56 | policy.load_state_dict( 57 | {k.replace("actor_critic.", ""): v for k, v in checkpoint["state_dict"].items()} 58 | ) 59 | 60 | policy.eval() 61 | print("Policy initialized....") 62 | 63 | embeddings = {} 64 | for cat, cat_id in OBJECT_MAPPING.items(): 65 | out = policy.net.obj_categories_embedding(torch.tensor(cat_id)) 66 | embeddings[cat] = out.detach().numpy() 67 | 68 | save_pickle(embeddings, "data/clip_embeddings/hm3d_onehot.pkl") 69 | plot_tsne(embeddings, output_path) 70 | 71 | 72 | if __name__ == "__main__": 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument( 75 | "--checkpoint", 76 | type=str, 77 | required=True, 78 | help="path to 
the habitat baselines file", 79 | ) 80 | parser.add_argument( 81 | "--output-path", type=str, required=True, help="path to the TSNE plot" 82 | ) 83 | 84 | args = parser.parse_args() 85 | visualize_policy_embeddings(args.checkpoint, args.output_path) 86 | -------------------------------------------------------------------------------- /scripts/eval/3-ovon-ver-eval-analysis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=ovon 3 | #SBATCH --output=slurm_logs/eval/ovon-ver-%j.out 4 | #SBATCH --error=slurm_logs/eval/ovon-ver-%j.err 5 | #SBATCH --gpus 1 6 | #SBATCH --nodes 1 7 | #SBATCH --cpus-per-task 10 8 | #SBATCH --ntasks-per-node 1 9 | #SBATCH --constraint="a40|rtx_6000" 10 | #SBATCH --partition=short 11 | #SBATCH --exclude=cheetah,samantha,xaea-12,kitt,calculon,vicki,neo,kipp,ripl-s1,tars 12 | #SBATCH --signal=USR1@100 13 | 14 | export GLOG_minloglevel=2 15 | export HABITAT_SIM_LOG=quiet 16 | export MAGNUM_LOG=quiet 17 | 18 | MAIN_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1) 19 | export MAIN_ADDR 20 | 21 | source /srv/flash1/rramrakhya3/miniconda3/etc/profile.d/conda.sh 22 | conda deactivate 23 | conda activate ovon 24 | 25 | export PYTHONPATH=/srv/flash1/rramrakhya3/spring_2023/habitat-sim/src_python/ 26 | 27 | DATA_PATH="data/datasets/ovon/hm3d/v5_final/" 28 | eval_ckpt_path_dir="data/new_checkpoints/ovon/ver/resnetclip_rgb_text/seed_1/ckpt.80.pth" 29 | tensorboard_dir="tb/ovon/ver/resnetclip_rgb_text/seed_1/eval_val_seen_debug/" 30 | split="val_seen" 31 | OVON_EPISODES_JSON="data/analysis/episode_metrics/rl_ckpt_80_additional_metrics_${split}.json" 32 | 33 | echo "Evaluating ckpt: ${eval_ckpt_path_dir}" 34 | echo "Data path: ${DATA_PATH}/${split}/${split}.json.gz" 35 | 36 | srun python -um ovon.run \ 37 | --run-type eval \ 38 | --exp-config config/experiments/ver_objectnav.yaml \ 39 | -cvt \ 40 | habitat_baselines.num_environments=2 \ 41 | habitat_baselines.test_episode_count=4 \ 42 | habitat_baselines.trainer_name=ver_pirlnav \ 43 | habitat_baselines.rl.policy.name=PointNavResNetCLIPPolicy \ 44 | habitat_baselines.tensorboard_dir=$tensorboard_dir \ 45 | habitat_baselines.eval_ckpt_path_dir=$eval_ckpt_path_dir \ 46 | habitat_baselines.checkpoint_folder=$eval_ckpt_path_dir \ 47 | habitat.dataset.data_path="${DATA_PATH}/${split}/${split}.json.gz" \ 48 | +habitat/task/lab_sensors@habitat.task.lab_sensors.clip_objectgoal_sensor=clip_objectgoal_sensor \ 49 | habitat.task.lab_sensors.clip_objectgoal_sensor.cache=data/clip_embeddings/ovon_stretch_cache.pkl \ 50 | habitat.task.measurements.success.success_distance=0.25 \ 51 | habitat.dataset.type="OVON-v1" \ 52 | habitat.task.measurements.distance_to_goal.type=OVONDistanceToGoal \ 53 | +habitat/task/measurements@habitat.task.measurements.ovon_object_goal_id=ovon_object_goal_id \ 54 | +habitat/task/measurements@habitat.task.measurements.failure_modes=failure_modes \ 55 | habitat.simulator.type="OVONSim-v0" \ 56 | habitat_baselines.eval.use_ckpt_config=False \ 57 | habitat_baselines.load_resume_state_config=False \ 58 | habitat.simulator.habitat_sim_v0.allow_sliding=False \ 59 | habitat_baselines.eval.split=$split 60 | 61 | touch $checkpoint_counter 62 | -------------------------------------------------------------------------------- /ovon/models/encoders/depth_encoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from gym import spaces 4 | from gym.spaces import Dict 
as SpaceDict 5 | from habitat_baselines.rl.ddppo.policy.resnet import resnet18 6 | from habitat_baselines.rl.ddppo.policy.resnet_policy import ResNetEncoder 7 | from torch import nn as nn 8 | 9 | 10 | class ResNet18DepthEncoder(nn.Module): 11 | def __init__(self, depth_encoder, visual_fc): 12 | super().__init__() 13 | self.encoder = depth_encoder 14 | self.visual_fc = visual_fc 15 | 16 | def forward(self, x): 17 | x = self.encoder(x) 18 | x = self.visual_fc(x) 19 | return x 20 | 21 | def load_state_dict(self, state_dict, strict: bool = True): 22 | # TODO: allow dicts trained with both attn and avg pool to be loaded 23 | ignore_attnpool = False 24 | if ignore_attnpool: 25 | pass 26 | return super().load_state_dict(state_dict, strict=strict) 27 | 28 | 29 | def copy_depth_encoder(depth_ckpt): 30 | """ 31 | Returns an encoder that stacks the encoder and visual_fc of the provided 32 | depth checkpoint 33 | :param depth_ckpt: path to a resnet18 depth pointnav policy 34 | :return: nn.Module representing the backbone of the depth policy 35 | """ 36 | # Initialize encoder and fc layers 37 | base_planes = 32 38 | ngroups = 32 39 | spatial_size = 128 40 | 41 | observation_space = SpaceDict( 42 | { 43 | "depth": spaces.Box( 44 | low=0.0, high=1.0, shape=(256, 256, 1), dtype=np.float32 45 | ), 46 | } 47 | ) 48 | depth_encoder = ResNetEncoder( 49 | observation_space, 50 | base_planes, 51 | ngroups, 52 | spatial_size, 53 | make_backbone=resnet18, 54 | ) 55 | 56 | flat_output_shape = 2048 57 | hidden_size = 512 58 | visual_fc = nn.Sequential( 59 | nn.Flatten(), 60 | nn.Linear(flat_output_shape, hidden_size), 61 | nn.ReLU(True), 62 | ) 63 | 64 | pretrained_state = torch.load(depth_ckpt, map_location="cpu") 65 | 66 | # Load weights into depth encoder 67 | depth_encoder_state_dict = { 68 | k[len("actor_critic.net.visual_encoder.") :]: v 69 | for k, v in pretrained_state["state_dict"].items() 70 | if k.startswith("actor_critic.net.visual_encoder.") 71 | } 72 | depth_encoder.load_state_dict(depth_encoder_state_dict) 73 | 74 | # Load weights in fc layers 75 | visual_fc_state_dict = { 76 | k[len("actor_critic.net.visual_fc.") :]: v 77 | for k, v in pretrained_state["state_dict"].items() 78 | if k.startswith("actor_critic.net.visual_fc.") 79 | } 80 | visual_fc.load_state_dict(visual_fc_state_dict) 81 | 82 | modified_depth_encoder = ResNet18DepthEncoder(depth_encoder, visual_fc) 83 | 84 | return modified_depth_encoder 85 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "ovon" 7 | version = "0.1" 8 | description = "open-vocab-objectnav" 9 | authors = [ 10 | {name = "Naoki Yokoyama", email = "naokiyokoyama@github.com"}, 11 | ] 12 | readme = "README.md" 13 | requires-python = ">=3.7" 14 | 15 | [project.optional-dependencies] 16 | dev = [ 17 | "pre-commit >= 2.21.0", 18 | ] 19 | 20 | [project.urls] 21 | "Homepage" = "github.com" 22 | "GitHub" = "https://github.com/naokiyokoyama/ovon" 23 | 24 | [tool.setuptools] 25 | packages = ["ovon"] 26 | 27 | [tool.ruff] 28 | # Enable pycodestyle (`E`), Pyflakes (`F`), and import sorting (`I`) 29 | select = ["E", "F", "I"] 30 | ignore = [] 31 | 32 | # Allow autofix for all enabled rules (when `--fix`) is provided. 
33 | fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] 34 | unfixable = [] 35 | 36 | # Exclude a variety of commonly ignored directories. 37 | exclude = [ 38 | ".bzr", 39 | ".direnv", 40 | ".eggs", 41 | ".git", 42 | ".hg", 43 | ".mypy_cache", 44 | ".nox", 45 | ".pants.d", 46 | ".pytype", 47 | ".ruff_cache", 48 | ".svn", 49 | ".tox", 50 | ".venv", 51 | "__pypackages__", 52 | "_build", 53 | "buck-out", 54 | "build", 55 | "dist", 56 | "node_modules", 57 | "venv", 58 | "docker/ros", 59 | ] 60 | 61 | # Same as Black. 62 | line-length = 88 63 | 64 | # Allow unused variables when underscore-prefixed. 65 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 66 | 67 | # Assume Python 3.9. 68 | target-version = "py39" 69 | 70 | [tool.ruff.per-file-ignores] 71 | "__init__.py" = ["F401"] 72 | 73 | [tool.ruff.mccabe] 74 | # Unlike Flake8, default to a complexity level of 10. 75 | max-complexity = 10 76 | 77 | [tool.black] 78 | line-length = 88 79 | target-version = ['py39'] 80 | include = '\.pyi?$' 81 | # `extend-exclude` is not honored when `black` is passed a file path explicitly, 82 | # as is typical when `black` is invoked via `pre-commit`. 83 | force-exclude = ''' 84 | /( 85 | docker/ros/.* 86 | )/ 87 | ''' 88 | 89 | preview = true 90 | 91 | # mypy configuration 92 | [tool.mypy] 93 | python_version = "3.9" 94 | disallow_untyped_defs = true 95 | ignore_missing_imports = true 96 | explicit_package_bases = true 97 | check_untyped_defs = true 98 | strict_equality = true 99 | warn_unreachable = true 100 | warn_redundant_casts = true 101 | no_implicit_optional = true 102 | files = ['ovon'] 103 | exclude = '^(docker|.*external|.*thirdparty|.*install|.*build|.*_experimental)/' 104 | -------------------------------------------------------------------------------- /ovon/trainers/ver_rollout_storage_with_kv.py: -------------------------------------------------------------------------------- 1 | from typing import Iterator, Optional 2 | 3 | import torch 4 | from habitat_baselines.common.tensor_dict import DictTree, TensorDict 5 | from habitat_baselines.rl.ver.ver_rollout_storage import ( 6 | VERRolloutStorage, 7 | generate_ver_mini_batches, 8 | ) 9 | 10 | 11 | def build_rnn_build_seq_info( 12 | device: torch.device, 13 | episode_ids, 14 | ) -> TensorDict: 15 | rnn_build_seq_info = TensorDict() 16 | rnn_build_seq_info["episode_ids"] = ( 17 | torch.from_numpy(episode_ids).to(device=device).reshape(-1, 1) 18 | ) 19 | 20 | return rnn_build_seq_info 21 | 22 | 23 | class VERRolloutStorageWithKVCache(VERRolloutStorage): 24 | def __init__( 25 | self, 26 | num_layers: int, 27 | num_heads: int, 28 | max_context_length: int, 29 | head_dim: int, 30 | *args, 31 | **kwargs, 32 | ): 33 | super().__init__(*args, **kwargs) 34 | self._aux_buffers["next_hidden_states"] = torch.zeros( 35 | self._num_envs, 36 | num_layers, 37 | 2, # key, value 38 | num_heads, 39 | max_context_length - 1, 40 | head_dim, 41 | device=self.buffers["recurrent_hidden_states"].device, 42 | ) 43 | 44 | self._set_aux_buffers() 45 | 46 | def recurrent_generator( 47 | self, 48 | advantages: Optional[torch.Tensor], 49 | num_mini_batch: int, 50 | ) -> Iterator[DictTree]: 51 | """ 52 | An exact copy of the recurrent_generator method from the parent class, the only 53 | difference in behaviour is: 54 | - The new 
definition of build_rnn_build_seq_info (above) is used instead. 55 | - "recurrent_hidden_states" is not set 56 | """ 57 | if not self.variable_experience: 58 | yield from super().recurrent_generator(advantages, num_mini_batch) 59 | else: 60 | for mb_inds in generate_ver_mini_batches( 61 | num_mini_batch, 62 | self.sequence_lengths, 63 | self.num_seqs_at_step, 64 | self.select_inds, 65 | self.last_sequence_in_batch_mask, 66 | self.episode_ids_cpu, 67 | ): 68 | mb_inds_cpu = torch.from_numpy(mb_inds) 69 | mb_inds = mb_inds_cpu.to(device=self.device) 70 | 71 | if not self.variable_experience: 72 | batch = self.buffers.map(lambda t: t.flatten(0, 1))[mb_inds] 73 | if advantages is not None: 74 | batch["advantages"] = advantages.flatten(0, 1)[mb_inds] 75 | else: 76 | batch = self.buffers[mb_inds] 77 | if advantages is not None: 78 | batch["advantages"] = advantages[mb_inds] 79 | 80 | batch["rnn_build_seq_info"] = build_rnn_build_seq_info( 81 | device=self.device, 82 | episode_ids=self.episode_ids_cpu[mb_inds_cpu], 83 | ) 84 | 85 | yield batch.to_tree() 86 | -------------------------------------------------------------------------------- /config/experiments/rnn_rl.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - /habitat_baselines: habitat_baselines_rl_config_base 5 | - /habitat_baselines/rl/policy/obs_transforms: 6 | - resize 7 | - objectnav_stretch_hm3d 8 | - override /habitat/task/lab_sensors: 9 | - clip_objectgoal_sensor 10 | - _self_ 11 | 12 | habitat: 13 | environment: 14 | iterator_options: 15 | max_scene_repeat_steps: 50000 16 | task: 17 | success_reward: 2.5 18 | slack_reward: -1e-3 19 | lab_sensors: 20 | clip_objectgoal_sensor: 21 | cache: data/text_embeddings/siglip.pkl 22 | measurements: 23 | success: 24 | success_distance: 0.25 25 | distance_to_goal: 26 | type: OVONDistanceToGoal 27 | dataset: 28 | type: "OVON-v1" 29 | split: train 30 | data_path: data/datasets/ovon/hm3d/v1/{split}/{split}.json.gz 31 | simulator: 32 | type: "OVONSim-v0" 33 | navmesh_settings: 34 | agent_max_climb: 0.1 35 | cell_height: 0.05 36 | 37 | habitat_baselines: 38 | torch_gpu_id: 0 39 | tensorboard_dir: "tb" 40 | video_dir: "video_dir" 41 | test_episode_count: -1 42 | eval_ckpt_path_dir: "data/new_checkpoints" 43 | num_environments: 32 44 | checkpoint_folder: "data/new_checkpoints" 45 | trainer_name: "ver" 46 | num_updates: -1 47 | total_num_steps: 150000000 48 | log_interval: 10 49 | num_checkpoints: 50 50 | # Force PyTorch to be single threaded as 51 | # this improves performance considerably 52 | force_torch_single_threaded: True 53 | 54 | eval: 55 | split: "val" 56 | 57 | rl: 58 | 59 | policy: 60 | name: "PointNavResNetCLIPPolicy" 61 | backbone: "siglip" 62 | fusion_type: "concat" 63 | use_vis_query: True 64 | use_residual: True 65 | residual_vision: True 66 | rgb_only: False 67 | 68 | ppo: 69 | # ppo params 70 | clip_param: 0.2 71 | ppo_epoch: 1 72 | num_mini_batch: 2 73 | value_loss_coef: 0.5 74 | entropy_coef: 0.01 75 | lr: 2.5e-4 76 | eps: 1e-5 77 | max_grad_norm: 0.2 78 | num_steps: 100 79 | use_gae: True 80 | gamma: 0.99 81 | tau: 0.95 82 | use_linear_clip_decay: False 83 | use_linear_lr_decay: False 84 | reward_window_size: 50 85 | 86 | use_normalized_advantage: False 87 | 88 | hidden_size: 1024 89 | 90 | ddppo: 91 | sync_frac: 0.6 92 | # The PyTorch distributed backend to use 93 | distrib_backend: NCCL 94 | # Visual encoder backbone 95 | pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth 96 | # 
Initialize with pretrained weights 97 | pretrained: False 98 | # Initialize just the visual encoder backbone with pretrained weights 99 | pretrained_encoder: False 100 | # Whether or not the visual encoder backbone will be trained. 101 | train_encoder: False 102 | # Whether or not to reset the critic linear layer 103 | reset_critic: True 104 | 105 | # Model parameters 106 | rnn_type: LSTM 107 | num_recurrent_layers: 4 108 | -------------------------------------------------------------------------------- /ovon/utils/visualize/statistics.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import math 4 | import os 5 | 6 | import matplotlib.pyplot as plt 7 | import pandas as pd 8 | import seaborn as sns 9 | from tqdm import tqdm 10 | 11 | from ovon.utils.utils import load_dataset 12 | 13 | 14 | def plot_statistics(path, output_path, split="train"): 15 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 16 | 17 | geodesic_distances = [] 18 | euclidean_distances = [] 19 | files = glob.glob(os.path.join(path, "*json.gz")) 20 | 21 | categories = {} 22 | for f in tqdm(files): 23 | dataset = load_dataset(f) 24 | for ep in dataset["episodes"]: 25 | geodesic_distances.append(ep["info"]["geodesic_distance"]) 26 | euclidean_distances.append(ep["info"]["euclidean_distance"]) 27 | categories[ep["object_category"]] = ( 28 | categories.get(ep["object_category"], 0) + 1 29 | ) 30 | 31 | # Plot distances for visualization 32 | plt.figure(figsize=(8, 8)) 33 | hist_data = list(filter(math.isfinite, geodesic_distances)) 34 | hist_data = pd.DataFrame.from_dict({"Geodesic distance": hist_data}) 35 | ax = sns.histplot(data=hist_data, x="Geodesic distance") 36 | 37 | ax.set_xticks(range(0, 32, 2)) 38 | 39 | plt.title("Geodesic distance to closest goal") 40 | plt.tight_layout() 41 | plt.savefig( 42 | os.path.join(output_path, "ovon_{}_geodesic_distances.png".format(split)) 43 | ) 44 | 45 | plt.figure(figsize=(8, 8)) 46 | hist_data = list( 47 | filter( 48 | math.isfinite, 49 | [g / e for g, e in zip(geodesic_distances, euclidean_distances)], 50 | ) 51 | ) 52 | hist_data = pd.DataFrame.from_dict({"Euc Geo ratio": hist_data}) 53 | ax = sns.histplot(data=hist_data, x="Euc Geo ratio") 54 | 55 | ax.set_ylim(0, 5000) 56 | ax.set_xlim(0, 5) 57 | ax.set_xticks(range(0, 5, 1)) 58 | 59 | plt.title("Euc Geo ratio") 60 | plt.tight_layout() 61 | plt.savefig(os.path.join(output_path, "ovon_{}_euc_geo_ratio.png".format(split))) 62 | 63 | categories = { 64 | "objects": list(categories.keys()), 65 | "frequency": list(categories.values()), 66 | } 67 | 68 | df = pd.DataFrame.from_dict(categories) 69 | df.sort_values(by="frequency", inplace=True, ascending=False) 70 | print(df.columns) 71 | 72 | fig, axs = plt.subplots(1, 1, figsize=(8, 50)) 73 | 74 | sns.barplot(data=df, x="frequency", y="objects", ax=axs) 75 | 76 | fig.savefig( 77 | os.path.join(output_path, "ovon_{}_categories.png".format(split)), 78 | dpi=100, 79 | bbox_inches="tight", 80 | pad_inches=0.1, 81 | transparent=False, 82 | ) 83 | 84 | 85 | if __name__ == "__main__": 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument( 88 | "--path", 89 | type=str, 90 | default="data/datasets/ovon/hm3d/v4_stretch/val_seen/content/", 91 | ) 92 | parser.add_argument("--output-path", type=str, default="val_unseen.png") 93 | parser.add_argument("--split", type=str, default="train") 94 | args = parser.parse_args() 95 | 96 | plot_statistics(args.path, args.output_path, args.split) 97 | 
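# Example invocation (paths are hypothetical; assumes the episode .json.gz files
# live under <split>/content/). Writes the geodesic-distance histogram, the
# geodesic/euclidean distance-ratio histogram, and the category-frequency bar plot
# to --output-path:
#
#   python ovon/utils/visualize/statistics.py \
#       --path data/datasets/ovon/hm3d/v1/train/content/ \
#       --output-path data/analysis/plots/ \
#       --split train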
-------------------------------------------------------------------------------- /ovon/models/encoders/visual_encoder.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import numpy as np 4 | import torch 5 | from habitat_baselines.rl.ddppo.policy.running_mean_and_var import ( 6 | RunningMeanAndVar, 7 | ) 8 | from torch import nn as nn 9 | from torch.nn import functional as F 10 | 11 | from ovon.models.encoders import resnet_gn as resnet 12 | 13 | 14 | class VisualEncoder(nn.Module): 15 | def __init__( 16 | self, 17 | image_size: int, 18 | backbone: str, 19 | input_channels: int = 3, 20 | resnet_baseplanes: int = 32, 21 | resnet_ngroups: int = 32, 22 | normalize_visual_inputs: bool = True, 23 | avgpooled_image: bool = False, 24 | drop_path_rate: float = 0.0, 25 | visual_transform: Any = None, 26 | num_environments: int = 1, 27 | ): 28 | super().__init__() 29 | self.avgpooled_image = avgpooled_image 30 | self.is_blind = False 31 | self.visual_transform = visual_transform 32 | self.num_environments = num_environments 33 | 34 | if normalize_visual_inputs: 35 | self.running_mean_and_var: nn.Module = RunningMeanAndVar(input_channels) 36 | else: 37 | self.running_mean_and_var = nn.Sequential() 38 | 39 | if "resnet" in backbone: 40 | make_backbone = getattr(resnet, backbone) 41 | self.backbone = make_backbone( 42 | input_channels, resnet_baseplanes, resnet_ngroups 43 | ) 44 | 45 | spatial_size = image_size 46 | if self.avgpooled_image: 47 | spatial_size = image_size // 2 48 | 49 | final_spatial = int(spatial_size * self.backbone.final_spatial_compress) 50 | after_compression_flat_size = 2048 51 | num_compression_channels = int( 52 | round(after_compression_flat_size / (final_spatial**2)) 53 | ) 54 | self.compression = nn.Sequential( 55 | nn.Conv2d( 56 | self.backbone.final_channels, 57 | num_compression_channels, 58 | kernel_size=3, 59 | padding=1, 60 | bias=False, 61 | ), 62 | nn.GroupNorm(1, num_compression_channels), 63 | nn.ReLU(True), 64 | ) 65 | 66 | output_shape = ( 67 | num_compression_channels, 68 | final_spatial, 69 | final_spatial, 70 | ) 71 | self.output_shape = output_shape 72 | self.output_size = np.prod(output_shape) 73 | else: 74 | raise ValueError("unknown backbone {}".format(backbone)) 75 | 76 | def forward(self, observations: torch.Tensor, N: int = None) -> torch.Tensor: # type: ignore 77 | num_environments = self.num_environments 78 | if N is not None: 79 | num_environments = N 80 | 81 | rgb = observations["rgb"] 82 | x = self.visual_transform(rgb, num_environments) 83 | 84 | if ( 85 | self.avgpooled_image 86 | ): # For compatibility with the habitat_baselines implementation 87 | x = F.avg_pool2d(x, 2) 88 | x = self.running_mean_and_var(x) 89 | x = self.backbone(x) 90 | x = self.compression(x) 91 | return x 92 | -------------------------------------------------------------------------------- /config/experiments/transformer_rl.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - /habitat_baselines: habitat_baselines_rl_config_base 5 | - /habitat_baselines/rl/policy/obs_transforms: 6 | - resize 7 | - objectnav_stretch_hm3d 8 | - override /habitat/task/lab_sensors: 9 | - clip_objectgoal_sensor 10 | - step_id_sensor 11 | - _self_ 12 | 13 | habitat: 14 | environment: 15 | iterator_options: 16 | max_scene_repeat_steps: 50000 17 | task: 18 | success_reward: 2.5 19 | slack_reward: -1e-3 20 | lab_sensors: 21 | clip_objectgoal_sensor: 22 
| cache: data/text_embeddings/siglip.pkl 23 | measurements: 24 | success: 25 | success_distance: 0.25 26 | distance_to_goal: 27 | type: OVONDistanceToGoal 28 | dataset: 29 | type: "OVON-v1" 30 | split: train 31 | data_path: data/datasets/ovon/hm3d/v1/{split}/{split}.json.gz 32 | simulator: 33 | type: "OVONSim-v0" 34 | navmesh_settings: 35 | agent_max_climb: 0.1 36 | cell_height: 0.05 37 | 38 | habitat_baselines: 39 | torch_gpu_id: 0 40 | tensorboard_dir: "tb" 41 | video_dir: "video_dir" 42 | test_episode_count: -1 43 | eval_ckpt_path_dir: "data/new_checkpoints" 44 | num_environments: 32 45 | checkpoint_folder: "data/new_checkpoints" 46 | trainer_name: "ver_transformer" 47 | num_updates: -1 48 | total_num_steps: 150000000 49 | log_interval: 10 50 | num_checkpoints: 50 51 | # Force PyTorch to be single threaded as 52 | # this improves performance considerably 53 | force_torch_single_threaded: True 54 | 55 | eval: 56 | split: "val" 57 | 58 | rl: 59 | 60 | policy: 61 | name: "OVONTransformerPolicy" 62 | backbone: "siglip" 63 | fusion_type: "concat" 64 | use_vis_query: True 65 | use_residual: True 66 | residual_vision: True 67 | rgb_only: False 68 | transformer_config: 69 | model_name: "llama" 70 | n_layers: 4 71 | n_heads: 8 72 | n_hidden: 512 73 | n_mlp_hidden: 1024 74 | max_context_length: 100 75 | shuffle_pos_id_for_update: True 76 | 77 | ppo: 78 | # ppo params 79 | clip_param: 0.2 80 | ppo_epoch: 1 81 | num_mini_batch: 2 82 | value_loss_coef: 0.5 83 | entropy_coef: 0.01 84 | lr: 2.5e-4 85 | eps: 1e-5 86 | max_grad_norm: 0.2 87 | num_steps: 100 88 | use_gae: True 89 | gamma: 0.99 90 | tau: 0.95 91 | use_linear_clip_decay: False 92 | use_linear_lr_decay: False 93 | reward_window_size: 50 94 | 95 | use_normalized_advantage: False 96 | 97 | hidden_size: 512 98 | 99 | ddppo: 100 | sync_frac: 0.6 101 | # The PyTorch distributed backend to use 102 | distrib_backend: NCCL 103 | # Visual encoder backbone 104 | pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth 105 | # Initialize with pretrained weights 106 | pretrained: False 107 | # Initialize just the visual encoder backbone with pretrained weights 108 | pretrained_encoder: False 109 | # Whether or not the visual encoder backbone will be trained. 
110 | train_encoder: False 111 | # Whether or not to reset the critic linear layer 112 | reset_critic: True 113 | -------------------------------------------------------------------------------- /ovon/task/rewards.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Any, Optional 2 | 3 | import numpy as np 4 | from habitat.core.embodied_task import EmbodiedTask, Measure 5 | from habitat.core.registry import registry 6 | from habitat.core.simulator import Simulator 7 | from habitat.tasks.nav.nav import DistanceToGoal, Success 8 | 9 | from ovon.measurements.imagenav import AngleSuccess, AngleToGoal 10 | 11 | if TYPE_CHECKING: 12 | from omegaconf import DictConfig 13 | 14 | 15 | @registry.register_measure 16 | class ImageNavReward(Measure): 17 | cls_uuid: str = "imagenav_reward" 18 | 19 | def __init__(self, *args: Any, sim: Simulator, config: "DictConfig", **kwargs: Any): 20 | super().__init__(**kwargs) 21 | self._sim = sim 22 | self._config = config 23 | self._previous_dtg: Optional[float] = None 24 | self._previous_atg: Optional[float] = None 25 | 26 | def _get_uuid(self, *args: Any, **kwargs: Any) -> str: 27 | return self.cls_uuid 28 | 29 | def reset_metric( 30 | self, 31 | *args: Any, 32 | task: EmbodiedTask, 33 | **kwargs: Any, 34 | ): 35 | task.measurements.check_measure_dependencies( 36 | self.uuid, 37 | [ 38 | Success.cls_uuid, 39 | DistanceToGoal.cls_uuid, 40 | AngleToGoal.cls_uuid, 41 | AngleSuccess.cls_uuid, 42 | ], 43 | ) 44 | self._metric = None 45 | self._previous_dtg = None 46 | self._previous_atg = None 47 | self.update_metric(task=task) 48 | 49 | def update_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any): 50 | # success reward 51 | success = task.measurements.measures[Success.cls_uuid].get_metric() 52 | success_reward = self._config.success_reward if success else 0.0 53 | 54 | # distance-to-goal reward 55 | dtg = task.measurements.measures[DistanceToGoal.cls_uuid].get_metric() 56 | if self._previous_dtg is None: 57 | self._previous_dtg = dtg 58 | add_dtg = self._config.use_dtg_reward 59 | dtg_reward = self._previous_dtg - dtg if add_dtg else 0.0 60 | self._previous_dtg = dtg 61 | 62 | # angle-to-goal reward 63 | atg = task.measurements.measures[AngleToGoal.cls_uuid].get_metric() 64 | add_atg = self._config.use_atg_reward 65 | if self._config.use_atg_fix: 66 | if dtg > self._config.atg_reward_distance: 67 | atg = np.pi 68 | else: 69 | if dtg > self._config.atg_reward_distance: 70 | add_atg = False 71 | if self._previous_atg is None: 72 | self._previous_atg = atg 73 | angle_reward = self._previous_atg - atg if add_atg else 0.0 74 | self._previous_atg = atg 75 | 76 | # angle success reward 77 | angle_success = task.measurements.measures[AngleSuccess.cls_uuid].get_metric() 78 | angle_success_reward = ( 79 | self._config.angle_success_reward if angle_success else 0.0 80 | ) 81 | 82 | # slack penalty 83 | slack_penalty = self._config.slack_penalty 84 | 85 | # reward 86 | self._metric = ( 87 | success_reward 88 | + dtg_reward 89 | + angle_reward 90 | + angle_success_reward 91 | + slack_penalty 92 | ) 93 | -------------------------------------------------------------------------------- /ovon/utils/dagger_environment_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) Meta Platforms, Inc. and its affiliates. 
4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import math 8 | from multiprocessing.context import BaseContext 9 | from typing import ( 10 | TYPE_CHECKING, 11 | List, 12 | Optional, 13 | ) 14 | 15 | import attr 16 | from habitat_baselines.rl.ver.environment_worker import ( 17 | EnvironmentWorker, 18 | EnvironmentWorkerProcess, 19 | _create_worker_configs, 20 | ) 21 | from habitat_baselines.rl.ver.worker_common import WorkerBase, WorkerQueues 22 | 23 | if TYPE_CHECKING: 24 | from omegaconf import DictConfig 25 | 26 | 27 | @attr.s(auto_attribs=True, auto_detect=True) 28 | class ILEnvironmentWorkerProcess(EnvironmentWorkerProcess): 29 | teacher_label: Optional[str] = None 30 | 31 | def _step_env(self, action): 32 | with self.timer.avg_time("step env"): 33 | obs, reward, done, info = self.env.step( 34 | self._last_obs[self.teacher_label].item() 35 | ) 36 | # ^only line different from the original EnvironmentWorkerProcess._step_env 37 | self._step_id += 1 38 | 39 | if not math.isfinite(reward): 40 | reward = -1.0 41 | 42 | with self.timer.avg_time("reset env"): 43 | if done: 44 | self._episode_id += 1 45 | self._step_id = 0 46 | if self.auto_reset_done: 47 | obs = self.env.reset() 48 | 49 | return obs, reward, done, info 50 | 51 | 52 | class ILEnvironmentWorker(EnvironmentWorker): 53 | def __init__( 54 | self, 55 | mp_ctx: BaseContext, 56 | env_idx: int, 57 | env_config, 58 | auto_reset_done, 59 | queues: WorkerQueues, 60 | ): 61 | teacher_label = None 62 | obs_trans_conf = env_config.habitat_baselines.rl.policy.obs_transforms 63 | if hasattr(env_config.habitat_baselines.rl.policy, "obs_transforms"): 64 | for obs_transform_config in obs_trans_conf.values(): 65 | if hasattr(obs_transform_config, "teacher_label"): 66 | teacher_label = obs_transform_config.teacher_label 67 | break 68 | assert teacher_label is not None, "teacher_label not found in config" 69 | WorkerBase.__init__( 70 | self, 71 | mp_ctx, 72 | ILEnvironmentWorkerProcess, 73 | env_idx, 74 | env_config, 75 | auto_reset_done, 76 | queues, 77 | teacher_label=teacher_label, 78 | ) 79 | self.env_worker_queue = queues.environments[env_idx] 80 | 81 | 82 | def _construct_il_environment_workers_impl( 83 | configs, 84 | auto_reset_done, 85 | mp_ctx: BaseContext, 86 | queues: WorkerQueues, 87 | ): 88 | num_environments = len(configs) 89 | workers = [] 90 | for i in range(num_environments): 91 | w = ILEnvironmentWorker(mp_ctx, i, configs[i], auto_reset_done, queues) 92 | workers.append(w) 93 | 94 | return workers 95 | 96 | 97 | def construct_il_environment_workers( 98 | config: "DictConfig", 99 | mp_ctx: BaseContext, 100 | worker_queues: WorkerQueues, 101 | ) -> List[EnvironmentWorker]: 102 | configs = _create_worker_configs(config) 103 | 104 | return _construct_il_environment_workers_impl(configs, True, mp_ctx, worker_queues) 105 | -------------------------------------------------------------------------------- /config/experiments/rnn_rl_finetune.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - /habitat_baselines: habitat_baselines_rl_config_base 5 | - /habitat_baselines/rl/policy/obs_transforms: 6 | - resize 7 | - objectnav_stretch_hm3d 8 | - override /habitat/task/lab_sensors: 9 | - clip_objectgoal_sensor 10 | - override /habitat/task/measurements: 11 | - distance_to_goal 12 | - success 13 | - spl 14 | - soft_spl 15 | - collisions 16 | - collision_penalty 17 | - 
_self_ 18 | 19 | habitat: 20 | environment: 21 | iterator_options: 22 | max_scene_repeat_steps: 50000 23 | task: 24 | success_reward: 5.0 25 | slack_reward: -1e-3 26 | reward_measure: "collision_penalty" 27 | lab_sensors: 28 | clip_objectgoal_sensor: 29 | cache: data/text_embeddings/siglip.pkl 30 | measurements: 31 | success: 32 | success_distance: 0.25 33 | distance_to_goal: 34 | type: OVONDistanceToGoal 35 | dataset: 36 | type: "OVON-v1" 37 | split: train 38 | data_path: data/datasets/ovon/hm3d/v1/{split}/{split}.json.gz 39 | simulator: 40 | type: "OVONSim-v0" 41 | navmesh_settings: 42 | agent_max_climb: 0.1 43 | cell_height: 0.05 44 | 45 | habitat_baselines: 46 | torch_gpu_id: 0 47 | tensorboard_dir: "tb" 48 | video_dir: "video_dir" 49 | test_episode_count: -1 50 | eval_ckpt_path_dir: "data/new_checkpoints" 51 | num_environments: 32 52 | checkpoint_folder: "data/new_checkpoints" 53 | trainer_name: "ver" 54 | num_updates: -1 55 | total_num_steps: 150000000 56 | log_interval: 10 57 | num_checkpoints: 50 58 | # Force PyTorch to be single threaded as 59 | # this improves performance considerably 60 | force_torch_single_threaded: True 61 | 62 | eval: 63 | split: "val" 64 | 65 | rl: 66 | 67 | policy: 68 | name: "PointNavResNetCLIPPolicy" 69 | backbone: "siglip" 70 | fusion_type: "concat" 71 | use_vis_query: True 72 | use_residual: True 73 | residual_vision: True 74 | rgb_only: False 75 | 76 | finetune: 77 | enabled: True 78 | lr: 1.5e-5 79 | start_actor_warmup_at: 750 80 | start_actor_update_at: 1000 81 | start_critic_warmup_at: 500 82 | start_critic_update_at: 1000 83 | 84 | ppo: 85 | # ppo params 86 | clip_param: 0.2 87 | ppo_epoch: 1 88 | num_mini_batch: 2 89 | value_loss_coef: 0.5 90 | entropy_coef: 0.01 91 | lr: 2.5e-4 92 | eps: 1e-5 93 | max_grad_norm: 0.2 94 | num_steps: 100 95 | use_gae: True 96 | gamma: 0.99 97 | tau: 0.95 98 | use_linear_clip_decay: False 99 | use_linear_lr_decay: True 100 | reward_window_size: 50 101 | 102 | use_normalized_advantage: False 103 | 104 | hidden_size: 1024 105 | 106 | ddppo: 107 | sync_frac: 0.6 108 | # The PyTorch distributed backend to use 109 | distrib_backend: NCCL 110 | # Visual encoder backbone 111 | pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth 112 | # Initialize with pretrained weights 113 | pretrained: False 114 | # Initialize just the visual encoder backbone with pretrained weights 115 | pretrained_encoder: False 116 | # Whether or not the visual encoder backbone will be trained. 
117 | train_encoder: False 118 | # Whether or not to reset the critic linear layer 119 | reset_critic: True 120 | 121 | # Model parameters 122 | rnn_type: LSTM 123 | num_recurrent_layers: 4 124 | -------------------------------------------------------------------------------- /ovon/utils/analysis/fixes_to_provided_mappings.csv: -------------------------------------------------------------------------------- 1 | Raw,Corrected 2 | All Raw that map to “lamp”, 3 | lamp,lamp 4 | lamp ,lamp 5 | lamps,lamp 6 | lmap,lamp 7 | unknown/ probably lamp,lamp 8 | łamp,lamp 9 | , 10 | All Raw that map to “table lamp”, 11 | table lamp,lamp 12 | table lamp,lamp 13 | unknown/ probably table lamp,table lamp 14 | , 15 | All Raw that map to “plant”, 16 | dacorative plant,plant 17 | ornament plant,plant 18 | ornamental plant,plant 19 | plant,plant 20 | plants,plant 21 | table plant,plant 22 | twigs in vase,plant 23 | unknown/ probably decorative plant,plant 24 | vegetation,plant 25 | , 26 | All Raw that map to “flower”, 27 | flower,flower 28 | flower ,flower 29 | FLOWER,flower 30 | flower,flower 31 | flowers,flower 32 | ornament flower,flower 33 | vase with flower,flower 34 | , 35 | All Raw that map to “couch”, 36 | couch,couch 37 | , 38 | All Raw that map to “sofa”, 39 | unknown/ probably sofa,sofa 40 | sofa,sofa 41 | , 42 | All Raw that map to “box”, 43 | box,box 44 | tabletop box,box 45 | tea box,box 46 | tin,box 47 | towel box,box 48 | unknown/ probably box,box 49 | unknown/ probably desk or box,box 50 | , 51 | All Raw that map to “boxes”, 52 | boxes,boxes 53 | tea boxes ,boxes 54 | tea boxes,boxes 55 | pile of boxes,boxes 56 | , 57 | All Raw that map to “refrigerator”, 58 | refridgerator,refrigerator 59 | refrigearator,refrigerator 60 | refrigerator,refrigerator 61 | wine refrigerator,refrigerator 62 | , 63 | All Raw that map to “fridge”, 64 | fridge,fridge 65 | unknown/ probably fridge,fridge 66 | , 67 | All Raw that map to “picture”, 68 | art,picture 69 | framed picture,picture 70 | image,picture 71 | pictrure,picture 72 | picture,picture 73 | picture ,picture 74 | pictures,picture 75 | picure,picture 76 | unknown picture/window,picture 77 | unknown/ probably framed picture,picture 78 | wall decoration,picture 79 | wall frame,picture 80 | , 81 | All Raw that map to “poster”, 82 | poster,picture 83 | poster figure,poster 84 | , 85 | All Raw that map to “bedside table”, 86 | nighstand,nightstand 87 | night stand,nightstand 88 | night table,nightstand 89 | nightsand,nightstand 90 | nightstand,nightstand 91 | nigtstand,nightstand 92 | , 93 | All Raw that map to “bedside table”, 94 | beside table,bedside table 95 | bedside table,bedside table 96 | , 97 | All Raw that map to “paper”, 98 | paper,paper 99 | paper ,paper 100 | paper,paper 101 | , 102 | All Raw that map to “papers”, 103 | papers,papers 104 | pile of papers,papers 105 | unknown/ probably letters,papers 106 | , 107 | All Raw that map to “ironing board”, 108 | ironing board,ironing board 109 | ironing board ,ironing board 110 | , 111 | All Raw that map to “iron board”, 112 | iron board,iron board 113 | , 114 | All Raw that map to “cup”, 115 | cup,cup 116 | toothbrush cup,cup 117 | toothbrush holder,cup 118 | unknown /probably cup,cup 119 | unknown/ proably cup,cup 120 | , 121 | All Raw that map to “cups”, 122 | cups,cups 123 | pile of cups,cups 124 | unknown/ probably cups,cups 125 | , 126 | All Raw that map to “drawer”, 127 | drawer,drawer 128 | wardrobe drawer,drawer 129 | , 130 | All Raw that map to “drawers”, 131 | drawers,drawers 132 | office 
drawer,drawers 133 | organizer drawers,drawers 134 | , 135 | All Raw that map to “paper towel”, 136 | paper towel,paper towel 137 | , 138 | All Raw that map to “paper towels”, 139 | unknown/ probably paper towel dispenser,paper towels 140 | unknown/ probably paper towel holder,paper towels 141 | paper towels,paper towel 142 | , 143 | All Raw that map to “weight”, 144 | weight,weight 145 | , 146 | , 147 | All Raw that map to “weights”, 148 | workout weight,weights 149 | weights,weights 150 | -------------------------------------------------------------------------------- /config/experiments/rnn_dagger.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - /habitat_baselines: habitat_baselines_rl_config_base 5 | - /habitat_baselines/rl/policy/obs_transforms: 6 | - resize 7 | - relabel_teacher_actions 8 | - /habitat/task/measurements: 9 | - frontier_exploration_map 10 | - objectnav_stretch_hm3d 11 | - override /habitat/task/lab_sensors: 12 | - objnav_explorer 13 | - clip_objectgoal_sensor 14 | - _self_ 15 | 16 | habitat: 17 | environment: 18 | iterator_options: 19 | max_scene_repeat_steps: 50000 20 | task: 21 | success_reward: 2.5 22 | slack_reward: -1e-3 23 | lab_sensors: 24 | objnav_explorer: 25 | map_resolution: 128 26 | beeline_dist_thresh: 3.5 27 | visibility_dist: 3.0 28 | area_thresh: 1.5 29 | success_distance: 0.25 30 | fov: 42 31 | clip_objectgoal_sensor: 32 | cache: data/text_embeddings/siglip.pkl 33 | measurements: 34 | success: 35 | success_distance: 0.25 36 | distance_to_goal: 37 | type: OVONDistanceToGoal 38 | dataset: 39 | type: "OVON-v1" 40 | split: train 41 | data_path: data/datasets/ovon/hm3d/v1/{split}/{split}.json.gz 42 | simulator: 43 | type: "OVONSim-v0" 44 | navmesh_settings: 45 | agent_max_climb: 0.1 46 | cell_height: 0.05 47 | habitat_sim_v0: 48 | allow_sliding: True 49 | 50 | habitat_baselines: 51 | torch_gpu_id: 0 52 | tensorboard_dir: "tb" 53 | video_dir: "video_dir" 54 | test_episode_count: -1 55 | eval_ckpt_path_dir: "data/new_checkpoints" 56 | num_environments: 32 57 | checkpoint_folder: "data/new_checkpoints" 58 | trainer_name: "ver_dagger" 59 | num_updates: -1 60 | total_num_steps: 150000000 61 | log_interval: 10 62 | num_checkpoints: 50 63 | # Force PyTorch to be single threaded as 64 | # this improves performance considerably 65 | force_torch_single_threaded: True 66 | 67 | eval: 68 | split: "val" 69 | 70 | rl: 71 | 72 | policy: 73 | name: "PointNavResNetCLIPPolicy" 74 | backbone: "siglip" 75 | fusion_type: "concat" 76 | use_vis_query: True 77 | use_residual: True 78 | residual_vision: True 79 | rgb_only: False 80 | obs_transforms: 81 | relabel_teacher_actions: 82 | teacher_label: "objnav_explorer" 83 | 84 | ppo: 85 | # ppo params 86 | clip_param: 0.2 87 | ppo_epoch: 1 88 | num_mini_batch: 2 89 | value_loss_coef: 0.5 90 | entropy_coef: 0.01 91 | lr: 2.5e-4 92 | eps: 1e-5 93 | max_grad_norm: 0.2 94 | num_steps: 100 95 | use_gae: True 96 | gamma: 0.99 97 | tau: 0.95 98 | use_linear_clip_decay: False 99 | use_linear_lr_decay: False 100 | reward_window_size: 50 101 | 102 | use_normalized_advantage: False 103 | 104 | hidden_size: 1024 105 | 106 | ddppo: 107 | sync_frac: 0.6 108 | # The PyTorch distributed backend to use 109 | distrib_backend: NCCL 110 | # Visual encoder backbone 111 | pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth 112 | # Initialize with pretrained weights 113 | pretrained: False 114 | # Initialize just the visual encoder backbone with pretrained 
weights 115 | pretrained_encoder: False 116 | # Whether or not the visual encoder backbone will be trained. 117 | train_encoder: False 118 | # Whether or not to reset the critic linear layer 119 | reset_critic: True 120 | 121 | # Model parameters 122 | rnn_type: LSTM 123 | num_recurrent_layers: 4 124 | -------------------------------------------------------------------------------- /ovon/utils/analysis/postprocess_meta.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import defaultdict 3 | 4 | import clip 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import pandas as pd 8 | import seaborn as sns 9 | import torch 10 | 11 | from ovon.utils.utils import load_json, write_json 12 | 13 | PROMPT = "{category}" 14 | 15 | 16 | def postprocess_meta(input_path, output_path): 17 | df = pd.read_csv(input_path) 18 | 19 | categories_by_region = defaultdict(list) 20 | for idx, row in df.iterrows(): 21 | categories_by_region[row["Region Proposal"]].append(row["Category"]) 22 | 23 | for k, v in categories_by_region.items(): 24 | print(k, len(v)) 25 | 26 | write_json(categories_by_region, output_path) 27 | 28 | 29 | def clip_embeddings(clip_m, prompts): 30 | tokens = [] 31 | for prompt in prompts: 32 | tokens.append(clip.tokenize(prompt, context_length=77).numpy()) 33 | 34 | batch = torch.tensor(np.array(tokens)).cuda() 35 | with torch.no_grad(): 36 | text_embedding = clip_m.encode_text(batch.flatten(0, 1)).float() 37 | return text_embedding 38 | 39 | 40 | def max_similarity(clip_m, category, val_seen_categories): 41 | categories = val_seen_categories.copy() 42 | if category in categories: 43 | categories.remove(category) 44 | 45 | prompt = PROMPT.format(category=category) 46 | text_embedding = clip_embeddings(clip_m, [prompt] + categories) 47 | return ( 48 | torch.cosine_similarity(text_embedding[0].unsqueeze(0), text_embedding[1:]) 49 | .max() 50 | .item() 51 | ) 52 | 53 | 54 | def semantic_failures(input_path): 55 | clip_m, preprocess = clip.load("RN50", device="cuda") 56 | 57 | records = load_json(input_path) 58 | 59 | failures = 0 60 | defaultdict(int) 61 | for k, v in records.items(): 62 | failures += 1 - v["success"] 63 | if not v["success"]: 64 | nearest_objects = v["failure_modes.objects_within_2m"].split(",") 65 | category = v["target"] 66 | if category in nearest_objects: 67 | nearest_objects.remove(category) 68 | 69 | 70 | def failure_metrics(input_path, output_path): 71 | records = load_json(input_path) 72 | 73 | failures = 0 74 | failure_modes = defaultdict(int) 75 | for k, v in records.items(): 76 | failures += 1 - v["success"] 77 | if not v["success"]: 78 | for kk in v.keys(): 79 | if kk in [ 80 | "failure_modes.recognition", 81 | "failure_modes.exploration", 82 | "failure_modes.last_mile_nav", 83 | "failure_modes.stop_failure", 84 | ]: 85 | failure_modes[kk] += v[kk] 86 | 87 | failure_modes = {k: v / failures for k, v in failure_modes.items()} 88 | labels = list(failure_modes.keys()) 89 | metrics = list(failure_modes.values()) 90 | 91 | colors = sns.color_palette("pastel")[0:5] 92 | 93 | # create pie chart 94 | fig, ax = plt.subplots(figsize=(6, 3)) 95 | ax.pie(metrics, labels=labels, colors=colors, autopct="%.0f%%") 96 | fig.savefig(output_path, bbox_inches="tight") 97 | 98 | print(failure_modes) 99 | 100 | 101 | if __name__ == "__main__": 102 | parser = argparse.ArgumentParser() 103 | parser.add_argument( 104 | "--input-path", type=str, help="Path to the meta csv file", required=True 105 | ) 106 | 
parser.add_argument( 107 | "--output-path", type=str, help="Path to the output file", required=True 108 | ) 109 | args = parser.parse_args() 110 | # postprocess_meta(args.input_path, args.output_path) 111 | failure_metrics(args.input_path, args.output_path) 112 | -------------------------------------------------------------------------------- /config/experiments/transformer_rl_finetune.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - /habitat_baselines: habitat_baselines_rl_config_base 5 | - /habitat_baselines/rl/policy/obs_transforms: 6 | - resize 7 | - objectnav_stretch_hm3d 8 | - override /habitat/task/lab_sensors: 9 | - clip_objectgoal_sensor 10 | - step_id_sensor 11 | - override /habitat/task/measurements: 12 | - distance_to_goal 13 | - success 14 | - spl 15 | - soft_spl 16 | - collisions 17 | - collision_penalty 18 | - _self_ 19 | 20 | habitat: 21 | environment: 22 | iterator_options: 23 | max_scene_repeat_steps: 50000 24 | task: 25 | success_reward: 5.0 26 | slack_reward: -1e-3 27 | reward_measure: "collision_penalty" 28 | lab_sensors: 29 | clip_objectgoal_sensor: 30 | cache: data/text_embeddings/siglip.pkl 31 | measurements: 32 | success: 33 | success_distance: 0.25 34 | distance_to_goal: 35 | type: OVONDistanceToGoal 36 | dataset: 37 | type: "OVON-v1" 38 | split: train 39 | data_path: data/datasets/ovon/hm3d/v1/{split}/{split}.json.gz 40 | simulator: 41 | type: "OVONSim-v0" 42 | navmesh_settings: 43 | agent_max_climb: 0.1 44 | cell_height: 0.05 45 | 46 | habitat_baselines: 47 | torch_gpu_id: 0 48 | tensorboard_dir: "tb" 49 | video_dir: "video_dir" 50 | test_episode_count: -1 51 | eval_ckpt_path_dir: "data/new_checkpoints" 52 | num_environments: 32 53 | checkpoint_folder: "data/new_checkpoints" 54 | trainer_name: "ver_transformer" 55 | num_updates: -1 56 | total_num_steps: 150000000 57 | log_interval: 10 58 | num_checkpoints: 50 59 | # Force PyTorch to be single threaded as 60 | # this improves performance considerably 61 | force_torch_single_threaded: True 62 | 63 | eval: 64 | split: "val" 65 | 66 | rl: 67 | 68 | policy: 69 | name: "OVONTransformerPolicy" 70 | backbone: "siglip" 71 | fusion_type: "concat" 72 | use_vis_query: True 73 | use_residual: True 74 | residual_vision: True 75 | rgb_only: False 76 | transformer_config: 77 | model_name: "llama" 78 | n_layers: 4 79 | n_heads: 8 80 | n_hidden: 512 81 | n_mlp_hidden: 1024 82 | max_context_length: 100 83 | shuffle_pos_id_for_update: True 84 | 85 | finetune: 86 | enabled: True 87 | lr: 1.5e-5 88 | start_actor_warmup_at: 750 89 | start_actor_update_at: 1000 90 | start_critic_warmup_at: 500 91 | start_critic_update_at: 1000 92 | 93 | ppo: 94 | # ppo params 95 | clip_param: 0.2 96 | ppo_epoch: 1 97 | num_mini_batch: 2 98 | value_loss_coef: 0.5 99 | entropy_coef: 0.01 100 | lr: 2.5e-4 101 | eps: 1e-5 102 | max_grad_norm: 0.2 103 | num_steps: 100 104 | use_gae: True 105 | gamma: 0.99 106 | tau: 0.95 107 | use_linear_clip_decay: False 108 | use_linear_lr_decay: True 109 | reward_window_size: 50 110 | 111 | use_normalized_advantage: False 112 | 113 | hidden_size: 512 114 | 115 | ddppo: 116 | sync_frac: 0.6 117 | # The PyTorch distributed backend to use 118 | distrib_backend: NCCL 119 | # Visual encoder backbone 120 | pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth 121 | # Initialize with pretrained weights 122 | pretrained: False 123 | # Initialize just the visual encoder backbone with pretrained weights 124 | pretrained_encoder: False 
125 | # Whether or not the visual encoder backbone will be trained. 126 | train_encoder: False 127 | # Whether or not to reset the critic linear layer 128 | reset_critic: True 129 | -------------------------------------------------------------------------------- /config/experiments/transformer_dagger.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - /habitat_baselines: habitat_baselines_rl_config_base 5 | - /habitat_baselines/rl/policy/obs_transforms: 6 | - resize 7 | - relabel_teacher_actions 8 | - /habitat/task/measurements: 9 | - frontier_exploration_map 10 | - objectnav_stretch_hm3d 11 | - override /habitat/task/lab_sensors: 12 | - objnav_explorer 13 | - clip_objectgoal_sensor 14 | - step_id_sensor 15 | - _self_ 16 | 17 | habitat: 18 | environment: 19 | iterator_options: 20 | max_scene_repeat_steps: 50000 21 | task: 22 | success_reward: 2.5 23 | slack_reward: -1e-3 24 | lab_sensors: 25 | objnav_explorer: 26 | map_resolution: 128 27 | beeline_dist_thresh: 3.5 28 | visibility_dist: 3.0 29 | area_thresh: 1.5 30 | success_distance: 0.25 31 | fov: 42 32 | clip_objectgoal_sensor: 33 | cache: data/text_embeddings/siglip.pkl 34 | measurements: 35 | success: 36 | success_distance: 0.25 37 | distance_to_goal: 38 | type: OVONDistanceToGoal 39 | dataset: 40 | type: "OVON-v1" 41 | split: train 42 | data_path: data/datasets/ovon/hm3d/v1/{split}/{split}.json.gz 43 | simulator: 44 | type: "OVONSim-v0" 45 | navmesh_settings: 46 | agent_max_climb: 0.1 47 | cell_height: 0.05 48 | habitat_sim_v0: 49 | allow_sliding: True 50 | 51 | habitat_baselines: 52 | torch_gpu_id: 0 53 | tensorboard_dir: "tb" 54 | video_dir: "video_dir" 55 | test_episode_count: -1 56 | eval_ckpt_path_dir: "data/new_checkpoints" 57 | num_environments: 32 58 | checkpoint_folder: "data/new_checkpoints" 59 | trainer_name: "ver_dagger" 60 | num_updates: -1 61 | total_num_steps: 150000000 62 | log_interval: 10 63 | num_checkpoints: 50 64 | # Force PyTorch to be single threaded as 65 | # this improves performance considerably 66 | force_torch_single_threaded: True 67 | 68 | eval: 69 | split: "val_seen" 70 | 71 | rl: 72 | 73 | policy: 74 | name: "OVONTransformerPolicy" 75 | backbone: "siglip" 76 | fusion_type: "concat" 77 | use_vis_query: True 78 | use_residual: True 79 | residual_vision: True 80 | rgb_only: False 81 | obs_transforms: 82 | relabel_teacher_actions: 83 | teacher_label: "objnav_explorer" 84 | transformer_config: 85 | model_name: "llama" 86 | n_layers: 4 87 | n_heads: 8 88 | n_hidden: 512 89 | n_mlp_hidden: 1024 90 | max_context_length: 100 91 | shuffle_pos_id_for_update: True 92 | 93 | ppo: 94 | # ppo params 95 | clip_param: 0.2 96 | ppo_epoch: 1 97 | num_mini_batch: 2 98 | value_loss_coef: 0.5 99 | entropy_coef: 0.01 100 | lr: 2.5e-4 101 | eps: 1e-5 102 | max_grad_norm: 0.2 103 | num_steps: 100 104 | use_gae: True 105 | gamma: 0.99 106 | tau: 0.95 107 | use_linear_clip_decay: False 108 | use_linear_lr_decay: False 109 | reward_window_size: 50 110 | 111 | use_normalized_advantage: False 112 | 113 | hidden_size: 512 114 | 115 | ddppo: 116 | sync_frac: 0.6 117 | # The PyTorch distributed backend to use 118 | distrib_backend: NCCL 119 | # Visual encoder backbone 120 | pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth 121 | # Initialize with pretrained weights 122 | pretrained: False 123 | # Initialize just the visual encoder backbone with pretrained weights 124 | pretrained_encoder: False 125 | # Whether or not the visual encoder 
backbone will be trained. 126 | train_encoder: False 127 | # Whether or not to reset the critic linear layer 128 | reset_critic: True 129 | -------------------------------------------------------------------------------- /ovon/utils/shuffle_episodes.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | from collections import defaultdict 5 | 6 | from ovon.utils.utils import load_dataset, load_json, write_dataset 7 | 8 | 9 | def shuffle_episodes( 10 | dataset_path: str, 11 | output_path: str, 12 | meta_path: str = "data/hm3d_meta/val_splits.json", 13 | ): 14 | category_per_splits = load_json(meta_path) 15 | splits = list(category_per_splits.keys()) 16 | print(splits) 17 | 18 | scenes = glob.glob(os.path.join(dataset_path, splits[0], "content/*json.gz")) 19 | 20 | # make directories 21 | for split in splits: 22 | os.makedirs(os.path.join(output_path, split, "content"), exist_ok=True) 23 | 24 | for scene in scenes: 25 | scene_id = scene.split("/")[-1] 26 | scene_name = scene_id.split(".")[0] 27 | 28 | goals_by_category = {} 29 | episodes_by_category = defaultdict(list) 30 | for split in splits: 31 | path = os.path.join(dataset_path, split, "content", scene_id) 32 | dataset = load_dataset(path) 33 | 34 | for goal_key, goal in dataset["goals_by_category"].items(): 35 | goals_by_category[goal_key] = goal 36 | 37 | for episode in dataset["episodes"]: 38 | episodes_by_category[episode["object_category"]].append(episode) 39 | 40 | for split in splits: 41 | path = os.path.join(dataset_path, split, "content", scene_id) 42 | dataset = load_dataset(path) 43 | 44 | goals_before = len(dataset["goals_by_category"].keys()) 45 | episodes_before = len(dataset["episodes"]) 46 | 47 | dataset["goals_by_category"] = {} 48 | dataset["episodes"] = [] 49 | 50 | for key in category_per_splits[split]: 51 | g_key = "{}.basis.glb_{}".format(scene_name, key) 52 | if goals_by_category.get(g_key) is None: 53 | continue 54 | dataset["goals_by_category"][g_key] = goals_by_category[g_key] 55 | dataset["episodes"].extend(episodes_by_category[key]) 56 | print( 57 | "Split: {}, # of goals: {}/{}, # of episodes: {}/{}".format( 58 | split, 59 | len(dataset["goals_by_category"].keys()), 60 | goals_before, 61 | len(dataset["episodes"]), 62 | episodes_before, 63 | ) 64 | ) 65 | 66 | op = os.path.join(output_path, split, "content", scene_id) 67 | print("Output: {}".format(op)) 68 | write_dataset(dataset, op) 69 | 70 | print("\n") 71 | for split in splits: 72 | files = glob.glob(os.path.join(output_path, split, "content/*json.gz")) 73 | 74 | goals = [] 75 | episodes = 0 76 | for f in files: 77 | dataset = load_dataset(f) 78 | episodes += len(dataset["episodes"]) 79 | 80 | goal_keys = [k.split("_")[-1] for k in dataset["goals_by_category"].keys()] 81 | goals.extend(goal_keys) 82 | 83 | diff = set(category_per_splits[split]).difference(set(goals)) 84 | print( 85 | "Validating Split: {}, # of goals: {}, # of episodes: {}, Difference: {}" 86 | .format(split, len(set(goals)), episodes, len(diff)) 87 | ) 88 | 89 | 90 | if __name__ == "__main__": 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument( 93 | "--dataset_path", type=str, default="data/hm3d_meta/val_splits.json" 94 | ) 95 | parser.add_argument( 96 | "--output_path", type=str, default="data/hm3d_meta/val_splits.json" 97 | ) 98 | 99 | args = parser.parse_args() 100 | 101 | shuffle_episodes(args.dataset_path, args.output_path) 102 | 
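A minimal usage sketch for the episode re-splitting utility above. The dataset paths are placeholders (they assume the `<split>/content/*.json.gz` layout that the script globs for), and the keyword arguments mirror the `shuffle_episodes` signature defined in this file:

```python
# Hypothetical invocation of shuffle_episodes(); adjust the placeholder paths
# to wherever the OVON val splits live on disk.
from ovon.utils.shuffle_episodes import shuffle_episodes

shuffle_episodes(
    dataset_path="data/datasets/ovon/hm3d/v1",         # root with one sub-directory per split listed in the meta file
    output_path="data/datasets/ovon/hm3d/v1_resplit",  # written back out with the same <split>/content layout
    meta_path="data/hm3d_meta/val_splits.json",        # default meta file used by the script
)
```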
-------------------------------------------------------------------------------- /ovon/utils/analysis/Issues_with_provided_mappings.csv: -------------------------------------------------------------------------------- 1 | "Raw","Corrected " 2 | "All Raw that map to “lamp”", 3 | "lamp","lamp" 4 | "lamp ","lamp" 5 | "lamps","lamp" 6 | "lmap","lamp" 7 | "table lamp","lamp" 8 | " table lamp","lamp" 9 | "unknown/ probably lamp","lamp" 10 | "łamp","lamp" 11 | , 12 | "All Raw that map to “table lamp”", 13 | "unknown/ probably table lamp","table lamp" 14 | , 15 | "All Raw that map to “plant”", 16 | "dacorative plant","plant" 17 | "flower","plant" 18 | "flower ","plant" 19 | "FLOWER","plant" 20 | " flower","plant" 21 | "flowers","plant" 22 | "ornament plant","plant" 23 | "ornamental plant","plant" 24 | "plant","plant" 25 | "plants","plant" 26 | "table plant","plant" 27 | "twigs in vase","plant" 28 | "unknown/ probably decorative plant","plant" 29 | "vegetation","plant" 30 | , 31 | "All Raw that map to “flower”", 32 | "ornament flower","flower" 33 | "vase with flower","flower" 34 | , 35 | "All Raw that map to “couch”", 36 | "couch","couch" 37 | "sofa","couch" 38 | , 39 | "All Raw that map to “sofa”", 40 | "unknown/ probably sofa","sofa" 41 | , 42 | "All Raw that map to “box”", 43 | "box","box" 44 | "boxes","box" 45 | "tabletop box","box" 46 | "tea boxes ","box" 47 | "tea box","box" 48 | "tea boxes","box" 49 | "tin","box" 50 | "towel box","box" 51 | "unknown/ probably box","box" 52 | "unknown/ probably desk or box","box" 53 | , 54 | "All Raw that map to “boxes”", 55 | "pile of boxes","boxes" 56 | , 57 | "All Raw that map to “refrigerator”", 58 | "fridge","refrigerator" 59 | "refridgerator","refrigerator" 60 | "refrigearator","refrigerator" 61 | "refrigerator","refrigerator" 62 | "wine refrigerator","refrigerator" 63 | , 64 | "All Raw that map to “fridge”", 65 | "unknown/ probably fridge","fridge" 66 | , 67 | "All Raw that map to “picture”", 68 | "art","picture" 69 | "framed picture","picture" 70 | "image","picture" 71 | "pictrure","picture" 72 | "picture","picture" 73 | "picture ","picture" 74 | "pictures","picture" 75 | "picure","picture" 76 | "poster","picture" 77 | "unknown picture/window","picture" 78 | "unknown/ probably framed picture","picture" 79 | "wall decoration","picture" 80 | "wall frame","picture" 81 | , 82 | "All Raw that map to “poster”", 83 | "poster figure","poster" 84 | , 85 | "All Raw that map to “bedside table”", 86 | "bedside table","nightstand" 87 | "nighstand","nightstand" 88 | "night stand","nightstand" 89 | "night table","nightstand" 90 | "nightsand","nightstand" 91 | "nightstand","nightstand" 92 | "nigtstand","nightstand" 93 | , 94 | "All Raw that map to “bedside table”", 95 | "beside table","bedside table" 96 | , 97 | "All Raw that map to “paper”", 98 | "paper","paper" 99 | "paper ","paper" 100 | " paper","paper" 101 | "papers","paper" 102 | , 103 | "All Raw that map to “papers”", 104 | "pile of papers","papers" 105 | "unknown/ probably letters","papers" 106 | , 107 | "All Raw that map to “ironing board”", 108 | "ironing board","ironing board" 109 | "ironing board ","ironing board" 110 | , 111 | "All Raw that map to “iron board”", 112 | "iron board","iron board" 113 | , 114 | "All Raw that map to “cup”", 115 | "cup","cup" 116 | "cups","cup" 117 | "toothbrush cup","cup" 118 | "toothbrush holder","cup" 119 | "unknown /probably cup","cup" 120 | "unknown/ proably cup","cup" 121 | "unknown/ probably cups","cup" 122 | , 123 | "All Raw that map to “cups”", 124 | "pile of 
cups","cups" 125 | , 126 | "All Raw that map to “drawer”", 127 | "drawer","drawer" 128 | "drawers","drawer" 129 | "wardrobe drawer","drawer" 130 | , 131 | "All Raw that map to “drawers”", 132 | "office drawer","drawers" 133 | "organizer drawers","drawers" 134 | , 135 | "All Raw that map to “paper towel”", 136 | "paper towel","paper towel" 137 | "paper towels","paper towel" 138 | , 139 | "All Raw that map to “paper towels”", 140 | "unknown/ probably paper towel dispenser","paper towels" 141 | "unknown/ probably paper towel holder","paper towels" 142 | , 143 | "All Raw that map to “weight”", 144 | "weight","weight" 145 | "weights","weight" 146 | , 147 | "All Raw that map to “weights”", 148 | "workout weight","weights" 149 | -------------------------------------------------------------------------------- /ovon/models/encoders/cross_attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class CrossAttention(nn.Module): 6 | def __init__( 7 | self, 8 | x1_dim: int, 9 | x2_dim: int, 10 | num_heads: int, 11 | use_vis_query: bool, 12 | use_residual: bool, 13 | residual_vision: bool, 14 | embed_dim: int = None, 15 | dropout: float = 0.1, 16 | ) -> None: 17 | """ 18 | Meant for fusion of two different modalities, x1 being language embeddings and 19 | x2 being visual embeddings. 20 | 21 | Args: 22 | x1_dim: Dimension of the first input (language) 23 | x2_dim: Dimension of the second input (visual) 24 | num_heads: Number of heads for the multihead attention 25 | use_vis_query: Whether to use visual encoding as the query and value 26 | use_residual: Whether to use the residual connection 27 | embed_dim: Dimension of the embedding space 28 | dropout: Dropout rate for the multihead attention 29 | """ 30 | super(CrossAttention, self).__init__() 31 | 32 | embed_dim = embed_dim or num_heads * 64 33 | 34 | if x1_dim == x2_dim == embed_dim: 35 | self.proj1 = nn.Identity() 36 | self.proj2 = nn.Identity() 37 | else: 38 | # Linear layers to project x1 and x2 into embedding space 39 | self.proj1 = nn.Linear(x1_dim, embed_dim) 40 | self.proj2 = nn.Linear(x2_dim, embed_dim) 41 | 42 | # Initialize with Xavier initialization 43 | nn.init.xavier_uniform_(self.proj1.weight) 44 | nn.init.xavier_uniform_(self.proj2.weight) 45 | nn.init.zeros_(self.proj1.bias) 46 | nn.init.zeros_(self.proj2.bias) 47 | 48 | self.multihead_attn = nn.MultiheadAttention( 49 | embed_dim=embed_dim, num_heads=num_heads, dropout=dropout 50 | ) 51 | 52 | # Initialize weights and biases in MultiheadAttention 53 | for name, param in self.multihead_attn.named_parameters(): 54 | if "weight" in name: 55 | nn.init.xavier_uniform_(param) 56 | elif "bias" in name: 57 | nn.init.zeros_(param) 58 | 59 | self.norm = nn.LayerNorm(embed_dim) 60 | self.use_vis_query = use_vis_query 61 | self.use_residual = use_residual 62 | self.residual_vision = residual_vision 63 | self.output_size = embed_dim 64 | 65 | def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: 66 | """ 67 | Args: 68 | x1: [batch_size, x1_dim] tensor (language) 69 | x2: [batch_size, x2_dim] tensor (visual) 70 | 71 | Returns: 72 | [batch_size, embed_dim] tensor 73 | """ 74 | 75 | # Project x1 and x2 into the embedding space 76 | x1_proj = self.proj1(x1) 77 | x2_proj = self.proj2(x2) 78 | 79 | # Reshape the tensors to be [seq_len, batch_size, embed_dim], where seq_len is 1 80 | x1_proj = x1_proj.unsqueeze(0) 81 | x2_proj = x2_proj.unsqueeze(0) 82 | 83 | # Perform the cross-attention 
calculation based on use_vis_query 84 | if self.use_vis_query: 85 | query = x2_proj 86 | key = x1_proj 87 | value = x1_proj 88 | else: 89 | query = x1_proj 90 | key = x2_proj 91 | value = x2_proj 92 | 93 | # output: [1, batch_size, embed_dim] 94 | output, _ = self.multihead_attn(query=query, key=key, value=value) 95 | 96 | if self.use_residual: 97 | # Add residual connection 98 | if self.residual_vision: 99 | output += x2_proj 100 | else: 101 | output += query 102 | 103 | # Apply Layer Normalization 104 | # output: [1, batch_size, embed_dim] 105 | output = self.norm(output) 106 | 107 | # Squeeze the sequence length dimension 108 | # output: [batch_size, embed_dim] 109 | output = output.squeeze(0) 110 | 111 | return output 112 | -------------------------------------------------------------------------------- /ovon/measurements/imagenav.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Any 2 | 3 | import numpy as np 4 | import quaternion 5 | from habitat.core.embodied_task import EmbodiedTask, Measure 6 | from habitat.core.registry import registry 7 | from habitat.core.simulator import Simulator 8 | from habitat.tasks.nav.nav import NavigationEpisode, Success 9 | from habitat.utils.geometry_utils import ( 10 | angle_between_quaternions, 11 | quaternion_from_coeff, 12 | ) 13 | 14 | if TYPE_CHECKING: 15 | from omegaconf import DictConfig 16 | 17 | 18 | @registry.register_measure 19 | class AngleToGoal(Measure): 20 | """The measure calculates an angle towards the goal. Note: this measure is 21 | only valid for single goal tasks (e.g., ImageNav) 22 | """ 23 | 24 | cls_uuid: str = "angle_to_goal" 25 | 26 | def __init__(self, sim: Simulator, *args: Any, **kwargs: Any): 27 | super().__init__() 28 | self._sim = sim 29 | 30 | def _get_uuid(self, *args: Any, **kwargs: Any) -> str: 31 | return self.cls_uuid 32 | 33 | def reset_metric(self, episode, *args: Any, **kwargs: Any): 34 | self._metric = None 35 | self.update_metric(episode=episode, *args, **kwargs) # type: ignore 36 | 37 | def update_metric(self, episode: NavigationEpisode, *args: Any, **kwargs: Any): 38 | current_angle = self._sim.get_agent_state().rotation 39 | if not isinstance(current_angle, quaternion.quaternion): 40 | current_angle = quaternion_from_coeff(current_angle) 41 | 42 | goal_angle = episode.goals[0].rotation 43 | if not isinstance(goal_angle, quaternion.quaternion): 44 | goal_angle = quaternion_from_coeff(goal_angle) 45 | 46 | self._metric = angle_between_quaternions(current_angle, goal_angle) 47 | 48 | 49 | @registry.register_measure 50 | class AngleSuccess(Measure): 51 | """Weather or not the agent is within an angle tolerance.""" 52 | 53 | cls_uuid: str = "angle_success" 54 | 55 | def __init__(self, config: "DictConfig", *args: Any, **kwargs: Any): 56 | self._config = config 57 | 58 | super().__init__() 59 | 60 | def _get_uuid(self, *args: Any, **kwargs: Any) -> str: 61 | return self.cls_uuid 62 | 63 | def reset_metric(self, task: EmbodiedTask, *args: Any, **kwargs: Any): 64 | task.measurements.check_measure_dependencies( 65 | self.uuid, [Success.cls_uuid, AngleToGoal.cls_uuid] 66 | ) 67 | self.update_metric(task=task, *args, **kwargs) # type: ignore 68 | 69 | def update_metric(self, task: EmbodiedTask, *args: Any, **kwargs: Any): 70 | success = task.measurements.measures[Success.cls_uuid].get_metric() 71 | angle_to_goal = task.measurements.measures[AngleToGoal.cls_uuid].get_metric() 72 | 73 | if success and np.rad2deg(angle_to_goal) < 
self._config.success_angle: 74 | self._metric = 1.0 75 | else: 76 | self._metric = 0.0 77 | 78 | 79 | @registry.register_measure 80 | class AgentPosition(Measure): 81 | """The measure calculates current position of agent""" 82 | 83 | cls_uuid: str = "agent_position" 84 | 85 | def __init__(self, sim: Simulator, *args: Any, **kwargs: Any): 86 | super().__init__() 87 | self._sim = sim 88 | 89 | def _get_uuid(self, *args: Any, **kwargs: Any) -> str: 90 | return self.cls_uuid 91 | 92 | def reset_metric(self, *args: Any, **kwargs: Any): 93 | self._metric = None 94 | self.update_metric(*args, **kwargs) # type: ignore 95 | 96 | def update_metric(self, *args: Any, **kwargs: Any): 97 | self._metric = self._sim.get_agent_state().position 98 | 99 | 100 | @registry.register_measure 101 | class AgentRotation(Measure): 102 | """The measure calculates current position of agent""" 103 | 104 | cls_uuid: str = "agent_rotation" 105 | 106 | def __init__(self, sim: Simulator, *args: Any, **kwargs: Any): 107 | super().__init__() 108 | self._sim = sim 109 | 110 | def _get_uuid(self, *args: Any, **kwargs: Any) -> str: 111 | return self.cls_uuid 112 | 113 | def reset_metric(self, *args: Any, **kwargs: Any): 114 | self._metric = None 115 | self.update_metric(*args, **kwargs) # type: ignore 116 | 117 | def update_metric(self, *args: Any, **kwargs: Any): 118 | self._metric = self._sim.get_agent_state().rotation 119 | -------------------------------------------------------------------------------- /ovon/utils/utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import gzip 3 | import json 4 | import os 5 | import pickle 6 | from collections import defaultdict 7 | 8 | import numpy as np 9 | import torch 10 | from habitat.utils.visualizations import maps 11 | from PIL import Image 12 | from tqdm import tqdm 13 | 14 | from ovon.models.encoders.resnet_gn import ResNet 15 | 16 | 17 | def write_json(data, path): 18 | with open(path, "w") as file: 19 | file.write(json.dumps(data)) 20 | 21 | 22 | def load_json(path): 23 | file = open(path, "r") 24 | data = json.loads(file.read()) 25 | return data 26 | 27 | 28 | def write_txt(data, path): 29 | with open(path, "w") as file: 30 | file.write("\n".join(data)) 31 | 32 | 33 | def save_image(img, file_name): 34 | im = Image.fromarray(img) 35 | im.save(file_name) 36 | 37 | 38 | def load_dataset(path): 39 | with gzip.open(path, "rt") as file: 40 | data = json.loads(file.read(), encoding="utf-8") 41 | return data 42 | 43 | 44 | def save_pickle(data, path): 45 | file = open(path, "wb") 46 | data = pickle.dump(data, file) 47 | 48 | 49 | def load_pickle(path): 50 | file = open(path, "rb") 51 | data = pickle.load(file) 52 | return data 53 | 54 | 55 | def write_dataset(data, path): 56 | with gzip.open(path, "wt") as f: 57 | json.dump(data, f) 58 | 59 | 60 | def is_on_same_floor(height, ref_floor_height, ceiling_height=0.5): 61 | return ( 62 | (ref_floor_height - ceiling_height) 63 | <= height 64 | < (ref_floor_height + ceiling_height) 65 | ) 66 | 67 | 68 | def draw_point(sim, top_down_map, position, point_type, point_padding=2): 69 | t_x, t_y = maps.to_grid( 70 | position[2], 71 | position[0], 72 | (top_down_map.shape[0], top_down_map.shape[1]), 73 | sim=sim, 74 | ) 75 | top_down_map[ 76 | t_x - point_padding : t_x + point_padding + 1, 77 | t_y - point_padding : t_y + point_padding + 1, 78 | ] = point_type 79 | return top_down_map 80 | 81 | 82 | def draw_bounding_box( 83 | sim, top_down_map, goal_object_id, ref_floor_height, 
line_thickness=4 84 | ): 85 | sem_scene = sim.semantic_annotations() 86 | object_id = goal_object_id 87 | 88 | sem_obj = None 89 | for object in sem_scene.objects: 90 | if object.id == object_id: 91 | sem_obj = object 92 | break 93 | 94 | center = sem_obj.aabb.center 95 | x_len, _, z_len = sem_obj.aabb.sizes / 2.0 96 | # Nodes to draw rectangle 97 | corners = [ 98 | center + np.array([x, 0, z]) 99 | for x, z in [ 100 | (-x_len, -z_len), 101 | (-x_len, z_len), 102 | (x_len, z_len), 103 | (x_len, -z_len), 104 | (-x_len, -z_len), 105 | ] 106 | if is_on_same_floor(center[1], ref_floor_height=ref_floor_height) 107 | ] 108 | 109 | map_corners = [ 110 | maps.to_grid( 111 | p[2], 112 | p[0], 113 | ( 114 | top_down_map.shape[0], 115 | top_down_map.shape[1], 116 | ), 117 | sim=sim, 118 | ) 119 | for p in corners 120 | ] 121 | 122 | maps.draw_path( 123 | top_down_map, 124 | map_corners, 125 | maps.MAP_TARGET_BOUNDING_BOX, 126 | line_thickness, 127 | ) 128 | return top_down_map 129 | 130 | 131 | def load_encoder(encoder, path): 132 | assert os.path.exists(path) 133 | if isinstance(encoder.backbone, ResNet): 134 | state_dict = torch.load(path, map_location="cpu")["teacher"] 135 | state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} 136 | return encoder.load_state_dict(state_dict=state_dict, strict=False) 137 | else: 138 | raise ValueError("unknown encoder backbone") 139 | 140 | 141 | def count_episodes(path): 142 | files = glob.glob(os.path.join(path, "*.json.gz")) 143 | count = 0 144 | categories = defaultdict(int) 145 | for f in tqdm(files): 146 | dataset = load_dataset(f) 147 | for episode in dataset["episodes"]: 148 | categories[episode["object_category"]] += 1 149 | count += len(dataset["episodes"]) 150 | print("Total episodes: {}".format(count)) 151 | print("Categories: {}".format(categories)) 152 | print("Total categories: {}".format(len(categories))) 153 | return count, categories 154 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |
3 |
12 | Project Website 13 |
14 | 15 | 16 | ## Overview 17 | 18 | We present the Habitat-Matterport 3D Open Vocabulary Object Goal Navigation dataset (HM3D-OVON), a large-scale benchmark that broadens the scope and semantic range of prior Object Goal Navigation (ObjectNav) benchmarks. Leveraging the HM3DSem dataset, HM3D-OVON incorporates over 15k annotated instances of household objects across 379 distinct categories, derived from photo-realistic 3D scans of real-world environments. In contrast to earlier ObjectNav datasets, which limit goal objects to a predefined set of 6-21 categories, HM3D-OVON facilitates the training and evaluation of models with an open set of goals defined through free-form language at test time. Through this open-vocabulary formulation, HM3D-OVON encourages progress towards learning visuo-semantic navigation behaviors that are capable of searching for any object specified by text in an open-vocabulary manner. Additionally, we systematically evaluate and compare several different types of approaches on HM3D-OVON. We find that HM3D-OVON can be used to train an open-vocabulary ObjectNav agent that achieves higher performance and is more robust to localization and actuation noise than the state-of-the-art ObjectNav approach. We hope that our benchmark and baseline results will drive interest in developing embodied agents that can navigate real-world spaces to find household objects specified through free-form language, taking a step towards more flexible and human-like semantic visual navigation. Videos available at: naoki.io/ovon. 19 | 20 | ## :hammer_and_wrench: Installation 21 | 22 | ### Getting Started 23 | 24 | Create the conda environment and install all of the dependencies. Mamba is recommended for faster installation: 25 | ```bash 26 | conda_env_name=ovon 27 | mamba create -n $conda_env_name python=3.7 cmake=3.14.0 -y 28 | mamba install -n $conda_env_name \ 29 | habitat-sim=0.2.3 headless pytorch=1.12.1 cudatoolkit=11.3 \ 30 | -c pytorch -c nvidia -c conda-forge -c aihabitat -y 31 | 32 | # Install this repo as a package 33 | mamba activate $conda_env_name 34 | pip install -e . 35 | 36 | # Install frontier_exploration 37 | cd frontier_exploration && pip install -e . && cd .. 38 | 39 | # Install habitat-lab 40 | git clone --branch v0.2.3 git@github.com:facebookresearch/habitat-lab.git 41 | cd habitat-lab 42 | pip install -e habitat-lab 43 | pip install -e habitat-baselines 44 | 45 | pip install ftfy regex tqdm GPUtil trimesh seaborn timm scikit-learn einops transformers 46 | pip install git+https://github.com/openai/CLIP.git 47 | ``` 48 | ## :dart: Downloading the datasets 49 | First, set the following variables during installation (they don't need to go in your .bashrc): 50 | ```bash 51 | MATTERPORT_TOKEN_ID=