├── .gitignore ├── LICENSE ├── README.md ├── conda_env.yml ├── diffusion_reward ├── configs │ ├── models │ │ ├── codec_models │ │ │ └── vqgan │ │ │ │ ├── dataset │ │ │ │ ├── adroit.yaml │ │ │ │ └── metaworld.yaml │ │ │ │ └── default.yaml │ │ └── video_models │ │ │ ├── videogpt │ │ │ ├── dataset │ │ │ │ ├── adroit.yaml │ │ │ │ └── metaworld.yaml │ │ │ └── default.yaml │ │ │ └── vqdiffusion │ │ │ ├── dataset │ │ │ ├── adroit.yaml │ │ │ └── metaworld.yaml │ │ │ └── default.yaml │ └── rl │ │ ├── agent │ │ └── drqv2.yaml │ │ ├── default.yaml │ │ ├── reward │ │ ├── amp.yaml │ │ ├── diffusion_reward.yaml │ │ ├── expl_reward │ │ │ └── rnd.yaml │ │ ├── rnd.yaml │ │ └── viper.yaml │ │ └── task │ │ ├── assembly-v2.yaml │ │ ├── door-v0.yaml │ │ ├── hammer-v0.yaml │ │ └── pen-v0.yaml ├── envs │ ├── __init__.py │ ├── adroit.py │ ├── metaworld.py │ └── wrapper.py ├── models │ ├── codec_models │ │ └── vqgan │ │ │ ├── README.md │ │ │ ├── codebook.py │ │ │ ├── decoder.py │ │ │ ├── discriminator.py │ │ │ ├── encoder.py │ │ │ ├── helper.py │ │ │ ├── lpips.py │ │ │ ├── utils.py │ │ │ └── vqgan.py │ ├── reward_models │ │ ├── __init__.py │ │ ├── amp.py │ │ ├── diffusion_reward.py │ │ ├── expl_rewards │ │ │ └── rnd.py │ │ ├── rnd.py │ │ ├── statistics │ │ │ ├── diffusion_reward │ │ │ │ ├── entropy │ │ │ │ │ ├── adroit.yaml │ │ │ │ │ └── metaworld.yaml │ │ │ │ └── likelihood │ │ │ │ │ ├── adroit.yaml │ │ │ │ │ └── metaworld.yaml │ │ │ └── viper │ │ │ │ ├── entropy │ │ │ │ ├── adroit.yaml │ │ │ │ └── metaworld.yaml │ │ │ │ └── likelihood │ │ │ │ ├── adroit.yaml │ │ │ │ └── metaworld.yaml │ │ └── viper.py │ └── video_models │ │ ├── videogpt │ │ ├── README.md │ │ ├── helper.py │ │ ├── mingpt.py │ │ ├── transformer.py │ │ └── utils.py │ │ └── vqdiffusion │ │ ├── README.md │ │ ├── data │ │ ├── build.py │ │ └── dataset.py │ │ ├── distributed │ │ ├── distributed.py │ │ └── launch.py │ │ ├── engine │ │ ├── clip_grad_norm.py │ │ ├── ema.py │ │ ├── logger.py │ │ ├── lr_scheduler.py │ │ └── solver.py │ │ ├── modeling │ │ ├── build.py │ │ ├── codecs │ │ │ ├── base_codec.py │ │ │ └── image_codec │ │ │ │ └── vqgan.py │ │ ├── embeddings │ │ │ ├── base_embedding.py │ │ │ ├── dalle_mask_image_embedding.py │ │ │ └── frame_embedding.py │ │ ├── models │ │ │ └── frame_conditional_dalle.py │ │ ├── transformers │ │ │ ├── diffusion_transformer.py │ │ │ └── transformer_utils.py │ │ └── utils │ │ │ └── misc.py │ │ └── utils │ │ ├── io.py │ │ └── misc.py └── rl │ └── drqv2 │ ├── agent.py │ ├── logger.py │ ├── replay_buffer.py │ ├── utils.py │ └── video.py ├── docs └── diffusion_reward_overview.png ├── env_dependencies ├── mj_envs │ ├── LICENSE │ ├── README.md │ ├── dependencies │ │ └── Adroit │ │ │ ├── Adroit_hand.xml │ │ │ ├── Adroit_hand_withOverlay.xml │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── gallery │ │ │ ├── news.JPG │ │ │ └── projects.JPG │ │ │ └── resources │ │ │ ├── assets.xml │ │ │ ├── chain.xml │ │ │ ├── chain1.xml │ │ │ ├── joint_position_actuation.xml │ │ │ ├── meshes │ │ │ ├── F1.stl │ │ │ ├── F2.stl │ │ │ ├── F3.stl │ │ │ ├── TH1_z.stl │ │ │ ├── TH2_z.stl │ │ │ ├── TH3_z.stl │ │ │ ├── arm_base.stl │ │ │ ├── arm_trunk.stl │ │ │ ├── arm_trunk_asmbly.stl │ │ │ ├── distal_ellipsoid.stl │ │ │ ├── elbow_flex.stl │ │ │ ├── elbow_rotate_motor.stl │ │ │ ├── elbow_rotate_muscle.stl │ │ │ ├── forearm_Cy_PlateAsmbly(muscle_cone).stl │ │ │ ├── forearm_Cy_PlateAsmbly.stl │ │ │ ├── forearm_PlateAsmbly.stl │ │ │ ├── forearm_electric.stl │ │ │ ├── forearm_electric_cvx.stl │ │ │ ├── forearm_muscle.stl │ │ │ ├── forearm_simple.stl │ │ │ 
├── forearm_simple_cvx.stl │ │ │ ├── forearm_weight.stl │ │ │ ├── knuckle.stl │ │ │ ├── lfmetacarpal.stl │ │ │ ├── palm.stl │ │ │ ├── upper_arm.stl │ │ │ ├── upper_arm_asmbl_shoulder.stl │ │ │ ├── upper_arm_ass.stl │ │ │ └── wrist.stl │ │ │ ├── tendon_torque_actuation.xml │ │ │ └── textures │ │ │ ├── darkwood.png │ │ │ ├── dice.png │ │ │ ├── foil.png │ │ │ ├── marble.png │ │ │ ├── silverRaw.png │ │ │ ├── skin.png │ │ │ ├── square.png │ │ │ ├── wood.png │ │ │ └── woodb.png │ ├── mj_envs │ │ ├── __init__.py │ │ ├── hand_manipulation_suite │ │ │ ├── __init__.py │ │ │ ├── assets │ │ │ │ ├── DAPG_Adroit.xml │ │ │ │ ├── DAPG_assets.xml │ │ │ │ ├── DAPG_door.xml │ │ │ │ ├── DAPG_hammer.xml │ │ │ │ ├── DAPG_pen.xml │ │ │ │ ├── DAPG_relocate.xml │ │ │ │ └── tasks.jpg │ │ │ ├── door_v0.py │ │ │ ├── hammer_v0.py │ │ │ ├── pen_v0.py │ │ │ └── relocate_v0.py │ │ └── utils │ │ │ ├── quatmath.py │ │ │ └── visualize_env.py │ └── setup.py └── mjrl │ ├── LICENSE │ ├── README.md │ ├── examples │ ├── README.md │ ├── behavior_clone.py │ ├── example_configs │ │ ├── hopper_npg.txt │ │ ├── swimmer_npg.txt │ │ └── swimmer_ppo.txt │ ├── linear_nn_comparison.py │ └── policy_opt_job_script.py │ ├── mjrl │ ├── __init__.py │ ├── algos │ │ ├── __init__.py │ │ ├── batch_reinforce.py │ │ ├── behavior_cloning.py │ │ ├── dapg.py │ │ ├── mbac.py │ │ ├── model_accel │ │ │ ├── __init__.py │ │ │ ├── model_accel_npg.py │ │ │ ├── model_learning_mpc.py │ │ │ ├── nn_dynamics.py │ │ │ ├── run_experiments │ │ │ │ ├── configs │ │ │ │ │ ├── point_mass.txt │ │ │ │ │ └── reacher.txt │ │ │ │ ├── run_model_accel_npg.py │ │ │ │ ├── sandbox │ │ │ │ │ ├── example_config_mpc.txt │ │ │ │ │ └── run_model_learning_mpc.py │ │ │ │ └── utils │ │ │ │ │ ├── reward_functions │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── mjrl_point_mass.py │ │ │ │ │ ├── visualize_policy.py │ │ │ │ │ └── visualize_trajectories.py │ │ │ └── sampling.py │ │ ├── npg_cg.py │ │ ├── ppo_clip.py │ │ └── trpo.py │ ├── baselines │ │ ├── __init__.py │ │ ├── linear_baseline.py │ │ ├── mlp_baseline.py │ │ ├── quadratic_baseline.py │ │ └── zero_baseline.py │ ├── envs │ │ ├── __init__.py │ │ ├── assets │ │ │ ├── peg_insertion.xml │ │ │ ├── point_mass.xml │ │ │ ├── sawyer.xml │ │ │ └── swimmer.xml │ │ ├── mujoco_env.py │ │ ├── peg_insertion_sawyer.py │ │ ├── point_mass.py │ │ ├── reacher_sawyer.py │ │ └── swimmer.py │ ├── policies │ │ ├── __init__.py │ │ ├── gaussian_linear.py │ │ ├── gaussian_mlp.py │ │ └── mpc_actor.py │ ├── samplers │ │ ├── __init__.py │ │ └── core.py │ └── utils │ │ ├── __init__.py │ │ ├── cg_solve.py │ │ ├── fc_network.py │ │ ├── get_environment.py │ │ ├── gym_env.py │ │ ├── logger.py │ │ ├── make_train_plots.py │ │ ├── optimize_model.py │ │ ├── plot_from_logs.py │ │ ├── process_samples.py │ │ ├── tensor_utils.py │ │ ├── train_agent.py │ │ └── visualize_policy.py │ ├── setup.py │ ├── setup │ ├── README.md │ └── env.yml │ └── tests │ ├── hydra │ ├── config │ │ └── hydra_npg_config.yaml │ └── hydra_policy_opt_job_script.py │ ├── point_mass_test.py │ └── visualizer_test.py ├── scripts ├── run │ ├── codec_model │ │ ├── vqgan_adroit.sh │ │ └── vqgan_metaworld.sh │ ├── rl │ │ ├── drqv2_adroit_amp.sh │ │ ├── drqv2_adroit_diffusion_reward.sh │ │ ├── drqv2_adroit_raw_sparse_reward.sh │ │ ├── drqv2_adroit_rnd.sh │ │ ├── drqv2_adroit_viper.sh │ │ ├── drqv2_adroit_viper_std.sh │ │ ├── drqv2_metaworld_amp.sh │ │ ├── drqv2_metaworld_diffusion_reward.sh │ │ ├── drqv2_metaworld_raw_sparse_reward.sh │ │ ├── drqv2_metaworld_rnd.sh │ │ ├── drqv2_metaworld_viper.sh │ │ └── 
drqv2_metaworld_viper_std.sh │ └── video_model │ │ ├── videogpt_adroit.sh │ │ ├── videogpt_metaworld.sh │ │ ├── vqdiffusion_adroit.sh │ │ └── vqdiffusion_metaworld.sh ├── train_drqv2.py ├── train_videogpt.py ├── train_vqdiffusion.py └── train_vqgan.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # dataset and exp 156 | *video_dataset* 157 | *exp_local* 158 | *mujoco-py* 159 | *diffusion-reward-mw* 160 | *Metaworld* 161 | 162 | # PyCharm 163 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 164 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 165 | # and can be added to the global gitignore or merged into this file. For a more nuclear 166 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 167 | #.idea/ 168 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Tao Huang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /conda_env.yml: -------------------------------------------------------------------------------- 1 | name: diffusion_reward 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.8 6 | - pip=21.1.3 7 | - numpy=1.19.2 8 | - absl-py=0.13.0 9 | - pyparsing=2.4.7 10 | - jupyterlab=3.0.14 11 | - scikit-image=0.18.1 12 | - pip: 13 | - termcolor==1.1.0 14 | - imageio==2.9.0 15 | - imageio-ffmpeg==0.4.4 16 | - pandas==1.3.0 17 | - ipdb==0.13.9 18 | - yapf==0.31.0 19 | - sklearn==0.0 20 | - matplotlib==3.4.2 21 | - opencv-python==4.5.3.56 22 | - wandb==0.15.4 23 | - hydra-core==1.1.0 24 | - hydra-submitit-launcher==1.1.5 25 | - gym==0.21.0 26 | - setuptools==63.2.0 27 | - tb-nightly 28 | - tqdm 29 | - einops 30 | - dm_env 31 | -------------------------------------------------------------------------------- /diffusion_reward/configs/models/codec_models/vqgan/dataset/adroit.yaml: -------------------------------------------------------------------------------- 1 | domain: adroit 2 | latent_size: 8 -------------------------------------------------------------------------------- /diffusion_reward/configs/models/codec_models/vqgan/dataset/metaworld.yaml: -------------------------------------------------------------------------------- 1 | domain: metaworld 2 | latent_size: 8 -------------------------------------------------------------------------------- /diffusion_reward/configs/models/codec_models/vqgan/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - dataset@_global_: adroit 4 | 5 | # File path 6 | cwd: ${hydra:runtime.output_dir} 7 | 8 | # Env 9 | domain: adroit 10 | multi_task: true 11 | 12 | # Model 13 | latent_dim: 64 14 | code_dim: 64 15 | num_codebook_vectors: 1024 16 | channels: [128, 128, 256, 256] 17 | resolution: ${image_size} 18 | latent_size: 8 19 | 20 | # Training 21 | image_size: 64 22 | image_channels: 3 23 | beta: 0.25 24 | device: cuda 25 | batch_size: 32 26 | epochs: 200 27 | learning_rate: 1e-4 28 | beta1: 0.5 29 | beta2: 0.9 30 | disc_start: 1000 31 | disc_factor: 0.1 32 | rec_loss_factor: 1 33 | perceptual_loss_factor: 0.1 34 | dataset_path: /video_dataset/${domain} 35 | 36 | 37 | # Working space 38 | hydra: 39 | run: 40 | dir: ./exp_local/codec_models/vqgan/${domain} -------------------------------------------------------------------------------- /diffusion_reward/configs/models/video_models/videogpt/dataset/adroit.yaml: -------------------------------------------------------------------------------- 1 | domain: adroit 2 | latent_size: 8 3 | epochs: 5000 -------------------------------------------------------------------------------- /diffusion_reward/configs/models/video_models/videogpt/dataset/metaworld.yaml: -------------------------------------------------------------------------------- 1 | domain: metaworld 2 | latent_size: 8 3 | epochs: 5000 -------------------------------------------------------------------------------- /diffusion_reward/configs/models/video_models/videogpt/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - dataset@_global_: adroit 4 | 5 | # env 6 | domain: adroit 7 | 8 | # model 9 | codec: 10 | latent_dim: 64 11 | code_dim: 64 12 | channels: [128, 128, 256, 256] 13 | resolution: ${image_size} 14 | latent_size: 8 15 | checkpoint_path: /exp_local/codec_models/vqgan/${domain}/results/checkpoints/vqgan.pt 16 | 
image_channels: 3 17 | beta: 0.25 18 | device: ${device} 19 | num_codebook_vectors: 1024 20 | 21 | transformer: 22 | vocab_size: ${codec.num_codebook_vectors} 23 | block_size: 5120 24 | n_layer: 8 25 | n_head: 8 26 | n_embd: 512 27 | use_vqemb: ${use_vqemb} 28 | code_dim: ${codec.code_dim} 29 | 30 | pkeep: 0.8 31 | sos_token: 0 32 | num_frames: 2 33 | frame_skip: 1 34 | use_vqemb: true 35 | device: cuda 36 | 37 | # training 38 | image_size: 64 39 | batch_size: 32 40 | dataset_path: /video_dataset/${domain} 41 | 42 | # Working space 43 | hydra: 44 | run: 45 | dir: ./exp_local/video_models/videogpt/${domain} 46 | 47 | -------------------------------------------------------------------------------- /diffusion_reward/configs/models/video_models/vqdiffusion/dataset/adroit.yaml: -------------------------------------------------------------------------------- 1 | exp_name: adroit 2 | 3 | # change from o4 4 | model: 5 | target: diffusion_reward.models.video_models.vqdiffusion.modeling.models.frame_conditional_dalle.FC_DALLE 6 | params: 7 | content_info: {key: image} 8 | condition_info: {key: frame} 9 | frame_skip: 1 10 | content_codec_config: 11 | target: diffusion_reward.models.video_models.vqdiffusion.modeling.codecs.image_codec.vqgan.MiniVQGAN 12 | params: 13 | args: {latent_dim: 64, device: 'cuda', image_channels: 3, num_codebook_vectors: 1024, beta: 0.25, channels: [128, 128, 256, 256], resolution: 64, latent_size: 8} 14 | trainable: False 15 | token_shape: [8, 8] 16 | #config_path: 'OUTPUT/pretrained_model/taming_dvae/vqgan_imagenet_f16_16384.yaml' 17 | ckpt_path: /exp_local/codec_models/vqgan/adroit/results/checkpoints/vqgan.pt 18 | # num_tokens: 16384 19 | # quantize_number: 974 20 | # mapping_path: './help_folder/statistics/taming_vqvae_974.pt' 21 | # return_logits: True 22 | diffusion_config: 23 | target: diffusion_reward.models.video_models.vqdiffusion.modeling.transformers.diffusion_transformer.DiffusionTransformer 24 | params: 25 | diffusion_step: 100 26 | alpha_init_type: 'alpha1' 27 | auxiliary_loss_weight: 1.0e-3 28 | adaptive_auxiliary_loss: True 29 | mask_weight: [1, 1] # the loss weight on mask region and non-mask region 30 | 31 | transformer_config: 32 | target: diffusion_reward.models.video_models.vqdiffusion.modeling.transformers.transformer_utils.Text2ImageTransformer 33 | params: 34 | diffusion_step: ??? 35 | content_emb_config: ??? 
36 | attn_type: 'selfcross' 37 | n_layer: 16 38 | condition_seq_len: 128 ###### 77 for clip and 256 for dalle 39 | content_seq_len: 64 # 32 x 32 40 | content_spatial_size: [8, 8] 41 | n_embd: 128 # the dim of embedding dims 42 | condition_dim: 1024 43 | n_head: 16 44 | attn_pdrop: 0.0 45 | resid_pdrop: 0.0 46 | block_activate: GELU2 47 | timestep_type: 'adalayernorm' # adainsnorm or adalayernorm and abs 48 | mlp_hidden_times: 2 49 | mlp_type: 'conv_mlp' 50 | condition_emb_config: 51 | target: diffusion_reward.models.video_models.vqdiffusion.modeling.embeddings.frame_embedding.FrameEmbedding 52 | params: 53 | num_embed: 1024 # 54 | embed_dim: 1024 55 | identity: false 56 | trainable: true 57 | num_cond_frames: 2 58 | content_emb_config: 59 | target: diffusion_reward.models.video_models.vqdiffusion.modeling.embeddings.dalle_mask_image_embedding.DalleMaskImageEmbedding 60 | params: 61 | num_embed: 1024 62 | spatial_size: [8, 8] 63 | embed_dim: 128 64 | trainable: True 65 | pos_emb_type: embedding 66 | 67 | dataloader: 68 | data_root: "/video_dataset/adroit/" 69 | batch_size: 4 70 | num_workers: 4 71 | train_datasets: # a list of configures, so we can combine several schedulers 72 | - target: diffusion_reward.models.video_models.vqdiffusion.data.dataset.VideoDataset 73 | params: 74 | data_root: ${dataloader.data_root} 75 | phase: train 76 | frame_skip: ${model.params.frame_skip} 77 | frames_per_sample: 3 78 | 79 | validation_datasets: 80 | - target: diffusion_reward.models.video_models.vqdiffusion.data.dataset.VideoDataset 81 | params: 82 | data_root: ${dataloader.data_root} 83 | phase: test 84 | frame_skip: ${model.params.frame_skip} 85 | frames_per_sample: 3 -------------------------------------------------------------------------------- /diffusion_reward/configs/models/video_models/vqdiffusion/dataset/metaworld.yaml: -------------------------------------------------------------------------------- 1 | exp_name: metaworld 2 | 3 | # change from o4 4 | model: 5 | target: diffusion_reward.models.video_models.vqdiffusion.modeling.models.frame_conditional_dalle.FC_DALLE 6 | params: 7 | content_info: {key: image} 8 | condition_info: {key: frame} 9 | frame_skip: 1 10 | content_codec_config: 11 | target: diffusion_reward.models.video_models.vqdiffusion.modeling.codecs.image_codec.vqgan.MiniVQGAN 12 | params: 13 | args: {latent_dim: 64, device: 'cuda', image_channels: 3, num_codebook_vectors: 1024, beta: 0.25, channels: [128, 128, 256, 256], resolution: 64, latent_size: 8} 14 | trainable: False 15 | token_shape: [8, 8] 16 | #config_path: 'OUTPUT/pretrained_model/taming_dvae/vqgan_imagenet_f16_16384.yaml' 17 | ckpt_path: /exp_local/codec_models/vqgan/metaworld/results/checkpoints/vqgan.pt 18 | # num_tokens: 16384 19 | # quantize_number: 974 20 | # mapping_path: './help_folder/statistics/taming_vqvae_974.pt' 21 | # return_logits: True 22 | diffusion_config: 23 | target: diffusion_reward.models.video_models.vqdiffusion.modeling.transformers.diffusion_transformer.DiffusionTransformer 24 | params: 25 | diffusion_step: 100 26 | alpha_init_type: 'alpha1' 27 | auxiliary_loss_weight: 1.0e-3 28 | adaptive_auxiliary_loss: True 29 | mask_weight: [1, 1] # the loss weight on mask region and non-mask region 30 | 31 | transformer_config: 32 | target: diffusion_reward.models.video_models.vqdiffusion.modeling.transformers.transformer_utils.Text2ImageTransformer 33 | params: 34 | diffusion_step: ??? 35 | content_emb_config: ??? 
36 | attn_type: 'selfcross' 37 | n_layer: 16 38 | condition_seq_len: 128 ###### 77 for clip and 256 for dalle 39 | content_seq_len: 64 # 32 x 32 40 | content_spatial_size: [8, 8] 41 | n_embd: 128 # the dim of embedding dims 42 | condition_dim: 1024 43 | n_head: 16 44 | attn_pdrop: 0.0 45 | resid_pdrop: 0.0 46 | block_activate: GELU2 47 | timestep_type: 'adalayernorm' # adainsnorm or adalayernorm and abs 48 | mlp_hidden_times: 2 49 | mlp_type: 'conv_mlp' 50 | condition_emb_config: 51 | target: diffusion_reward.models.video_models.vqdiffusion.modeling.embeddings.frame_embedding.FrameEmbedding 52 | params: 53 | num_embed: 1024 # 54 | embed_dim: 1024 55 | identity: false 56 | trainable: true 57 | num_cond_frames: 2 58 | content_emb_config: 59 | target: diffusion_reward.models.video_models.vqdiffusion.modeling.embeddings.dalle_mask_image_embedding.DalleMaskImageEmbedding 60 | params: 61 | num_embed: 1024 62 | spatial_size: [8, 8] 63 | embed_dim: 128 64 | trainable: True 65 | pos_emb_type: embedding 66 | 67 | dataloader: 68 | data_root: "/video_dataset/metaworld/" 69 | batch_size: 4 70 | num_workers: 4 71 | train_datasets: # a list of configures, so we can combine several schedulers 72 | - target: diffusion_reward.models.video_models.vqdiffusion.data.dataset.VideoDataset 73 | params: 74 | data_root: ${dataloader.data_root} 75 | phase: train 76 | frame_skip: ${model.params.frame_skip} 77 | frames_per_sample: 3 78 | 79 | validation_datasets: 80 | - target: diffusion_reward.models.video_models.vqdiffusion.data.dataset.VideoDataset 81 | params: 82 | data_root: ${dataloader.data_root} 83 | phase: test 84 | frame_skip: ${model.params.frame_skip} 85 | frames_per_sample: 3 -------------------------------------------------------------------------------- /diffusion_reward/configs/models/video_models/vqdiffusion/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - dataset@_global_: adroit 4 | 5 | exp_name: null 6 | log_frequency: 100 7 | load_path: null 8 | resume_name: null 9 | auto_resume: false 10 | 11 | # ddp 12 | num_node: 1 13 | node_rank: null 14 | dist_usrl: null 15 | gpu: 0 16 | sync_bn: false 17 | tensorboard: false 18 | timestamp: false 19 | 20 | # Random 21 | seed: null 22 | cudnn_deterministic: false 23 | amp: false 24 | debug: false 25 | 26 | # Modify config 27 | opts: null 28 | 29 | # Solver 30 | solver: 31 | base_lr: 3.0e-6 32 | adjust_lr: none # not adjust lr according to total batch_size 33 | max_epochs: 100 34 | save_epochs: 20 35 | validation_epochs: 1 36 | sample_iterations: 400 # epoch #30000 37 | print_specific_things: True 38 | 39 | # config for ema 40 | ema: 41 | decay: 0.99 42 | update_interval: 25 43 | device: cpu 44 | 45 | clip_grad_norm: 46 | target: diffusion_reward.models.video_models.vqdiffusion.engine.clip_grad_norm.ClipGradNorm 47 | params: 48 | start_iteration: 0 49 | end_iteration: 5000 50 | max_norm: 0.5 51 | optimizers_and_schedulers: # a list of configures, so we can config several optimizers and schedulers 52 | - name: none # default is None 53 | optimizer: 54 | target: torch.optim.AdamW 55 | params: 56 | betas: [0.9, 0.96] 57 | weight_decay: 4.5e-2 58 | scheduler: 59 | step_iteration: 1 60 | target: diffusion_reward.models.video_models.vqdiffusion.engine.lr_scheduler.ReduceLROnPlateauWithWarmup 61 | params: 62 | factor: 0.5 63 | patience: 100000 64 | min_lr: 1.0e-6 65 | threshold: 1.0e-1 66 | threshold_mode: rel 67 | warmup_lr: 4.5e-4 # the lr to be touched after warmup 68 | warmup: 5000 69 | 
70 | save_dir: ??? 71 | dist_url: ??? 72 | ngpus_per_node: ??? 73 | world_size: ??? 74 | local_rank: ??? 75 | global_rank: ??? 76 | distributed: false 77 | 78 | hydra: 79 | run: 80 | dir: ./exp_local/video_models/vqdiffusion/${exp_name} -------------------------------------------------------------------------------- /diffusion_reward/configs/rl/agent/drqv2.yaml: -------------------------------------------------------------------------------- 1 | _target_: diffusion_reward.rl.drqv2.agent.DrQV2Agent 2 | obs_shape: ??? # to be specified later 3 | action_shape: ??? # to be specified later 4 | device: ${device} 5 | lr: ${lr} 6 | critic_target_tau: 0.01 7 | update_every_steps: 2 8 | use_tb: ${use_tb} 9 | num_expl_steps: 2000 10 | hidden_dim: 1024 11 | feature_dim: ${feature_dim} 12 | stddev_schedule: ${stddev_schedule} 13 | stddev_clip: 0.3 -------------------------------------------------------------------------------- /diffusion_reward/configs/rl/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - task@_global_: door 4 | - agent: drqv2 5 | - reward: diffusion_reward 6 | 7 | # task settings 8 | frame_stack: 1 9 | action_repeat: 2 10 | discount: 0.99 11 | 12 | # train settings 13 | num_seed_frames: 4000 14 | update_every_steps: 2 15 | 16 | # eval 17 | eval_every_frames: 20000 18 | num_eval_episodes: 10 19 | 20 | # snapshot 21 | save_snapshot: false 22 | 23 | # wandb 24 | use_wandb: false 25 | 26 | # replay buffer 27 | replay_buffer_size: 1000000 28 | replay_buffer_num_workers: 4 29 | nstep: 3 30 | batch_size: 256 31 | 32 | # misc 33 | seed: 121 34 | device: cuda 35 | save_video: true 36 | save_train_video: false 37 | use_tb: true 38 | 39 | # experiment 40 | exp_name: test 41 | 42 | # agent 43 | lr: 1e-4 44 | feature_dim: 50 45 | 46 | # reward model 47 | use_rm: true 48 | 49 | hydra: 50 | run: 51 | dir: ./exp_local/rl/${now:%Y.%m.%d}/${task_name}/${reward.rm_model}/${exp_name}_${seed} -------------------------------------------------------------------------------- /diffusion_reward/configs/rl/reward/amp.yaml: -------------------------------------------------------------------------------- 1 | rm_model: amp 2 | obs_shape: ??? 3 | action_shape: ??? 4 | device: ${device} 5 | batch_size: 32 6 | 7 | pretrain_rm: false 8 | disc_lr: 1e-4 9 | hidden_dim: 256 10 | input_dim: 64 11 | expl_scale: 1 12 | use_expl_reward: true 13 | expl_update_interval: 10 -------------------------------------------------------------------------------- /diffusion_reward/configs/rl/reward/diffusion_reward.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - expl_reward: rnd 4 | 5 | # task info 6 | task_name: ${task_name} 7 | obs_shape: ??? 8 | action_shape: ??? 
9 | 10 | # reward model 11 | rm_model: diffusion_reward 12 | video_model: vqdiffusion 13 | pretrain_rm: true 14 | ckpt_path: /exp_local/video_models/${reward.video_model}/${domain}/checkpoint/best.pth 15 | cfg_path: /exp_local/video_models/${reward.video_model}/${domain}/.hydra/config.yaml 16 | stat_path: /diffusion_reward/models/reward_models/statistics/${reward.rm_model}/${reward.reward_type}/${domain}.yaml 17 | 18 | reward_type: entropy 19 | skip_step: 9 20 | noise: true 21 | num_sample: 1 22 | noise_scale: 1 23 | use_std: true 24 | 25 | # exploration reward 26 | use_expl_reward: true 27 | expl_reward: ${expl_reward} 28 | expl_lr: ${lr} 29 | expl_scale: 0.95 30 | expl_std: true 31 | expl_update_interval: 100 32 | 33 | # environment reward 34 | use_env_reward: true 35 | -------------------------------------------------------------------------------- /diffusion_reward/configs/rl/reward/expl_reward/rnd.yaml: -------------------------------------------------------------------------------- 1 | _target_: diffusion_reward.models.reward_models.expl_rewards.rnd.RND 2 | obs_shape: ??? 3 | action_shape: ??? 4 | device: ${device} -------------------------------------------------------------------------------- /diffusion_reward/configs/rl/reward/rnd.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - expl_reward: rnd 4 | 5 | # task info 6 | task_name: ${task_name} 7 | obs_shape: ??? 8 | action_shape: ??? 9 | 10 | 11 | # reward model 12 | rm_model: rnd 13 | pretrain_rm: true # but none 14 | 15 | # exploration reward 16 | use_expl_reward: true 17 | expl_reward: ${expl_reward} 18 | expl_lr: ${lr} 19 | expl_scale: 1 20 | expl_update_interval: 100 21 | 22 | # environment reward 23 | use_env_reward: true -------------------------------------------------------------------------------- /diffusion_reward/configs/rl/reward/viper.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - expl_reward: rnd 4 | 5 | # task info 6 | task_name: ${task_name} 7 | obs_shape: ??? 8 | action_shape: ??? 
9 | 10 | 11 | # reward model 12 | rm_model: viper 13 | video_model: videogpt 14 | pretrain_rm: true 15 | ckpt_path: /exp_local/video_models/${reward.video_model}/${domain}/results/checkpoints/videogpt.pt 16 | cfg_path: /exp_local/video_models/${reward.video_model}/${domain}/.hydra/config.yaml 17 | stat_path: /diffusion_reward/models/reward_models/statistics/${reward.rm_model}/${reward.reward_type}/${domain}.yaml 18 | 19 | reward_type: likelihood 20 | use_std: true 21 | compute_joint: false 22 | 23 | # exploration reward 24 | use_expl_reward: true 25 | expl_reward: ${expl_reward} 26 | expl_lr: ${lr} 27 | expl_scale: 0.5 28 | expl_std: false 29 | expl_update_interval: 100 30 | 31 | # environment reward 32 | use_env_reward: true -------------------------------------------------------------------------------- /diffusion_reward/configs/rl/task/assembly-v2.yaml: -------------------------------------------------------------------------------- 1 | domain: metaworld 2 | task_name: assembly-v2 3 | action_repeat: 3 4 | num_train_frames: 3100000 5 | stddev_schedule: 'linear(1.0,0.1,3000000)' -------------------------------------------------------------------------------- /diffusion_reward/configs/rl/task/door-v0.yaml: -------------------------------------------------------------------------------- 1 | domain: adroit 2 | task_name: door-v0 3 | num_train_frames: 3100000 4 | stddev_schedule: 'linear(1.0,0.1,3000000)' 5 | -------------------------------------------------------------------------------- /diffusion_reward/configs/rl/task/hammer-v0.yaml: -------------------------------------------------------------------------------- 1 | domain: adroit 2 | task_name: hammer-v0 3 | num_train_frames: 3100000 4 | stddev_schedule: 'linear(1.0,0.1,3000000)' 5 | -------------------------------------------------------------------------------- /diffusion_reward/configs/rl/task/pen-v0.yaml: -------------------------------------------------------------------------------- 1 | domain: adroit 2 | task_name: pen-v0 3 | num_train_frames: 3100000 4 | stddev_schedule: 'linear(1.0,0.1,3000000)' -------------------------------------------------------------------------------- /diffusion_reward/envs/__init__.py: -------------------------------------------------------------------------------- 1 | import diffusion_reward.envs.adroit as adroit 2 | import diffusion_reward.envs.metaworld as metaworld 3 | import metaworld.envs.mujoco.env_dict as _mw_envs 4 | 5 | from .adroit import _mj_envs 6 | 7 | 8 | def make_env(name, frame_stack, action_repeat, seed): 9 | if name in _mj_envs: 10 | env = adroit.make(name, frame_stack, action_repeat, seed) 11 | elif name in _mw_envs.ALL_V2_ENVIRONMENTS.keys(): 12 | env = metaworld.make(name, frame_stack, action_repeat, seed) 13 | else: 14 | raise NotImplementedError 15 | return env -------------------------------------------------------------------------------- /diffusion_reward/envs/metaworld.py: -------------------------------------------------------------------------------- 1 | from metaworld import MT1 2 | 3 | from .wrapper import (ExtendedTimeStepWrapper, MetaWorldWrapper, 4 | TimeLimitWrapper) 5 | 6 | 7 | def mw_gym_make(task_name, task_id=0, seed=None): 8 | if seed is not None: 9 | mt1 = MT1(task_name, seed=seed) 10 | else: 11 | mt1 = MT1(task_name) # Construct the benchmark, sampling tasks 12 | env = mt1.train_classes[task_name](render_mode='rgb_array') 13 | 14 | if task_id is not None: 15 | env.set_task(mt1.train_tasks[task_id]) 16 | return env, mt1 17 | 18 | 19 | def make(name, frame_stack, 
action_repeat, seed, img_size=64, episode_length=100, task_id=0): # TODO change here or reset??? 20 | env, mt1 = mw_gym_make(name, task_id=task_id, seed=seed) 21 | 22 | env = MetaWorldWrapper(env, img_size, frame_stack, action_repeat, mt1=mt1) 23 | env = TimeLimitWrapper(env, max_episode_steps=episode_length) 24 | env = ExtendedTimeStepWrapper(env) 25 | #env = action_scale.Wrapper(env, minimum=-1.0, maximum=+1.0) 26 | 27 | return env 28 | 29 | -------------------------------------------------------------------------------- /diffusion_reward/models/codec_models/vqgan/README.md: -------------------------------------------------------------------------------- 1 | # VQGAN 2 | 3 | We implement VQGAN mainly based on this [repertory](https://github.com/dome272/VQGAN-pytorch). -------------------------------------------------------------------------------- /diffusion_reward/models/codec_models/vqgan/codebook.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class Codebook(nn.Module): 6 | def __init__(self, args): 7 | super(Codebook, self).__init__() 8 | self.num_codebook_vectors = args.num_codebook_vectors 9 | self.latent_dim = args.latent_dim 10 | self.beta = args.beta 11 | 12 | self.embedding = nn.Embedding(self.num_codebook_vectors, self.latent_dim) 13 | self.embedding.weight.data.uniform_(-1.0 / self.num_codebook_vectors, 1.0 / self.num_codebook_vectors) 14 | 15 | def forward(self, z): 16 | z = z.permute(0, 2, 3, 1).contiguous() 17 | z_flattened = z.view(-1, self.latent_dim) 18 | 19 | d = torch.sum(z_flattened**2, dim=1, keepdim=True) + \ 20 | torch.sum(self.embedding.weight**2, dim=1) - \ 21 | 2*(torch.matmul(z_flattened, self.embedding.weight.t())) 22 | 23 | min_encoding_indices = torch.argmin(d, dim=1) 24 | z_q = self.embedding(min_encoding_indices).view(z.shape) 25 | 26 | 27 | loss = torch.mean((z_q.detach() - z)**2) + self.beta * torch.mean((z_q - z.detach())**2) 28 | 29 | z_q = z + (z_q - z).detach() 30 | z_q = z_q.permute(0, 3, 1, 2) 31 | return z_q, min_encoding_indices, loss -------------------------------------------------------------------------------- /diffusion_reward/models/codec_models/vqgan/decoder.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | from .helper import (GroupNorm, NonLocalBlock, ResidualBlock, Swish, 4 | UpSampleBlock) 5 | 6 | 7 | class Decoder(nn.Module): 8 | def __init__(self, args): 9 | super(Decoder, self).__init__() 10 | #channels = [512, 256, 256, 128, 128] 11 | channels = args.channels[::-1] 12 | attn_resolutions = [16] 13 | num_res_blocks = 2 14 | resolution = args.latent_size 15 | 16 | in_channels = channels[0] 17 | layers = [nn.Conv2d(args.latent_dim, in_channels, 3, 1, 1), 18 | ResidualBlock(in_channels, in_channels), 19 | NonLocalBlock(in_channels), 20 | ResidualBlock(in_channels, in_channels)] 21 | 22 | for i in range(len(channels)): 23 | out_channels = channels[i] 24 | for j in range(num_res_blocks): 25 | layers.append(ResidualBlock(in_channels, out_channels)) 26 | in_channels = out_channels 27 | if resolution in attn_resolutions: 28 | layers.append(NonLocalBlock(in_channels)) 29 | #if i != 0 and resolution < 64: 30 | if resolution < args.resolution: 31 | layers.append(UpSampleBlock(in_channels)) 32 | resolution *= 2 33 | layers.append(GroupNorm(in_channels)) 34 | layers.append(Swish()) 35 | layers.append(nn.Conv2d(in_channels, args.image_channels, 3, 1, 1)) 36 | self.model = 
nn.Sequential(*layers) 37 | 38 | def forward(self, x): 39 | return self.model(x) -------------------------------------------------------------------------------- /diffusion_reward/models/codec_models/vqgan/discriminator.py: -------------------------------------------------------------------------------- 1 | """ 2 | PatchGAN Discriminator (https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py#L538) 3 | """ 4 | 5 | import torch.nn as nn 6 | 7 | 8 | class Discriminator(nn.Module): 9 | def __init__(self, args, num_filters_last=64, n_layers=3): 10 | super(Discriminator, self).__init__() 11 | 12 | layers = [nn.Conv2d(args.image_channels, num_filters_last, 4, 2, 1), nn.LeakyReLU(0.2)] 13 | num_filters_mult = 1 14 | 15 | for i in range(1, n_layers + 1): 16 | num_filters_mult_last = num_filters_mult 17 | num_filters_mult = min(2 ** i, 8) 18 | layers += [ 19 | nn.Conv2d(num_filters_last * num_filters_mult_last, num_filters_last * num_filters_mult, 4, 20 | 2 if i < n_layers else 1, 1, bias=False), 21 | nn.BatchNorm2d(num_filters_last * num_filters_mult), 22 | nn.LeakyReLU(0.2, True) 23 | ] 24 | 25 | layers.append(nn.Conv2d(num_filters_last * num_filters_mult, 1, 4, 1, 1)) 26 | self.model = nn.Sequential(*layers) 27 | 28 | def forward(self, x): 29 | return self.model(x) -------------------------------------------------------------------------------- /diffusion_reward/models/codec_models/vqgan/encoder.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | from .helper import (DownSampleBlock, GroupNorm, NonLocalBlock, ResidualBlock, 4 | Swish, UpSampleBlock) 5 | 6 | 7 | class Encoder(nn.Module): 8 | def __init__(self, args): 9 | super(Encoder, self).__init__() 10 | #channels = [128, 128, 128, 256, 256, 512] 11 | channels = args.channels 12 | attn_resolutions = [16] 13 | num_res_blocks = 2 14 | resolution = args.resolution 15 | latent_size = args.latent_size 16 | layers = [nn.Conv2d(args.image_channels, channels[0], 3, 1, 1)] 17 | for i in range(len(channels)-1): 18 | in_channels = channels[i] 19 | out_channels = channels[i + 1] 20 | for j in range(num_res_blocks): 21 | layers.append(ResidualBlock(in_channels, out_channels)) 22 | in_channels = out_channels 23 | if resolution in attn_resolutions: 24 | layers.append(NonLocalBlock(in_channels)) 25 | #if i != len(channels)-2 and resolution > latent_size: 26 | if resolution > latent_size: 27 | layers.append(DownSampleBlock(channels[i+1])) 28 | resolution //= 2 29 | layers.append(ResidualBlock(channels[-1], channels[-1])) 30 | layers.append(NonLocalBlock(channels[-1])) 31 | layers.append(ResidualBlock(channels[-1], channels[-1])) 32 | layers.append(GroupNorm(channels[-1])) 33 | layers.append(Swish()) 34 | layers.append(nn.Conv2d(channels[-1], args.latent_dim, 3, 1, 1)) 35 | self.model = nn.Sequential(*layers) 36 | 37 | def forward(self, x): 38 | return self.model(x) -------------------------------------------------------------------------------- /diffusion_reward/models/codec_models/vqgan/helper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class GroupNorm(nn.Module): 7 | def __init__(self, channels): 8 | super(GroupNorm, self).__init__() 9 | self.gn = nn.GroupNorm(num_groups=32, num_channels=channels, eps=1e-6, affine=True) 10 | 11 | def forward(self, x): 12 | return self.gn(x) 13 | 14 | 15 | class Swish(nn.Module): 16 | def 
forward(self, x): 17 | return x * torch.sigmoid(x) 18 | 19 | 20 | class ResidualBlock(nn.Module): 21 | def __init__(self, in_channels, out_channels): 22 | super(ResidualBlock, self).__init__() 23 | self.in_channels = in_channels 24 | self.out_channels = out_channels 25 | self.block = nn.Sequential( 26 | GroupNorm(in_channels), 27 | Swish(), 28 | nn.Conv2d(in_channels, out_channels, 3, 1, 1), 29 | GroupNorm(out_channels), 30 | Swish(), 31 | nn.Conv2d(out_channels, out_channels, 3, 1, 1) 32 | ) 33 | 34 | if in_channels != out_channels: 35 | self.channel_up = nn.Conv2d(in_channels, out_channels, 1, 1, 0) 36 | 37 | def forward(self, x): 38 | if self.in_channels != self.out_channels: 39 | return self.channel_up(x) + self.block(x) 40 | else: 41 | return x + self.block(x) 42 | 43 | 44 | class UpSampleBlock(nn.Module): 45 | def __init__(self, channels): 46 | super(UpSampleBlock, self).__init__() 47 | self.conv = nn.Conv2d(channels, channels, 3, 1, 1) 48 | 49 | def forward(self, x): 50 | x = F.interpolate(x, scale_factor=2.0) 51 | return self.conv(x) 52 | 53 | 54 | class DownSampleBlock(nn.Module): 55 | def __init__(self, channels): 56 | super(DownSampleBlock, self).__init__() 57 | self.conv = nn.Conv2d(channels, channels, 3, 2, 0) 58 | 59 | def forward(self, x): 60 | pad = (0, 1, 0, 1) 61 | x = F.pad(x, pad, mode="constant", value=0) 62 | return self.conv(x) 63 | 64 | 65 | class NonLocalBlock(nn.Module): 66 | def __init__(self, channels): 67 | super(NonLocalBlock, self).__init__() 68 | self.in_channels = channels 69 | 70 | self.gn = GroupNorm(channels) 71 | self.q = nn.Conv2d(channels, channels, 1, 1, 0) 72 | self.k = nn.Conv2d(channels, channels, 1, 1, 0) 73 | self.v = nn.Conv2d(channels, channels, 1, 1, 0) 74 | self.proj_out = nn.Conv2d(channels, channels, 1, 1, 0) 75 | 76 | def forward(self, x): 77 | h_ = self.gn(x) 78 | q = self.q(h_) 79 | k = self.k(h_) 80 | v = self.v(h_) 81 | 82 | b, c, h, w = q.shape 83 | 84 | q = q.reshape(b, c, h*w) 85 | q = q.permute(0, 2, 1) 86 | k = k.reshape(b, c, h*w) 87 | v = v.reshape(b, c, h*w) 88 | 89 | attn = torch.bmm(q, k) 90 | attn = attn * (int(c)**(-0.5)) 91 | attn = F.softmax(attn, dim=2) 92 | attn = attn.permute(0, 2, 1) 93 | 94 | A = torch.bmm(v, attn) 95 | A = A.reshape(b, c, h, w) 96 | 97 | return x + A -------------------------------------------------------------------------------- /diffusion_reward/models/codec_models/vqgan/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import torch.nn as nn 7 | from PIL import Image 8 | from torch.utils.data import DataLoader, Dataset 9 | 10 | # --------------------------------------------- # 11 | # Data Utils 12 | # --------------------------------------------- # 13 | 14 | class ImagePaths(Dataset): 15 | def __init__(self, path, size=None, is_train=True): 16 | self.size = size 17 | 18 | path = str(Path(__file__).parents[4]) + path 19 | self.images = [] 20 | for root, subdirs, files in os.walk(path): 21 | for name in files: 22 | if is_train and 'train' in root: 23 | self.images.append(os.path.join(root, name)) 24 | if not is_train and 'test' in root: 25 | self.images.append(os.path.join(root, name)) 26 | 27 | self._length = len(self.images) 28 | self.preprocessor = lambda x: x 29 | 30 | def __len__(self): 31 | return self._length 32 | 33 | def preprocess_image(self, image_path): 34 | image = Image.open(image_path) 35 | if not image.mode == "RGB": 36 | image = 
image.convert("RGB") 37 | 38 | image = np.array(image).astype(np.uint8) 39 | image = (image / 127.5 - 1.0).astype(np.float32) 40 | image = image.transpose(2, 0, 1) 41 | return image 42 | 43 | def __getitem__(self, i): 44 | example = self.preprocess_image(self.images[i]) 45 | return example 46 | 47 | 48 | def load_data(args): 49 | train_data = ImagePaths(args.dataset_path, size=args.image_size, is_train=True) 50 | eval_data = ImagePaths(args.dataset_path, size=args.image_size, is_train=False) 51 | train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True) 52 | eval_loader = DataLoader(eval_data, batch_size=args.batch_size, shuffle=True) 53 | return train_loader, eval_loader 54 | 55 | 56 | # --------------------------------------------- # 57 | # Module Utils 58 | # for Encoder, Decoder etc. 59 | # --------------------------------------------- # 60 | 61 | def weights_init(m): 62 | classname = m.__class__.__name__ 63 | if classname.find('Conv') != -1: 64 | nn.init.normal_(m.weight.data, 0.0, 0.02) 65 | elif classname.find('BatchNorm') != -1: 66 | nn.init.normal_(m.weight.data, 1.0, 0.02) 67 | nn.init.constant_(m.bias.data, 0) 68 | 69 | 70 | # def plot_images(images): 71 | # x = images["input"] 72 | # reconstruction = images["rec"] 73 | # half_sample = images["half_sample"] 74 | # full_sample = images["full_sample"] 75 | 76 | # fig, axarr = plt.subplots(1, 4) 77 | # axarr[0].imshow(x.cpu().detach().numpy()[0].transpose(1, 2, 0)) 78 | # axarr[1].imshow(reconstruction.cpu().detach().numpy()[0].transpose(1, 2, 0)) 79 | # axarr[2].imshow(half_sample.cpu().detach().numpy()[0].transpose(1, 2, 0)) 80 | # axarr[3].imshow(full_sample.cpu().detach().numpy()[0].transpose(1, 2, 0)) 81 | # plt.show() 82 | 83 | 84 | -------------------------------------------------------------------------------- /diffusion_reward/models/codec_models/vqgan/vqgan.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .codebook import Codebook 5 | from .decoder import Decoder 6 | from .encoder import Encoder 7 | 8 | 9 | class VQGAN(nn.Module): 10 | def __init__(self, args): 11 | super(VQGAN, self).__init__() 12 | self.encoder = Encoder(args).to(device=args.device) 13 | self.decoder = Decoder(args).to(device=args.device) 14 | self.codebook = Codebook(args).to(device=args.device) 15 | self.quant_conv = nn.Conv2d(args.latent_dim, args.latent_dim, 1).to(device=args.device) 16 | self.post_quant_conv = nn.Conv2d(args.latent_dim, args.latent_dim, 1).to(device=args.device) 17 | 18 | def forward(self, imgs): 19 | encoded_images = self.encoder(imgs) 20 | quant_conv_encoded_images = self.quant_conv(encoded_images) 21 | codebook_mapping, codebook_indices, q_loss = self.codebook(quant_conv_encoded_images) 22 | post_quant_conv_mapping = self.post_quant_conv(codebook_mapping) 23 | decoded_images = self.decoder(post_quant_conv_mapping) 24 | return decoded_images, codebook_indices, q_loss 25 | 26 | def encode(self, imgs): 27 | encoded_images = self.encoder(imgs) 28 | quant_conv_encoded_images = self.quant_conv(encoded_images) 29 | codebook_mapping, codebook_indices, q_loss = self.codebook(quant_conv_encoded_images) 30 | return codebook_mapping, codebook_indices, q_loss 31 | 32 | def decode(self, z): 33 | post_quant_conv_mapping = self.post_quant_conv(z) 34 | decoded_images = self.decoder(post_quant_conv_mapping) 35 | return decoded_images 36 | 37 | def calculate_lambda(self, perceptual_loss, gan_loss): 38 | last_layer = 
self.decoder.model[-1] 39 | last_layer_weight = last_layer.weight 40 | perceptual_loss_grads = torch.autograd.grad(perceptual_loss, last_layer_weight, retain_graph=True)[0] 41 | gan_loss_grads = torch.autograd.grad(gan_loss, last_layer_weight, retain_graph=True)[0] 42 | 43 | λ = torch.norm(perceptual_loss_grads) / (torch.norm(gan_loss_grads) + 1e-4) 44 | λ = torch.clamp(λ, 0, 1e4).detach() 45 | return 0.8 * λ 46 | 47 | @staticmethod 48 | def adopt_weight(disc_factor, i, threshold, value=0.): 49 | if i < threshold: 50 | disc_factor = value 51 | return disc_factor 52 | 53 | def load_checkpoint(self, path): 54 | self.load_state_dict(torch.load(path)) -------------------------------------------------------------------------------- /diffusion_reward/models/reward_models/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from .amp import AMP 4 | from .diffusion_reward import DiffusionReward 5 | from .rnd import RND 6 | from .viper import VIPER 7 | 8 | 9 | def make_rm(cfg): 10 | if cfg.rm_model == 'diffusion_reward': 11 | cfg.cfg_path = str(Path(__file__).parents[3]) + cfg.cfg_path 12 | cfg.ckpt_path = str(Path(__file__).parents[3]) + cfg.ckpt_path 13 | rm = DiffusionReward(cfg=cfg) 14 | elif cfg.rm_model == 'viper': 15 | cfg.cfg_path = str(Path(__file__).parents[3]) + cfg.cfg_path 16 | cfg.ckpt_path = str(Path(__file__).parents[3]) + cfg.ckpt_path 17 | rm = VIPER(cfg=cfg) 18 | elif cfg.rm_model == 'amp': 19 | rm = AMP(cfg) 20 | elif cfg.rm_model == 'rnd': 21 | rm = RND(cfg) 22 | return rm -------------------------------------------------------------------------------- /diffusion_reward/models/reward_models/amp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from torch import autograd 5 | 6 | 7 | def to_torch(xs, device): 8 | return tuple(torch.as_tensor(x, device=device) for x in xs) 9 | 10 | 11 | def weight_init(m): 12 | if isinstance(m, nn.Linear): 13 | nn.init.orthogonal_(m.weight.data) 14 | if hasattr(m.bias, 'data'): 15 | m.bias.data.fill_(0.0) 16 | elif isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d): 17 | gain = nn.init.calculate_gain('relu') 18 | nn.init.orthogonal_(m.weight.data, gain) 19 | if hasattr(m.bias, 'data'): 20 | m.bias.data.fill_(0.0) 21 | 22 | 23 | class Discriminator(nn.Module): 24 | def __init__(self, args): 25 | super(Discriminator, self).__init__() 26 | 27 | self.convnet = nn.Sequential(nn.Conv2d(3, 32, 3, stride=2), 28 | nn.ReLU(), nn.Conv2d(32, 32, 3, stride=1), 29 | nn.ReLU(), nn.Conv2d(32, 32, 3, stride=1), 30 | nn.ReLU()) 31 | 32 | with torch.no_grad(): 33 | x = torch.ones((3, 64,64)) 34 | x = self.convnet(x) 35 | self.repr_dim = np.prod(x.shape) 36 | 37 | self.trunk = nn.Sequential( 38 | nn.Linear(self.repr_dim, args.hidden_dim), nn.Tanh(), 39 | nn.Linear(args.hidden_dim, args.hidden_dim), nn.Tanh(), 40 | nn.Linear(args.hidden_dim, 1)) 41 | 42 | self.apply(weight_init) 43 | 44 | def compute_grad_pen(self, 45 | expert_state, 46 | policy_state, 47 | lambda_=0.1): 48 | alpha = torch.rand(expert_state.size(0), 1, 1, 1) 49 | alpha = alpha.expand_as(expert_state).to(expert_state.device) 50 | 51 | mixup_data = alpha * expert_state + (1 - alpha) * policy_state 52 | mixup_data.requires_grad = True 53 | 54 | disc = self(mixup_data) 55 | ones = torch.ones(disc.size()).to(disc.device) 56 | grad = autograd.grad( 57 | outputs=disc, 58 | inputs=mixup_data, 59 | grad_outputs=ones, 60 
| create_graph=True, 61 | retain_graph=True, 62 | only_inputs=True)[0] 63 | 64 | grad_pen = lambda_ * (grad.norm(2, dim=1) - 1).pow(2).mean() 65 | return grad_pen 66 | 67 | def forward(self, obs): 68 | x = self.convnet(obs) 69 | x = x.reshape(x.shape[0], -1) 70 | x = self.trunk(x) 71 | return x 72 | 73 | 74 | class AMP(nn.Module): 75 | def __init__(self, args) -> None: 76 | super().__init__() 77 | self.disc = Discriminator(args) 78 | self.disc_optimizer = torch.optim.Adam(self.disc.parameters(), lr=args.disc_lr) 79 | 80 | # set attribute 81 | for attr_name, attr_value in args.items(): 82 | print(attr_name, attr_value) 83 | setattr(self, attr_name, attr_value) 84 | 85 | def update(self, batch, expert_obs): 86 | metrics = dict() 87 | 88 | obs, _, _, _, _ = to_torch(batch, self.device) 89 | obs = obs[:self.batch_size] / 127.5 - 1.0 90 | expert_obs = torch.as_tensor(expert_obs).to(self.device).permute(0, 3, 1, 2) 91 | expert_obs = expert_obs[:self.batch_size] / 127.5 - 1.0 92 | 93 | policy_d = self.disc(obs) 94 | expert_d = self.disc(expert_obs) 95 | 96 | expert_loss = (expert_d - 1).pow(2).mean() 97 | policy_loss = (policy_d + 1).pow(2).mean() 98 | 99 | gail_loss = expert_loss + policy_loss 100 | grad_pen = self.disc.compute_grad_pen(expert_obs, obs) 101 | 102 | loss = gail_loss + grad_pen 103 | 104 | self.disc_optimizer.zero_grad() 105 | loss.backward() 106 | self.disc_optimizer.step() 107 | 108 | metrics['disc_expert_loss'] = expert_loss.item() 109 | metrics['disc_policy_loss'] = policy_loss.item() 110 | metrics['grad_pen'] = grad_pen.item() 111 | return metrics 112 | 113 | @torch.no_grad() 114 | def calc_expl_reward(self, obs, next_obs): 115 | obs = (obs / 127.5 - 1.0).float() 116 | feat = self.disc(obs) 117 | 118 | rewards = torch.clamp(1 - 0.25 * torch.square(feat - 1), min=0) * self.expl_scale 119 | return rewards -------------------------------------------------------------------------------- /diffusion_reward/models/reward_models/expl_rewards/rnd.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.nn import init 6 | 7 | 8 | class Flatten(nn.Module): 9 | def forward(self, input): 10 | return input.view(input.size(0), -1) 11 | 12 | 13 | class RND(nn.Module): 14 | def __init__(self, obs_shape, action_shape, device, lr=1e-4): 15 | super(RND, self).__init__() 16 | 17 | self.input_size = obs_shape 18 | self.output_size = action_shape 19 | 20 | feature_output = 7 * 7 * 64 21 | self.predictor = nn.Sequential( 22 | nn.Conv2d( 23 | in_channels=3, 24 | out_channels=32, 25 | kernel_size=6, 26 | stride=3), 27 | nn.LeakyReLU(), 28 | nn.Conv2d( 29 | in_channels=32, 30 | out_channels=64, 31 | kernel_size=4, 32 | stride=2), 33 | nn.LeakyReLU(), 34 | nn.Conv2d( 35 | in_channels=64, 36 | out_channels=64, 37 | kernel_size=3, 38 | stride=1), 39 | nn.LeakyReLU(), 40 | Flatten(), 41 | nn.Linear(feature_output, 512), 42 | nn.ReLU(), 43 | nn.Linear(512, 512), 44 | nn.ReLU(), 45 | nn.Linear(512, 512) 46 | ) 47 | 48 | self.target = nn.Sequential( 49 | nn.Conv2d( 50 | in_channels=3, 51 | out_channels=32, 52 | kernel_size=6, 53 | stride=3), 54 | nn.LeakyReLU(), 55 | nn.Conv2d( 56 | in_channels=32, 57 | out_channels=64, 58 | kernel_size=4, 59 | stride=2), 60 | nn.LeakyReLU(), 61 | nn.Conv2d( 62 | in_channels=64, 63 | out_channels=64, 64 | kernel_size=3, 65 | stride=1), 66 | nn.LeakyReLU(), 67 | Flatten(), 68 | nn.Linear(feature_output, 512) 69 | ) 70 | 71 | for p in 
self.modules(): 72 | if isinstance(p, nn.Conv2d): 73 | init.orthogonal_(p.weight, np.sqrt(2)) 74 | p.bias.data.zero_() 75 | 76 | if isinstance(p, nn.Linear): 77 | init.orthogonal_(p.weight, np.sqrt(2)) 78 | p.bias.data.zero_() 79 | 80 | for param in self.target.parameters(): 81 | param.requires_grad = False 82 | 83 | self.optim = torch.optim.Adam(self.parameters(), lr=lr) 84 | self.device = device 85 | 86 | def forward(self, next_obs): 87 | next_obs = next_obs / 255.0 - 0.5 88 | 89 | target_feature = self.target(next_obs) 90 | predict_feature = self.predictor(next_obs) 91 | return predict_feature, target_feature 92 | 93 | @torch.no_grad() 94 | def calc_reward(self, obs, next_obs): 95 | predict_next_feature, target_next_feature = self(next_obs) 96 | intrinsic_reward = (target_next_feature - predict_next_feature).pow(2).sum(1) / 2 97 | return intrinsic_reward.unsqueeze(1) 98 | 99 | def update(self, batch): 100 | metrics = dict() 101 | 102 | _, _, _, _, next_obs = batch 103 | next_obs = torch.as_tensor(next_obs, device=self.device) 104 | 105 | predict_next_feature, target_next_feature = self(next_obs) 106 | loss = F.mse_loss(predict_next_feature, target_next_feature) 107 | 108 | self.optim.zero_grad(set_to_none=True) 109 | loss.backward() 110 | self.optim.step() 111 | 112 | metrics['rnd_loss'] = loss.item() 113 | return metrics -------------------------------------------------------------------------------- /diffusion_reward/models/reward_models/rnd.py: -------------------------------------------------------------------------------- 1 | import hydra 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class RND(nn.Module): 7 | def __init__(self, cfg): 8 | super(RND, self).__init__() 9 | 10 | # set attribute 11 | for attr_name, attr_value in cfg.items(): 12 | setattr(self, attr_name, attr_value) 13 | 14 | # build exploration reward model 15 | self.use_expl_reward = cfg.use_expl_reward 16 | assert self.use_expl_reward is True 17 | 18 | cfg.expl_reward.obs_shape = cfg.obs_shape 19 | cfg.expl_reward.action_shape = cfg.action_shape 20 | self.expl_reward = hydra.utils.instantiate(cfg.expl_reward) 21 | self.expl_scale = cfg.expl_scale 22 | 23 | @torch.no_grad() 24 | def calc_reward(self, imgs): 25 | zero_rewards = torch.zeros((imgs.shape[1])).unsqueeze(1) 26 | return zero_rewards 27 | 28 | def update(self, batch): 29 | metrics = dict() 30 | metrics.update(self.expl_reward.update(batch)) 31 | return metrics 32 | 33 | @torch.no_grad() 34 | def calc_expl_reward(self, obs, next_obs): 35 | expl_rewards = self.expl_reward.calc_reward(obs, next_obs) * self.expl_scale 36 | return expl_rewards -------------------------------------------------------------------------------- /diffusion_reward/models/reward_models/statistics/diffusion_reward/entropy/adroit.yaml: -------------------------------------------------------------------------------- 1 | door-v0: 2 | 4: 3 | - -99.85789489746094 4 | - 20.900115966796875 5 | 9: 6 | - -102.01958465576172 7 | - 19.904203414916992 8 | 19: 9 | - -105.30337524414062 10 | - 19.738651275634766 11 | 24: 12 | - -106.27964782714844 13 | - 20.017988204956055 14 | 49: 15 | - -109.614990234375 16 | - 19.79887580871582 17 | hammer-v0: 18 | 4: 19 | - -52.246639251708984 20 | - 23.119186401367188 21 | 9: 22 | - -52.8258171081543 23 | - 24.019678115844727 24 | 19: 25 | - -53.98074722290039 26 | - 24.745115280151367 27 | 24: 28 | - -54.72748947143555 29 | - 24.899057388305664 30 | 49: 31 | - -55.65843200683594 32 | - 25.599533081054688 33 | pen-v0: 34 | 4: 35 | - -43.443058013916016 36 | 
- 12.564669609069824 37 | 9: 38 | - -43.48733139038086 39 | - 11.796517372131348 40 | 19: 41 | - -44.277122497558594 42 | - 11.631793022155762 43 | 24: 44 | - -44.77915573120117 45 | - 11.245503425598145 46 | 49: 47 | - -45.83853530883789 48 | - 11.658675193786621 49 | -------------------------------------------------------------------------------- /diffusion_reward/models/reward_models/statistics/diffusion_reward/entropy/metaworld.yaml: -------------------------------------------------------------------------------- 1 | assembly-2: 2 | 9: 3 | - -158.09600830078125 4 | - 48.051334381103516 5 | coffee-push-v2: 6 | 9: 7 | - -166.4014434814453 8 | - 38.01456069946289 9 | dial-turn-v2: 10 | 9: 11 | - -168.41358947753906 12 | - 34.20352554321289 13 | door-close-v2: 14 | 9: 15 | - -159.4058380126953 16 | - 42.818206787109375 17 | lever-pull-v2: 18 | 9: 19 | - -172.31826782226562 20 | - 34.27272415161133 21 | peg-unplug-side-v2: 22 | 9: 23 | - -173.65188598632812 24 | - 36.53841018676758 25 | reach-v2: 26 | 9: 27 | - -160.3221435546875 28 | - 32.97285461425781 29 | 30 | coffee-button-v2: 31 | 9: 32 | - -175.44143676757812 33 | - 33.5645751953125 34 | faucet-open-v2: 35 | 9: 36 | - -167.95553588867188 37 | - 34.42766571044922 38 | drawer-open-v2: 39 | 9: 40 | - -176.4348907470703 41 | - 32.88665008544922 42 | window-open-v2: 43 | 9: 44 | - -167.4096221923828 45 | - 34.45367431640625 46 | button-press-v2: 47 | 9: 48 | - -174.83164978027344 49 | - 31.393564224243164 50 | door-lock-v2: 51 | 9: 52 | - -179.74874877929688 53 | - 33.15822219848633 54 | handle-press-v2: 55 | 9: 56 | - -174.46499633789062 57 | - 33.27118682861328 58 | reach-wall-v2: 59 | 9: 60 | - -168.5615997314453 61 | - 32.231990814208984 62 | -------------------------------------------------------------------------------- /diffusion_reward/models/reward_models/statistics/diffusion_reward/likelihood/adroit.yaml: -------------------------------------------------------------------------------- 1 | door-v0: 2 | 4: 3 | - -430.7310791015625 4 | - 127.17665100097656 5 | 9: 6 | - -278.6539306640625 7 | - 91.57159423828125 8 | 19: 9 | - -194.44960021972656 10 | - 67.0953140258789 11 | 24: 12 | - -175.6300048828125 13 | - 59.73146438598633 14 | 49: 15 | - -140.99722290039062 16 | - 45.45034408569336 17 | hammer-v0: 18 | 4: 19 | - -243.68846130371094 20 | - 133.51260375976562 21 | 9: 22 | - -164.27122497558594 23 | - 95.91500091552734 24 | 19: 25 | - -116.71536254882812 26 | - 74.1052474975586 27 | 24: 28 | - -107.38645935058594 29 | - 66.78828430175781 30 | 49: 31 | - -90.06214904785156 32 | - 57.643455505371094 33 | pen-v0: 34 | 4: 35 | - -191.0626678466797 36 | - 88.0717544555664 37 | 9: 38 | - -121.95132446289062 39 | - 59.038482666015625 40 | 19: 41 | - -84.7771224975586 42 | - 42.56492233276367 43 | 24: 44 | - -76.8780288696289 45 | - 39.06648635864258 46 | 49: 47 | - -62.30751037597656 48 | - 31.30320930480957 49 | -------------------------------------------------------------------------------- /diffusion_reward/models/reward_models/statistics/diffusion_reward/likelihood/metaworld.yaml: -------------------------------------------------------------------------------- 1 | assembly-v2: 2 | 9: 3 | - -405.15362548828125 4 | - 125.01175689697266 5 | coffee-push-v2: 6 | 9: 7 | - -416.30438232421875 8 | - 101.3809814453125 9 | dial-turn-v2: 10 | 9: 11 | - -424.91204833984375 12 | - 85.98188018798828 13 | door-close-v2: 14 | 9: 15 | - -408.2358093261719 16 | - 109.04842376708984 17 | lever-pull-v2: 18 | 9: 19 | - -440.8123779296875 20 | - 
83.52192687988281 21 | peg-unplug-side-v2: 22 | 9: 23 | - -417.7759094238281 24 | - 89.91905975341797 25 | reach-v2: 26 | 9: 27 | - -395.7561340332031 28 | - 80.13951110839844 -------------------------------------------------------------------------------- /diffusion_reward/models/reward_models/statistics/viper/entropy/adroit.yaml: -------------------------------------------------------------------------------- 1 | door-v0: 2 | - -56.66095733642578 3 | - 15.981107711791992 4 | hammer-v0: 5 | - -34.53904342651367 6 | - 15.55008602142334 7 | pen-v0: 8 | - -25.24106788635254 9 | - 4.092796325683594 10 | -------------------------------------------------------------------------------- /diffusion_reward/models/reward_models/statistics/viper/entropy/metaworld.yaml: -------------------------------------------------------------------------------- 1 | assembly-v2: 2 | - -62.393795013427734 3 | - 25.596942901611328 4 | coffee-push-v2: 5 | - -72.31403350830078 6 | - 15.536263465881348 7 | dial-turn-v2: 8 | - -73.47848510742188 9 | - 13.587380409240723 10 | door-close-v2: 11 | - -72.32239532470703 12 | - 17.33110809326172 13 | lever-pull-v2: 14 | - -74.11442565917969 15 | - 10.929166793823242 16 | peg-unplug-side-v2: 17 | - -74.13835144042969 18 | - 13.308944702148438 19 | reach-v2: 20 | - -70.03245544433594 21 | - 12.91872501373291 -------------------------------------------------------------------------------- /diffusion_reward/models/reward_models/statistics/viper/likelihood/adroit.yaml: -------------------------------------------------------------------------------- 1 | door-v0: 2 | - -50.868473052978516 3 | - 18.086612701416016 4 | hammer-v0: 5 | - -37.07987976074219 6 | - 24.520301818847656 7 | pen-v0: 8 | - -24.51329803466797 9 | - 13.530961036682129 10 | -------------------------------------------------------------------------------- /diffusion_reward/models/reward_models/statistics/viper/likelihood/metaworld.yaml: -------------------------------------------------------------------------------- 1 | assembly-v2: 2 | - -166.12330627441406 3 | - 67.40727233886719 4 | coffee-push-v2: 5 | - -148.99526977539062 6 | - 68.51219177246094 7 | dial-turn-v2: 8 | - -139.26890563964844 9 | - 61.01890563964844 10 | door-close-v2: 11 | - -161.33309936523438 12 | - 63.198875427246094 13 | lever-pull-v2: 14 | - -151.5402069091797 15 | - 50.83563232421875 16 | peg-unplug-side-v2: 17 | - -153.0747528076172 18 | - 63.99015808105469 19 | reach-v2: 20 | - -132.399658203125 21 | - 63.75182342529297 -------------------------------------------------------------------------------- /diffusion_reward/models/video_models/videogpt/README.md: -------------------------------------------------------------------------------- 1 | # VideoGPT 2 | We implement VideoGPT and VIPER reward based on [official implementation](https://github.com/wilson1yan/VideoGPT), [MinGPT](https://github.com/karpathy/minGPT), and [VIPER](https://github.com/Alescontrela/viper_rl/). 
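The statistics files above each store a `[mean, std]` pair per task (the diffusion-reward variants nest these under an additional integer key). As a point of reference, here is a minimal sketch of how such a pair might be consumed, assuming it is used to standardize raw video-model log-likelihoods into rewards; `load_stats` and `standardize_reward` are illustrative helper names, not functions from this codebase.

```python
# Illustrative sketch (not part of this repository's API): how a per-task
# [mean, std] pair from the statistics YAMLs above could be used to
# standardize raw video-model log-likelihoods into rewards.
import yaml
import torch


def load_stats(path: str, task: str):
    """Return the (mean, std) pair recorded for `task` in a VIPER-style statistics file."""
    with open(path) as f:
        stats = yaml.safe_load(f)
    mean, std = stats[task]          # e.g. stats["door-v0"] == [-50.87, 18.09]
    return float(mean), float(std)


def standardize_reward(log_likelihood: torch.Tensor, mean: float, std: float, eps: float = 1e-8):
    """Shift and rescale raw log-likelihoods by the precomputed statistics."""
    return (log_likelihood - mean) / (std + eps)


if __name__ == "__main__":
    mean, std = load_stats(
        "diffusion_reward/models/reward_models/statistics/viper/likelihood/adroit.yaml",
        "door-v0",
    )
    raw = torch.tensor([-48.3, -52.1, -40.7])   # dummy log-likelihood values
    print(standardize_reward(raw, mean, std))
```

Note that the diffusion-reward statistics add one more level of nesting (an integer key such as `9`), so an extra lookup would be needed there.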
-------------------------------------------------------------------------------- /diffusion_reward/models/video_models/videogpt/helper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class GroupNorm(nn.Module): 7 | def __init__(self, channels): 8 | super(GroupNorm, self).__init__() 9 | self.gn = nn.GroupNorm(num_groups=32, num_channels=channels, eps=1e-6, affine=True) 10 | 11 | def forward(self, x): 12 | return self.gn(x) 13 | 14 | 15 | class Swish(nn.Module): 16 | def forward(self, x): 17 | return x * torch.sigmoid(x) 18 | 19 | 20 | class ResidualBlock(nn.Module): 21 | def __init__(self, in_channels, out_channels): 22 | super(ResidualBlock, self).__init__() 23 | self.in_channels = in_channels 24 | self.out_channels = out_channels 25 | self.block = nn.Sequential( 26 | GroupNorm(in_channels), 27 | Swish(), 28 | nn.Conv2d(in_channels, out_channels, 3, 1, 1), 29 | GroupNorm(out_channels), 30 | Swish(), 31 | nn.Conv2d(out_channels, out_channels, 3, 1, 1) 32 | ) 33 | 34 | if in_channels != out_channels: 35 | self.channel_up = nn.Conv2d(in_channels, out_channels, 1, 1, 0) 36 | 37 | def forward(self, x): 38 | if self.in_channels != self.out_channels: 39 | return self.channel_up(x) + self.block(x) 40 | else: 41 | return x + self.block(x) 42 | 43 | 44 | class UpSampleBlock(nn.Module): 45 | def __init__(self, channels): 46 | super(UpSampleBlock, self).__init__() 47 | self.conv = nn.Conv2d(channels, channels, 3, 1, 1) 48 | 49 | def forward(self, x): 50 | x = F.interpolate(x, scale_factor=2.0) 51 | return self.conv(x) 52 | 53 | 54 | class DownSampleBlock(nn.Module): 55 | def __init__(self, channels): 56 | super(DownSampleBlock, self).__init__() 57 | self.conv = nn.Conv2d(channels, channels, 3, 2, 0) 58 | 59 | def forward(self, x): 60 | pad = (0, 1, 0, 1) 61 | x = F.pad(x, pad, mode="constant", value=0) 62 | return self.conv(x) 63 | 64 | 65 | class NonLocalBlock(nn.Module): 66 | def __init__(self, channels): 67 | super(NonLocalBlock, self).__init__() 68 | self.in_channels = channels 69 | 70 | self.gn = GroupNorm(channels) 71 | self.q = nn.Conv2d(channels, channels, 1, 1, 0) 72 | self.k = nn.Conv2d(channels, channels, 1, 1, 0) 73 | self.v = nn.Conv2d(channels, channels, 1, 1, 0) 74 | self.proj_out = nn.Conv2d(channels, channels, 1, 1, 0) 75 | 76 | def forward(self, x): 77 | h_ = self.gn(x) 78 | q = self.q(h_) 79 | k = self.k(h_) 80 | v = self.v(h_) 81 | 82 | b, c, h, w = q.shape 83 | 84 | q = q.reshape(b, c, h*w) 85 | q = q.permute(0, 2, 1) 86 | k = k.reshape(b, c, h*w) 87 | v = v.reshape(b, c, h*w) 88 | 89 | attn = torch.bmm(q, k) 90 | attn = attn * (int(c)**(-0.5)) 91 | attn = F.softmax(attn, dim=2) 92 | attn = attn.permute(0, 2, 1) 93 | 94 | A = torch.bmm(v, attn) 95 | A = A.reshape(b, c, h, w) 96 | 97 | return x + A -------------------------------------------------------------------------------- /diffusion_reward/models/video_models/vqdiffusion/data/build.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import ConcatDataset 3 | 4 | from ..utils.misc import instantiate_from_config 5 | 6 | 7 | def build_dataloader(config, args=None, return_dataset=True): 8 | dataset_cfg = config['dataloader'] 9 | train_dataset = [] 10 | for ds_cfg in dataset_cfg['train_datasets']: 11 | ds_cfg['params']['data_root'] = dataset_cfg.get('data_root', '') 12 | ds = instantiate_from_config(ds_cfg) 13 | 
train_dataset.append(ds) 14 | if len(train_dataset) > 1: 15 | train_dataset = ConcatDataset(train_dataset) 16 | else: 17 | train_dataset = train_dataset[0] 18 | 19 | val_dataset = [] 20 | for ds_cfg in dataset_cfg['validation_datasets']: 21 | ds_cfg['params']['data_root'] = dataset_cfg.get('data_root', '') 22 | ds = instantiate_from_config(ds_cfg) 23 | val_dataset.append(ds) 24 | if len(val_dataset) > 1: 25 | val_dataset = ConcatDataset(val_dataset) 26 | else: 27 | val_dataset = val_dataset[0] 28 | 29 | if args is not None and args.distributed: 30 | train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=True) 31 | val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False) 32 | train_iters = len(train_sampler) // dataset_cfg['batch_size'] 33 | val_iters = len(val_sampler) // dataset_cfg['batch_size'] 34 | else: 35 | train_sampler = None 36 | val_sampler = None 37 | train_iters = len(train_dataset) // dataset_cfg['batch_size'] 38 | val_iters = len(val_dataset) // dataset_cfg['batch_size'] 39 | 40 | # if args is not None and not args.debug: 41 | # num_workers = max(2*dataset_cfg['batch_size'], dataset_cfg['num_workers']) 42 | # num_workers = min(64, num_workers) 43 | # else: 44 | # num_workers = dataset_cfg['num_workers'] 45 | num_workers = dataset_cfg['num_workers'] 46 | train_loader = torch.utils.data.DataLoader(train_dataset, 47 | batch_size=dataset_cfg['batch_size'], 48 | shuffle=(train_sampler is None), 49 | num_workers=num_workers, 50 | pin_memory=True, 51 | sampler=train_sampler, 52 | drop_last=True, 53 | persistent_workers=True) 54 | 55 | val_loader = torch.utils.data.DataLoader(val_dataset, 56 | batch_size=dataset_cfg['batch_size'], 57 | shuffle=False, #(val_sampler is None), 58 | num_workers=num_workers, 59 | pin_memory=True, 60 | sampler=val_sampler, 61 | drop_last=True, 62 | persistent_workers=True) 63 | 64 | dataload_info = { 65 | 'train_loader': train_loader, 66 | 'validation_loader': val_loader, 67 | 'train_iterations': train_iters, 68 | 'validation_iterations': val_iters 69 | } 70 | 71 | if return_dataset: 72 | dataload_info['train_dataset'] = train_dataset 73 | dataload_info['validation_dataset'] = val_dataset 74 | 75 | return dataload_info 76 | -------------------------------------------------------------------------------- /diffusion_reward/models/video_models/vqdiffusion/distributed/distributed.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import torch 4 | from torch import distributed as dist 5 | from torch.utils import data 6 | 7 | LOCAL_PROCESS_GROUP = None 8 | 9 | 10 | def is_primary(): 11 | return get_rank() == 0 12 | 13 | 14 | def get_rank(): 15 | if not dist.is_available(): 16 | return 0 17 | 18 | if not dist.is_initialized(): 19 | return 0 20 | 21 | return dist.get_rank() 22 | 23 | 24 | def get_local_rank(): 25 | if not dist.is_available(): 26 | return 0 27 | 28 | if not dist.is_initialized(): 29 | return 0 30 | 31 | if LOCAL_PROCESS_GROUP is None: 32 | raise ValueError("tensorfn.distributed.LOCAL_PROCESS_GROUP is None") 33 | 34 | return dist.get_rank(group=LOCAL_PROCESS_GROUP) 35 | 36 | 37 | def synchronize(): 38 | if not dist.is_available(): 39 | return 40 | 41 | if not dist.is_initialized(): 42 | return 43 | 44 | world_size = dist.get_world_size() 45 | 46 | if world_size == 1: 47 | return 48 | 49 | dist.barrier() 50 | 51 | 52 | def get_world_size(): 53 | if not dist.is_available(): 54 | return 1 55 | 56 | if not dist.is_initialized(): 
57 | return 1 58 | 59 | return dist.get_world_size() 60 | 61 | 62 | def is_distributed(): 63 | raise RuntimeError('Please debug this function!') 64 | return get_world_size() > 1 65 | 66 | 67 | def all_reduce(tensor, op=dist.ReduceOp.SUM, async_op=False): 68 | world_size = get_world_size() 69 | 70 | if world_size == 1: 71 | return tensor 72 | dist.all_reduce(tensor, op=op, async_op=async_op) 73 | 74 | return tensor 75 | 76 | 77 | def all_gather(data): 78 | world_size = get_world_size() 79 | 80 | if world_size == 1: 81 | return [data] 82 | 83 | buffer = pickle.dumps(data) 84 | storage = torch.ByteStorage.from_buffer(buffer) 85 | tensor = torch.ByteTensor(storage).to("cuda") 86 | 87 | local_size = torch.IntTensor([tensor.numel()]).to("cuda") 88 | size_list = [torch.IntTensor([1]).to("cuda") for _ in range(world_size)] 89 | dist.all_gather(size_list, local_size) 90 | size_list = [int(size.item()) for size in size_list] 91 | max_size = max(size_list) 92 | 93 | tensor_list = [] 94 | for _ in size_list: 95 | tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) 96 | 97 | if local_size != max_size: 98 | padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") 99 | tensor = torch.cat((tensor, padding), 0) 100 | 101 | dist.all_gather(tensor_list, tensor) 102 | 103 | data_list = [] 104 | 105 | for size, tensor in zip(size_list, tensor_list): 106 | buffer = tensor.cpu().numpy().tobytes()[:size] 107 | data_list.append(pickle.loads(buffer)) 108 | 109 | return data_list 110 | 111 | 112 | def reduce_dict(input_dict, average=True): 113 | world_size = get_world_size() 114 | 115 | if world_size < 2: 116 | return input_dict 117 | 118 | with torch.no_grad(): 119 | keys = [] 120 | values = [] 121 | 122 | for k in sorted(input_dict.keys()): 123 | keys.append(k) 124 | values.append(input_dict[k]) 125 | 126 | values = torch.stack(values, 0) 127 | dist.reduce(values, dst=0) 128 | 129 | if dist.get_rank() == 0 and average: 130 | values /= world_size 131 | 132 | reduced_dict = {k: v for k, v in zip(keys, values)} 133 | 134 | return reduced_dict 135 | 136 | 137 | def data_sampler(dataset, shuffle, distributed): 138 | if distributed: 139 | return data.distributed.DistributedSampler(dataset, shuffle=shuffle) 140 | 141 | if shuffle: 142 | return data.RandomSampler(dataset) 143 | 144 | else: 145 | return data.SequentialSampler(dataset) 146 | -------------------------------------------------------------------------------- /diffusion_reward/models/video_models/vqdiffusion/distributed/launch.py: -------------------------------------------------------------------------------- 1 | import diffusion_reward.models.video_models.vqdiffusion.distributed.distributed as dist_fn 2 | import torch 3 | from torch import distributed as dist 4 | from torch import multiprocessing as mp 5 | 6 | 7 | def find_free_port(): 8 | import socket 9 | 10 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 11 | 12 | sock.bind(("", 0)) 13 | port = sock.getsockname()[1] 14 | sock.close() 15 | 16 | return port 17 | 18 | 19 | def launch(fn, n_gpu_per_machine, n_machine=1, machine_rank=0, dist_url=None, args=()): 20 | world_size = n_machine * n_gpu_per_machine 21 | 22 | if world_size > 1: 23 | # if "OMP_NUM_THREADS" not in os.environ: 24 | # os.environ["OMP_NUM_THREADS"] = "1" 25 | 26 | if dist_url == "auto": 27 | if n_machine != 1: 28 | raise ValueError('dist_url="auto" not supported in multi-machine jobs') 29 | 30 | port = find_free_port() 31 | dist_url = f"tcp://127.0.0.1:{port}" 32 | 33 | if n_machine > 1 and 
dist_url.startswith("file://"): 34 | raise ValueError( 35 | "file:// is not a reliable init method in multi-machine jobs. Prefer tcp://" 36 | ) 37 | 38 | mp.spawn( 39 | distributed_worker, 40 | nprocs=n_gpu_per_machine, 41 | args=(fn, world_size, n_gpu_per_machine, machine_rank, dist_url, args), 42 | daemon=False, 43 | ) 44 | 45 | else: 46 | local_rank = 0 47 | fn(local_rank, *args) 48 | 49 | 50 | def distributed_worker( 51 | local_rank, fn, world_size, n_gpu_per_machine, machine_rank, dist_url, args 52 | ): 53 | if not torch.cuda.is_available(): 54 | raise OSError("CUDA is not available. Please check your environments") 55 | 56 | global_rank = machine_rank * n_gpu_per_machine + local_rank 57 | 58 | try: 59 | dist.init_process_group( 60 | backend="NCCL", 61 | init_method=dist_url, 62 | world_size=world_size, 63 | rank=global_rank, 64 | ) 65 | 66 | except Exception: 67 | raise OSError("failed to initialize NCCL groups") 68 | 69 | dist_fn.synchronize() 70 | 71 | if n_gpu_per_machine > torch.cuda.device_count(): 72 | raise ValueError( 73 | f"specified n_gpu_per_machine larger than available device ({torch.cuda.device_count()})" 74 | ) 75 | 76 | torch.cuda.set_device(local_rank) 77 | 78 | if dist_fn.LOCAL_PROCESS_GROUP is not None: 79 | raise ValueError("torch.distributed.LOCAL_PROCESS_GROUP is not None") 80 | 81 | n_machine = world_size // n_gpu_per_machine 82 | 83 | for i in range(n_machine): 84 | ranks_on_i = list(range(i * n_gpu_per_machine, (i + 1) * n_gpu_per_machine)) 85 | pg = dist.new_group(ranks_on_i) 86 | 87 | if i == machine_rank: 88 | dist_fn.LOCAL_PROCESS_GROUP = pg 89 | 90 | fn(local_rank, *args) 91 | -------------------------------------------------------------------------------- /diffusion_reward/models/video_models/vqdiffusion/engine/clip_grad_norm.py: -------------------------------------------------------------------------------- 1 | from torch.nn.utils import clip_grad_norm_ 2 | 3 | 4 | class ClipGradNorm(object): 5 | def __init__(self, 6 | start_iteration=0, 7 | end_iteration=-1, # if negative, the norm will be always clipped 8 | max_norm=0.5): 9 | self.start_iteration = start_iteration 10 | self.end_iteration = end_iteration 11 | self.max_norm = max_norm 12 | 13 | self.last_epoch = -1 14 | 15 | 16 | def __call__(self, parameters): 17 | self.last_epoch += 1 18 | clip = False 19 | if self.last_epoch >= self.start_iteration: 20 | clip = True 21 | if self.end_iteration > 0 and self.last_epoch < self.end_iteration: 22 | clip = True 23 | if clip: 24 | clip_grad_norm_(parameters, max_norm=self.max_norm) 25 | 26 | def state_dict(self): 27 | return {key: value for key, value in self.__dict__.items()} 28 | 29 | 30 | def load_state_dict(self, state_dict): 31 | self.__dict__.update(state_dict) -------------------------------------------------------------------------------- /diffusion_reward/models/video_models/vqdiffusion/engine/ema.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | import torch 4 | 5 | 6 | class EMA(object): 7 | def __init__(self, 8 | model, 9 | decay=0.99, 10 | update_interval=1, 11 | device=torch.device('cpu')): 12 | 13 | self.decay = decay 14 | self.update_iterval = update_interval 15 | self.device = device 16 | 17 | self.model = model 18 | with torch.no_grad(): 19 | if hasattr(model, 'get_ema_model') and callable(model.get_ema_model): 20 | self.ema_model = copy.deepcopy(model.get_ema_model()) 21 | self.cur_state_dict = model.get_ema_model().state_dict() 22 | else: 23 | self.ema_model = 
copy.deepcopy(model) 24 | self.cur_state_dict = model.state_dict() 25 | self.ema_model.to(self.device) 26 | self.cur_state_dict = {k: v.clone().to(self.device) for k, v in self.cur_state_dict.items()} 27 | 28 | def update(self, iteration): 29 | if (iteration + 1) % self.update_iterval == 0: 30 | # print('{} Update ema'.format(iteration)) 31 | if hasattr(self.model, 'get_ema_model') and callable(self.model.get_ema_model): 32 | cur_state_dict = self.model.get_ema_model().state_dict() 33 | else: 34 | cur_state_dict = self.model.state_dict() 35 | 36 | ema_state_dict = self.ema_model.state_dict() 37 | for k in ema_state_dict.keys(): 38 | ema_state_dict[k] = ema_state_dict[k] * self.decay + cur_state_dict[k].clone().to(self.device) * (1-self.decay) 39 | self.ema_model.load_state_dict(ema_state_dict) 40 | 41 | def state_dict(self): 42 | return self.ema_model.state_dict() 43 | 44 | def load_state_dict(self, state_dict, strict=True): 45 | state_dict_ = {k: v.clone().to(self.device) for k, v in state_dict.items()} 46 | self.ema_model.load_state_dict(state_dict_, strict=strict) 47 | 48 | def modify_to_inference(self): 49 | # get current model 50 | if hasattr(self.model, 'get_ema_model') and callable(self.model.get_ema_model): 51 | self.cur_state_dict = self.model.get_ema_model().state_dict() 52 | else: 53 | self.cur_state_dict = self.model.state_dict() 54 | self.cur_state_dict = {k: v.clone().to(self.device) for k, v in self.cur_state_dict.items()} 55 | 56 | ema_state_dict = self.ema_model.state_dict() 57 | ema_state_dict = {k: v.to(self.model.device) for k, v in ema_state_dict.items()} 58 | if hasattr(self.model, 'get_ema_model') and callable(self.model.get_ema_model): 59 | self.model.get_ema_model().load_state_dict(ema_state_dict) 60 | else: 61 | self.model.load_state_dict(ema_state_dict) 62 | 63 | def modify_to_train(self): 64 | self.cur_state_dict = {k: v.clone().to(self.model.device) for k, v in self.cur_state_dict.items()} 65 | if hasattr(self.model, 'get_ema_model') and callable(self.model.get_ema_model): 66 | self.model.get_ema_model().load_state_dict(self.cur_state_dict) 67 | else: 68 | self.model.load_state_dict(self.cur_state_dict) 69 | 70 | 71 | -------------------------------------------------------------------------------- /diffusion_reward/models/video_models/vqdiffusion/engine/logger.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import time 5 | 6 | import torch 7 | 8 | from ..distributed.distributed import is_primary 9 | from ..utils.io import save_config_to_yaml, write_args 10 | 11 | 12 | class Logger(object): 13 | def __init__(self, args): 14 | self.args = args 15 | self.save_dir = args.save_dir 16 | self.is_primary = is_primary() 17 | 18 | if self.is_primary: 19 | # save the args and config 20 | self.config_dir = 'configs' 21 | os.makedirs(self.config_dir, exist_ok=True) 22 | file_name = os.path.join(self.config_dir, 'args.txt') 23 | write_args(args, file_name) 24 | 25 | log_dir = 'logs' 26 | if not os.path.exists(log_dir): 27 | os.makedirs(log_dir, exist_ok=True) 28 | self.text_writer = open(os.path.join(log_dir, 'log.txt'), 'a') # 'w') 29 | if args.tensorboard: 30 | self.log_info('using tensorboard') 31 | self.tb_writer = torch.utils.tensorboard.SummaryWriter(log_dir=log_dir) # tensorboard.SummaryWriter(log_dir=log_dir) 32 | else: 33 | self.tb_writer = None 34 | 35 | 36 | def save_config(self, config): 37 | if self.is_primary: 38 | 
save_config_to_yaml(config, os.path.join(self.config_dir, 'config.yaml')) 39 | 40 | def log_info(self, info, check_primary=True): 41 | if self.is_primary or (not check_primary): 42 | print(info) 43 | if self.is_primary: 44 | info = str(info) 45 | time_str = time.strftime('%Y-%m-%d-%H-%M') 46 | info = '{}: {}'.format(time_str, info) 47 | if not info.endswith('\n'): 48 | info += '\n' 49 | self.text_writer.write(info) 50 | self.text_writer.flush() 51 | 52 | def add_scalar(self, **kargs): 53 | """Log a scalar variable.""" 54 | if self.is_primary: 55 | if self.tb_writer is not None: 56 | self.tb_writer.add_scalar(**kargs) 57 | 58 | def add_scalars(self, **kargs): 59 | """Log a scalar variable.""" 60 | if self.is_primary: 61 | if self.tb_writer is not None: 62 | self.tb_writer.add_scalars(**kargs) 63 | 64 | def add_image(self, **kargs): 65 | """Log a scalar variable.""" 66 | if self.is_primary: 67 | if self.tb_writer is not None: 68 | self.tb_writer.add_image(**kargs) 69 | 70 | def add_images(self, **kargs): 71 | """Log a scalar variable.""" 72 | if self.is_primary: 73 | if self.tb_writer is not None: 74 | self.tb_writer.add_images(**kargs) 75 | 76 | 77 | def close(self): 78 | if self.is_primary: 79 | self.text_writer.close() 80 | self.tb_writer.close() 81 | 82 | -------------------------------------------------------------------------------- /diffusion_reward/models/video_models/vqdiffusion/modeling/build.py: -------------------------------------------------------------------------------- 1 | from ..utils.misc import instantiate_from_config 2 | 3 | 4 | def build_model(config, args=None): 5 | return instantiate_from_config(config['model']) 6 | -------------------------------------------------------------------------------- /diffusion_reward/models/video_models/vqdiffusion/modeling/codecs/base_codec.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class BaseCodec(nn.Module): 5 | 6 | def get_tokens(self, x, **kwargs): 7 | """ 8 | Input: 9 | x: input data 10 | Return: 11 | indices: B x L, the codebook indices, where L is the length 12 | of flattened feature map size 13 | """ 14 | raise NotImplementedError 15 | 16 | def get_number_of_tokens(self): 17 | """ 18 | Return: int, the number of tokens 19 | """ 20 | raise NotImplementedError 21 | 22 | def encode(self, img): 23 | raise NotImplementedError 24 | 25 | def decode(self, img_seq): 26 | raise NotImplementedError 27 | 28 | def forward(self, **kwargs): 29 | raise NotImplementedError 30 | 31 | def train(self, mode=True): 32 | self.training = mode 33 | if self.trainable and mode: 34 | return super().train(True) 35 | else: 36 | return super().train(False) 37 | 38 | def _set_trainable(self): 39 | if not self.trainable: 40 | for pn, p in self.named_parameters(): 41 | p.requires_grad = False 42 | self.eval() -------------------------------------------------------------------------------- /diffusion_reward/models/video_models/vqdiffusion/modeling/codecs/image_codec/vqgan.py: -------------------------------------------------------------------------------- 1 | import math 2 | from pathlib import Path 3 | 4 | import torch 5 | from diffusion_reward.models.codec_models.vqgan.vqgan import VQGAN 6 | 7 | 8 | class AttrDict(dict): 9 | __setattr__ = dict.__setitem__ 10 | 11 | def __getattr__(self, attr): 12 | # Take care that getattr() raises AttributeError, not KeyError. 13 | # Required e.g. for hasattr(), deepcopy and OrderedDict. 
14 | try: 15 | return self.__getitem__(attr) 16 | except KeyError: 17 | raise AttributeError("Attribute %r not found" % attr) 18 | 19 | def __getstate__(self): 20 | return self 21 | 22 | def __setstate__(self, d): 23 | self = d 24 | 25 | 26 | class MiniVQGAN(VQGAN): 27 | def __init__( 28 | self, 29 | args, 30 | token_shape=None, 31 | trainable=False, 32 | ckpt_path=None, 33 | latent_size=16 34 | ): 35 | args = AttrDict(args) 36 | super(VQGAN, self).__init__() 37 | 38 | ckpt_path = str(Path(__file__).parents[7]) + ckpt_path 39 | 40 | self.model = VQGAN(args) 41 | self.model.load_checkpoint(ckpt_path) 42 | self.model.eval() 43 | 44 | self.token_shape = token_shape 45 | 46 | def preprocess(self, imgs): 47 | """ 48 | imgs: B x C x H x W, in the range 0-255 49 | """ 50 | imgs = imgs.div(127.5) - 1 # map to -1 - 1 51 | return imgs 52 | # return map_pixels(imgs) 53 | 54 | def postprocess(self, imgs): 55 | """ 56 | imgs: B x C x H x W, in the range -1 - 1 57 | """ 58 | imgs = (imgs + 1) * 127.5 59 | return imgs.clip(0, 255) 60 | 61 | def get_tokens(self, imgs): 62 | if imgs.max() >= 3: 63 | imgs = self.preprocess(imgs) 64 | if imgs.dim() == 4: 65 | embs, code, _ = self.model.encode(imgs) 66 | #output = {'token': code.reshape([embs.shape[0], self.token_shape[0], self.token_shape[1]])} 67 | output = {'token': code.reshape([embs.shape[0], -1])} 68 | elif imgs.dim() == 5: 69 | # serve as cond tokens, no dict 70 | flat_imgs = imgs.flatten(0, 1) 71 | embs, code, _ = self.model.encode(flat_imgs) 72 | output = code.reshape([imgs.shape[0], -1]) 73 | return output 74 | 75 | @torch.no_grad() 76 | def encode_to_z(self, x): 77 | if x.max() >= 3: 78 | x = self.preprocess(x) 79 | if len(x.shape) == 5: 80 | flat_x = x.flatten(0, 1) 81 | quant_z, indices, _ = self.model.encode(flat_x) 82 | else: 83 | quant_z, indices, _ = self.model.encode(x) 84 | 85 | indices = indices.reshape(x.shape[0], -1) 86 | #indices = indices.view(quant_z.shape[0], -1) 87 | quant_z = quant_z.permute(0, 2, 3, 1) 88 | quant_z = quant_z.reshape(x.shape[0], -1, quant_z.shape[-1]) 89 | return quant_z, indices 90 | 91 | def decode(self, z): 92 | latent_size = int(math.sqrt(z.shape[1])) 93 | assert latent_size ** 2 == z.shape[1] 94 | #z = z.reshape([z.shape[0], latent_size, latent_size]) 95 | 96 | ix_to_vectors = self.model.codebook.embedding(z).reshape([z.shape[0], latent_size, latent_size, self.model.codebook.latent_dim]) 97 | ix_to_vectors = ix_to_vectors.permute(0, 3, 1, 2) 98 | image = self.model.decode(ix_to_vectors) 99 | return self.postprocess(image) 100 | -------------------------------------------------------------------------------- /diffusion_reward/models/video_models/vqdiffusion/modeling/embeddings/base_embedding.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class BaseEmbedding(nn.Module): 5 | 6 | def get_loss(self): 7 | return None 8 | 9 | def forward(self, **kwargs): 10 | raise NotImplementedError 11 | 12 | def train(self, mode=True): 13 | self.training = mode 14 | if self.trainable and mode: 15 | super().train() 16 | return self 17 | 18 | def _set_trainable(self): 19 | if not self.trainable: 20 | for pn, p in self.named_parameters(): 21 | p.requires_grad = False 22 | self.eval() 23 | 24 | -------------------------------------------------------------------------------- /diffusion_reward/models/video_models/vqdiffusion/modeling/embeddings/dalle_mask_image_embedding.py: -------------------------------------------------------------------------------- 1 | 
import torch 2 | import torch.nn as nn 3 | 4 | from .base_embedding import BaseEmbedding 5 | 6 | 7 | class DalleMaskImageEmbedding(BaseEmbedding): 8 | def __init__(self, 9 | num_embed=8192, 10 | spatial_size=[32, 32], # height and with 11 | embed_dim=3968, 12 | trainable=True, 13 | pos_emb_type='embedding' 14 | 15 | ): 16 | super().__init__() 17 | 18 | if isinstance(spatial_size, int): 19 | spatial_size = [spatial_size, spatial_size] 20 | 21 | self.spatial_size = spatial_size 22 | self.num_embed = num_embed + 1 23 | self.embed_dim = embed_dim 24 | self.trainable = trainable 25 | self.pos_emb_type = pos_emb_type 26 | 27 | assert self.pos_emb_type in ['embedding', 'parameter'] 28 | 29 | self.emb = nn.Embedding(self.num_embed, embed_dim) 30 | if self.pos_emb_type == 'embedding': 31 | self.height_emb = nn.Embedding(self.spatial_size[0], embed_dim) # height 32 | self.width_emb = nn.Embedding(self.spatial_size[1], embed_dim) # width 33 | else: 34 | self.height_emb = nn.Parameter(torch.zeros(1, self.spatial_size[0], embed_dim)) # height #32,1024 35 | self.width_emb = nn.Parameter(torch.zeros(1, self.spatial_size[1], embed_dim)) # width #32,1024 36 | 37 | self._set_trainable() 38 | 39 | def forward(self, index, **kwargs): 40 | assert index.dim() == 2 # B x L 41 | try: 42 | index[index < 0] = 0 43 | emb = self.emb(index) 44 | except: 45 | raise RuntimeError('IndexError: index out of range in self, max index {}, num embed {}'.format(index.max(), self.num_embed)) 46 | 47 | # add col and row embedding 48 | if emb.shape[1] > 0: 49 | # if False: 50 | if self.pos_emb_type == 'embedding': 51 | height_emb = self.height_emb(torch.arange(self.spatial_size[0], device=index.device).view(1, self.spatial_size[0])).unsqueeze(2) # 1 x H x D -> 1 x H x 1 x D 52 | width_emb = self.width_emb(torch.arange(self.spatial_size[1], device=index.device).view(1, self.spatial_size[1])).unsqueeze(1) # 1 x W x D -> 1 x 1 x W x D 53 | else: 54 | height_emb = self.height_emb.unsqueeze(2) # 1 x H x D -> 1 x H x 1 x D 55 | width_emb = self.width_emb.unsqueeze(1) # 1 x W x D -> 1 x 1 x W x D 56 | pos_emb = (height_emb + width_emb).view(1, self.spatial_size[0] * self.spatial_size[1], -1) # 1 x H x W x D -> 1 x L xD 57 | emb = emb + pos_emb[:, :emb.shape[1], :] 58 | 59 | return emb 60 | -------------------------------------------------------------------------------- /diffusion_reward/models/video_models/vqdiffusion/modeling/embeddings/frame_embedding.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | from .base_embedding import BaseEmbedding 4 | 5 | 6 | class FrameEmbedding(BaseEmbedding): 7 | def __init__(self, 8 | num_embed=1000, 9 | embed_dim=512, 10 | identity=False, 11 | trainable=True, 12 | num_cond_frames=2, 13 | ): 14 | super().__init__() 15 | self.identity = identity 16 | self.trainable = trainable 17 | self.num_embed = num_embed 18 | self.embed_dim = embed_dim 19 | if self.identity == False: 20 | self.emb = nn.Embedding(self.num_embed, embed_dim) 21 | self._set_trainable() 22 | 23 | def forward(self, index, **kwargs): 24 | """ 25 | index: B x L, index 26 | mask: B x L, bool type. 
The value of False indicating padded index 27 | """ 28 | if self.identity == True: 29 | return index 30 | else: 31 | emb = self.emb(index)#.unsqueeze(1) 32 | return emb 33 | 34 | -------------------------------------------------------------------------------- /diffusion_reward/models/video_models/vqdiffusion/utils/io.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | import torch 5 | import yaml 6 | 7 | 8 | def load_yaml_config(path): 9 | with open(path) as f: 10 | config = yaml.full_load(f) 11 | return config 12 | 13 | 14 | def save_config_to_yaml(config, path): 15 | assert path.endswith('.yaml') 16 | with open(path, 'w') as f: 17 | f.write(yaml.dump(config)) 18 | f.close() 19 | 20 | def save_dict_to_json(d, path, indent=None): 21 | json.dump(d, open(path, 'w'), indent=indent) 22 | 23 | 24 | def load_dict_from_json(path): 25 | return json.load(open(path, 'r')) 26 | 27 | 28 | def write_args(args, path): 29 | args_dict = dict((name, getattr(args, name)) for name in dir(args)if not name.startswith('_')) 30 | with open(path, 'a') as args_file: 31 | args_file.write('==> torch version: {}\n'.format(torch.__version__)) 32 | args_file.write('==> cudnn version: {}\n'.format(torch.backends.cudnn.version())) 33 | args_file.write('==> Cmd:\n') 34 | args_file.write(str(sys.argv)) 35 | args_file.write('\n==> args:\n') 36 | for k, v in sorted(args_dict.items()): 37 | args_file.write(' %s: %s\n' % (str(k), str(v))) 38 | args_file.close() -------------------------------------------------------------------------------- /diffusion_reward/rl/drqv2/video.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import imageio 3 | 4 | 5 | class VideoRecorder: 6 | def __init__(self, root_dir, render_size=256, fps=20): 7 | if root_dir is not None: 8 | self.save_dir = root_dir / 'eval_video' 9 | self.save_dir.mkdir(exist_ok=True) 10 | else: 11 | self.save_dir = None 12 | 13 | self.render_size = render_size 14 | self.fps = fps 15 | self.frames = [] 16 | 17 | def init(self, env, enabled=True): 18 | self.frames = [] 19 | self.enabled = self.save_dir is not None and enabled 20 | self.record(env) 21 | 22 | def record(self, env): 23 | if self.enabled: 24 | if hasattr(env, 'physics'): 25 | frame = env.physics.render(height=self.render_size, 26 | width=self.render_size, 27 | camera_id=0) 28 | else: 29 | frame = env.render() 30 | self.frames.append(frame) 31 | 32 | def save(self, file_name): 33 | if self.enabled: 34 | path = self.save_dir / file_name 35 | imageio.mimsave(str(path), self.frames, fps=self.fps) 36 | 37 | 38 | class TrainVideoRecorder: 39 | def __init__(self, root_dir, render_size=256, fps=20): 40 | if root_dir is not None: 41 | self.save_dir = root_dir / 'train_video' 42 | self.save_dir.mkdir(exist_ok=True) 43 | else: 44 | self.save_dir = None 45 | 46 | self.render_size = render_size 47 | self.fps = fps 48 | self.frames = [] 49 | 50 | def init(self, obs, enabled=True): 51 | self.frames = [] 52 | self.enabled = self.save_dir is not None and enabled 53 | self.record(obs) 54 | 55 | def record(self, obs): 56 | if self.enabled: 57 | frame = cv2.resize(obs[-3:].transpose(1, 2, 0), 58 | dsize=(self.render_size, self.render_size), 59 | interpolation=cv2.INTER_CUBIC) 60 | self.frames.append(frame) 61 | 62 | def save(self, file_name): 63 | if self.enabled: 64 | path = self.save_dir / file_name 65 | imageio.mimsave(str(path), self.frames, fps=self.fps) 66 | 
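A short usage sketch for the recorders defined in `video.py` above. `DummyEnv` and the output directory are placeholders for illustration only; the real training code constructs the environment and work directory elsewhere.

```python
# Minimal, self-contained example of driving VideoRecorder (defined above).
# DummyEnv is a stand-in whose render() returns an RGB array; any gym-style env
# with render(), or one exposing a MuJoCo `physics` handle, would also work.
from pathlib import Path

import numpy as np

# from diffusion_reward.rl.drqv2.video import VideoRecorder  # assumed import path


class DummyEnv:
    def render(self):
        return np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8)


work_dir = Path("./example_run")
work_dir.mkdir(exist_ok=True)

env = DummyEnv()
recorder = VideoRecorder(root_dir=work_dir, render_size=256, fps=20)

recorder.init(env, enabled=True)   # clears the frame buffer and captures the first frame
for _ in range(30):
    recorder.record(env)           # one frame per environment step
recorder.save("episode_0.mp4")     # written to work_dir / "eval_video" (needs imageio's ffmpeg backend)
```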
-------------------------------------------------------------------------------- /docs/diffusion_reward_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/docs/diffusion_reward_overview.png -------------------------------------------------------------------------------- /env_dependencies/mj_envs/README.md: -------------------------------------------------------------------------------- 1 | # Mujoco Environments 2 | `mj_envs` is a collection of environments/tasks simulated with the [Mujoco](http://www.mujoco.org/) physics engine and wrapped in the OpenAI `gym` API. 3 | 4 | ## Getting Started 5 | `mj_envs` uses git submodules to resolve dependencies. Please follow the steps below exactly to install correctly. 6 | 7 | 1. Clone this repo with pre-populated submodule dependencies 8 | ``` 9 | $ git clone --recursive https://github.com/vikashplus/mj_envs.git 10 | ``` 11 | 2. Update submodules 12 | ``` 13 | $ cd mj_envs 14 | $ git submodule update --remote 15 | ``` 16 | 3. Install package using `pip` 17 | ``` 18 | $ pip install -e . 19 | ``` 20 | **OR** 21 | Add repo to pythonpath by updating `~/.bashrc` or `~/.bash_profile` 22 | ``` 23 | export PYTHONPATH=":$PYTHONPATH" 24 | ``` 25 | 4. You can visualize the environments with random controls using the command below 26 | ``` 27 | $ python utils/visualize_env.py --env_name hammer-v0 28 | ``` 29 | **NOTE:** If the visualization results in a GLFW error, this is because `mujoco-py` does not see some graphics drivers correctly. This can usually be fixed by explicitly loading the correct drivers before running the python script. See [this page](https://github.com/aravindr93/mjrl/tree/master/setup#known-issues) for details. 30 | 31 | # Modules 32 | *mj_envs* contains a variety of environments, which are organized as modules. Each module is a collection of loosely related environments. The following modules are provided at the moment, with plans to improve the diversity of the collection. 33 | 34 | ## 1. Hand Manipulation Suite (HMS) 35 | HMS contains a collection of environments centered around dexterous manipulation with the anthropomorphic 24-degree-of-freedom [Adroit Hand](https://vikashplus.github.io/P_Hand.html). These environments were designed for the publication: [Learning Complex Dexterous Manipulation with Deep Reinforcement Learning and Demonstrations, RSS2018](https://sites.google.com/corp/view/deeprl-dexterous-manipulation). 36 | 37 | Hand-Manipulation-Suite Tasks [(video)](https://youtu.be/jJtBll8l_OM) 38 | :-------------------------: 39 | ![Alt text](mj_envs/hand_manipulation_suite/assets/tasks.jpg?raw=false "Fetch Pole") 40 | 41 | ## 2. 
More coming soon 42 | -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/Adroit_hand.xml: -------------------------------------------------------------------------------- 1 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/Adroit_hand_withOverlay.xml: -------------------------------------------------------------------------------- 1 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/README.md: -------------------------------------------------------------------------------- 1 | # Adroit Manipulation Platform 2 | 3 | Adroit manipulation platform is reconfigurable, tendon-driven, pneumatically-actuated platform designed and developed by [Vikash Kumar](https://vikashplus.github.io/) during this Ph.D. ([Thesis: Manipulators and Manipulation in high dimensional spaces](https://digital.lib.washington.edu/researchworks/handle/1773/38104)) to study dynamic dexterous manipulation. Adroit is comprised of the [Shadow Hand](https://www.shadowrobot.com/products/dexterous-hand/) skeleton (developed by [Shadow Robot company](https://www.shadowrobot.com/)) and a custom arm, and is powered by a custom actuation sysem. This custom actuation system allows Adroit to move the ShadowHand skeleton faster than a human hand (70 msec limit-to-limit movement, 30 msec overall reflex latency), generate sufficient forces (40 N at each finger tendon, 125N at each wrist tendon), and achieve high compliance on the mechanism level (6 grams of external force at the fingertip displaces the finger when the system is powered.) This combination of speed, force, and compliance is a prerequisite for dexterous manipulation, yet it has never before been achieved with a tendon-driven system, let alone a system with 24 degrees of freedom and 40 tendons. 4 | 5 | ## Mujoco Model 6 | Adroit is a 28 degree of freedom system which consists of a 24 degrees of freedom **ShadowHand** and a 4 degree of freedom arm. This repository contains the Mujoco Models of the system developed with extreme care and great attention to the details. 7 | 8 | 9 | ## In Projects 10 | Adroit has been used in a wide variety of project. A small list is appended below. Details of these projects can be found [here](https://vikashplus.github.io/). 11 | [![projects](https://github.com/vikashplus/Adroit/blob/master/gallery/projects.JPG)](https://vikashplus.github.io/) 12 | ## In News and Media 13 | Adroit has found quite some attention in the world media. 
Details can be found [here](https://vikashplus.github.io/news.html) 14 | 15 | [![News](https://github.com/vikashplus/Adroit/blob/master/gallery/news.JPG)](https://vikashplus.github.io/news.html) 16 | 17 | 18 | ## Citation 19 | If the contents of this repo helped you, please consider citing 20 | 21 | ``` 22 | @phdthesis{Kumar2016thesis, 23 | title = {Manipulators and Manipulation in high dimensional spaces}, 24 | school = {University of Washington, Seattle}, 25 | author = {Kumar, Vikash}, 26 | year = {2016}, 27 | url = {https://digital.lib.washington.edu/researchworks/handle/1773/38104} 28 | } 29 | ``` 30 | -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/gallery/news.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/gallery/news.JPG -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/gallery/projects.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/gallery/projects.JPG -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/joint_position_actuation.xml: -------------------------------------------------------------------------------- 1 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/F1.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/F1.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/F2.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/F2.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/F3.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/F3.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/TH1_z.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/TH1_z.stl -------------------------------------------------------------------------------- 
/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/TH2_z.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/TH2_z.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/TH3_z.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/TH3_z.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/arm_base.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/arm_base.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/arm_trunk.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/arm_trunk.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/arm_trunk_asmbly.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/arm_trunk_asmbly.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/distal_ellipsoid.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/distal_ellipsoid.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/elbow_flex.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/elbow_flex.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/elbow_rotate_motor.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/elbow_rotate_motor.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/elbow_rotate_muscle.stl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/elbow_rotate_muscle.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_Cy_PlateAsmbly(muscle_cone).stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_Cy_PlateAsmbly(muscle_cone).stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_Cy_PlateAsmbly.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_Cy_PlateAsmbly.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_PlateAsmbly.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_PlateAsmbly.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_electric.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_electric.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_electric_cvx.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_electric_cvx.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_muscle.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_muscle.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_simple.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_simple.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_simple_cvx.stl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_simple_cvx.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_weight.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/forearm_weight.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/knuckle.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/knuckle.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/lfmetacarpal.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/lfmetacarpal.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/palm.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/palm.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/upper_arm.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/upper_arm.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/upper_arm_asmbl_shoulder.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/upper_arm_asmbl_shoulder.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/upper_arm_ass.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/upper_arm_ass.stl -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/wrist.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/meshes/wrist.stl 
-------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/textures/darkwood.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/textures/darkwood.png -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/textures/dice.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/textures/dice.png -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/textures/foil.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/textures/foil.png -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/textures/marble.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/textures/marble.png -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/textures/silverRaw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/textures/silverRaw.png -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/textures/skin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/textures/skin.png -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/textures/square.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/textures/square.png -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/textures/wood.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/textures/wood.png -------------------------------------------------------------------------------- /env_dependencies/mj_envs/dependencies/Adroit/resources/textures/woodb.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/dependencies/Adroit/resources/textures/woodb.png -------------------------------------------------------------------------------- /env_dependencies/mj_envs/mj_envs/__init__.py: -------------------------------------------------------------------------------- 1 | import mj_envs.hand_manipulation_suite -------------------------------------------------------------------------------- /env_dependencies/mj_envs/mj_envs/hand_manipulation_suite/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | from mjrl.envs.mujoco_env import MujocoEnv 3 | 4 | # Swing the door open 5 | register( 6 | id='door-v0', 7 | entry_point='mj_envs.hand_manipulation_suite:DoorEnvV0', 8 | max_episode_steps=200, 9 | ) 10 | from mj_envs.hand_manipulation_suite.door_v0 import DoorEnvV0 11 | 12 | # Hammer a nail into the board 13 | register( 14 | id='hammer-v0', 15 | entry_point='mj_envs.hand_manipulation_suite:HammerEnvV0', 16 | max_episode_steps=200, 17 | ) 18 | from mj_envs.hand_manipulation_suite.hammer_v0 import HammerEnvV0 19 | 20 | # Reposition a pen in hand 21 | register( 22 | id='pen-v0', 23 | entry_point='mj_envs.hand_manipulation_suite:PenEnvV0', 24 | max_episode_steps=100, 25 | ) 26 | from mj_envs.hand_manipulation_suite.pen_v0 import PenEnvV0 27 | 28 | # Relocate an object to the target 29 | register( 30 | id='relocate-v0', 31 | entry_point='mj_envs.hand_manipulation_suite:RelocateEnvV0', 32 | max_episode_steps=200, 33 | ) 34 | from mj_envs.hand_manipulation_suite.relocate_v0 import RelocateEnvV0 35 | -------------------------------------------------------------------------------- /env_dependencies/mj_envs/mj_envs/hand_manipulation_suite/assets/tasks.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mj_envs/mj_envs/hand_manipulation_suite/assets/tasks.jpg -------------------------------------------------------------------------------- /env_dependencies/mj_envs/mj_envs/utils/visualize_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import mj_envs 3 | import click 4 | import os 5 | import gym 6 | import numpy as np 7 | import pickle 8 | from mjrl.utils.gym_env import GymEnv 9 | from mjrl.policies.gaussian_mlp import MLP 10 | 11 | DESC = ''' 12 | Helper script to visualize policy (in mjrl format).\n 13 | USAGE:\n 14 | Visualizes policy on the env\n 15 | $ python visualize_env.py --env_name door-v0 \n 16 | $ python visualize_env.py --env_name door-v0 --policy my_policy.pickle --mode evaluation --episodes 10 \n 17 | ''' 18 | 19 | # MAIN ========================================================= 20 | @click.command(help=DESC) 21 | @click.option('--env_name', type=str, help='environment to load', required= True) 22 | @click.option('--policy', type=str, help='absolute path of the policy file', default=None) 23 | @click.option('--mode', type=str, help='exploration or evaluation mode for policy', default='evaluation') 24 | @click.option('--seed', type=int, help='seed for generating environment instances', default=123) 25 | @click.option('--episodes', type=int, help='number of episodes to visualize', default=10) 26 | 27 | def main(env_name, policy, mode, seed, episodes): 28 | e = GymEnv(env_name) 29
| e.set_seed(seed) 30 | if policy is not None: 31 | pi = pickle.load(open(policy, 'rb')) 32 | else: 33 | pi = MLP(e.spec, hidden_sizes=(32,32), seed=seed, init_log_std=-1.0) 34 | # render policy 35 | e.visualize_policy(pi, num_episodes=episodes, horizon=e.horizon, mode=mode) 36 | 37 | if __name__ == '__main__': 38 | main() -------------------------------------------------------------------------------- /env_dependencies/mj_envs/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from setuptools import setup, find_packages 4 | 5 | if sys.version_info.major != 3: 6 | print("This Python is only compatible with Python 3, but you are running " 7 | "Python {}. The installation will likely fail.".format(sys.version_info.major)) 8 | 9 | def read(fname): 10 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 11 | 12 | setup( 13 | name='mj_envs', 14 | version='1.0.0', 15 | packages=find_packages(), 16 | description='environments simulated in MuJoCo', 17 | long_description=read('README.md'), 18 | url='https://github.com/vikashplus/mj_envs.git', 19 | author='Movement Control Lab, UW', 20 | install_requires=[ 21 | 'click', 'termcolor', 22 | ], 23 | ) -------------------------------------------------------------------------------- /env_dependencies/mjrl/README.md: -------------------------------------------------------------------------------- 1 | # RL for MuJoCo 2 | 3 | This package contains implementations of various RL algorithms for continuous control tasks simulated with [MuJoCo](http://www.mujoco.org/). 4 | 5 | # Installation 6 | The main package dependencies are `MuJoCo`, `python=3.7`, `gym>=0.13`, `mujoco-py>=2.0`, and `pytorch>=1.0`. See `setup/README.md` ([link](https://github.com/aravindr93/mjrl/tree/master/setup#installation)) for detailed install instructions. 7 | 8 | # Bibliography 9 | If you find the package useful, please cite the following papers. 10 | ``` 11 | @INPROCEEDINGS{Rajeswaran-NIPS-17, 12 | AUTHOR = {Aravind Rajeswaran and Kendall Lowrey and Emanuel Todorov and Sham Kakade}, 13 | TITLE = "{Towards Generalization and Simplicity in Continuous Control}", 14 | BOOKTITLE = {NIPS}, 15 | YEAR = {2017}, 16 | } 17 | 18 | @INPROCEEDINGS{Rajeswaran-RSS-18, 19 | AUTHOR = {Aravind Rajeswaran AND Vikash Kumar AND Abhishek Gupta AND 20 | Giulia Vezzani AND John Schulman AND Emanuel Todorov AND Sergey Levine}, 21 | TITLE = "{Learning Complex Dexterous Manipulation with Deep Reinforcement Learning and Demonstrations}", 22 | BOOKTITLE = {Proceedings of Robotics: Science and Systems (RSS)}, 23 | YEAR = {2018}, 24 | } 25 | ``` 26 | 27 | # Credits 28 | This package is maintained by [Aravind Rajeswaran](http://homes.cs.washington.edu/~aravraj/) and other members of the [Movement Control Lab](http://homes.cs.washington.edu/~todorov/), University of Washington Seattle. 29 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | Here we provide a job script to illustrate policy optimization with incremental learning methods like NPG and PPO. To run the experiments, use the commands below.
The experiments are run through the provided job script, which takes two arguments: 4 | - `output`: path to directory where all the results will be saved 5 | - `config`: a config `.txt` file with all the experiment parameters (examples are provided) 6 | The script has to be run from this directory, i.e. `mjrl/examples` 7 | 8 | 1. To train an NPG agent on a task shipped with `mjrl` (e.g. swimmer) 9 | ``` 10 | $ python policy_opt_job_script.py --output swimmer_npg_exp --config example_configs/swimmer_npg.txt 11 | ``` 12 | 13 | 2. To train an NPG agent on an OpenAI gym benchmark task (e.g. hopper) 14 | ``` 15 | $ python policy_opt_job_script.py --output hopper_npg_exp --config example_configs/hopper_npg.txt 16 | ``` 17 | Note that since the Hopper env has termination conditions, we pick the sampling mode in the config to be `samples` rather than `trajectories`, so that per update we have 10K samples. 18 | 19 | 3. To train a PPO agent on the swimmer task 20 | ``` 21 | $ python policy_opt_job_script.py --output swimmer_ppo_exp --config example_configs/swimmer_ppo.txt 22 | ``` -------------------------------------------------------------------------------- /env_dependencies/mjrl/examples/behavior_clone.py: -------------------------------------------------------------------------------- 1 | from mjrl.utils.gym_env import GymEnv 2 | from mjrl.policies.gaussian_mlp import MLP 3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 4 | from mjrl.baselines.mlp_baseline import MLPBaseline 5 | from mjrl.algos.npg_cg import NPG 6 | from mjrl.algos.behavior_cloning import BC 7 | from mjrl.utils.train_agent import train_agent 8 | from mjrl.samplers.core import sample_paths 9 | import mjrl.envs 10 | import time as timer 11 | import pickle 12 | SEED = 500 13 | 14 | # ------------------------------ 15 | # Train expert policy first 16 | e = GymEnv('mjrl_swimmer-v0') 17 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 18 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=5, learn_rate=1e-3) 19 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True) 20 | 21 | ts = timer.time() 22 | print("========================================") 23 | print("Training expert policy") 24 | print("========================================") 25 | train_agent(job_name='swimmer_exp1', 26 | agent=agent, 27 | seed=SEED, 28 | niter=50, 29 | gamma=0.995, 30 | gae_lambda=0.97, 31 | num_cpu=1, 32 | sample_mode='trajectories', 33 | num_traj=10, 34 | save_freq=5, 35 | evaluation_rollouts=None) 36 | print("========================================") 37 | print("Expert policy training complete !!!") 38 | print("========================================") 39 | print("time taken = %f" % (timer.time()-ts)) 40 | print("========================================") 41 | 42 | # ------------------------------ 43 | # Get demonstrations 44 | print("========================================") 45 | print("Collecting expert demonstrations") 46 | print("========================================") 47 | expert_pol = pickle.load(open('swimmer_exp1/iterations/best_policy.pickle', 'rb')) 48 | demo_paths = sample_paths(num_traj=5, policy=expert_pol, env=e.env_id) 49 | 50 | # ------------------------------ 51 | # Train BC 52 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 53 | bc_agent = BC(demo_paths, policy=policy, epochs=20, batch_size=64, lr=1e-3) # will use Adam by default 54 | ts = timer.time() 55 | print("========================================") 56 | print("Running BC with expert
demonstrations") 57 | print("========================================") 58 | bc_agent.train() 59 | print("========================================") 60 | print("BC training complete !!!") 61 | print("time taken = %f" % (timer.time()-ts)) 62 | print("========================================") 63 | 64 | # ------------------------------ 65 | # Evaluate Policies 66 | bc_pol_score = e.evaluate_policy(policy, num_episodes=5, mean_action=True) 67 | expert_score = e.evaluate_policy(expert_pol, num_episodes=5, mean_action=True) 68 | print("Expert policy performance (eval mode) = %f" % expert_score[0][0]) 69 | print("BC policy performance (eval mode) = %f" % bc_pol_score[0][0]) 70 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/examples/example_configs/hopper_npg.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env' : 'Hopper-v3', 6 | 'algorithm' : 'NPG', 7 | 'seed' : 123, 8 | 'sample_mode' : 'samples', 9 | 'rl_num_samples' : 10000, 10 | 'rl_num_iter' : 100, 11 | 'num_cpu' : 1, 12 | 'save_freq' : 25, 13 | 'eval_rollouts' : None, 14 | 'exp_notes' : 'Example config for training policy with NPG on the OpenAI gym Hopper-v3 task.', 15 | 16 | # RL parameters (all params related to PG, value function etc.) 17 | 18 | 'policy_size' : (32, 32), 19 | 'init_log_std' : -0.5, 20 | 'vf_hidden_size' : (128, 128), 21 | 'vf_batch_size' : 64, 22 | 'vf_epochs' : 2, 23 | 'vf_learn_rate' : 1e-3, 24 | 'rl_step_size' : 0.05, 25 | 'rl_gamma' : 0.995, 26 | 'rl_gae' : 0.97, 27 | 28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used) 29 | 30 | 'alg_hyper_params' : dict(), 31 | 32 | } 33 | 34 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/examples/example_configs/swimmer_npg.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env' : 'mjrl_swimmer-v0', 6 | 'algorithm' : 'NPG', 7 | 'seed' : 123, 8 | 'sample_mode' : 'trajectories', 9 | 'rl_num_traj' : 10, 10 | 'rl_num_iter' : 50, 11 | 'num_cpu' : 2, 12 | 'save_freq' : 25, 13 | 'eval_rollouts' : None, 14 | 'exp_notes' : 'Example config for training policy with NPG on the mjrl swimmer task.', 15 | 16 | # RL parameters (all params related to PG, value function, DAPG etc.) 17 | 18 | 'policy_size' : (32, 32), 19 | 'init_log_std' : -0.5, 20 | 'vf_hidden_size' : (128, 128), 21 | 'vf_batch_size' : 64, 22 | 'vf_epochs' : 2, 23 | 'vf_learn_rate' : 1e-3, 24 | 'rl_step_size' : 0.1, 25 | 'rl_gamma' : 0.995, 26 | 'rl_gae' : 0.97, 27 | 28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used) 29 | 30 | 'alg_hyper_params' : dict(), 31 | 32 | } -------------------------------------------------------------------------------- /env_dependencies/mjrl/examples/example_configs/swimmer_ppo.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env' : 'mjrl_swimmer-v0', 6 | 'algorithm' : 'PPO', 7 | 'seed' : 123, 8 | 'sample_mode' : 'trajectories', 9 | 'rl_num_traj' : 10, 10 | 'rl_num_iter' : 50, 11 | 'num_cpu' : 2, 12 | 'save_freq' : 25, 13 | 'eval_rollouts' : None, 14 | 'exp_notes' : 'Example config for training policy with PPO on the mjrl swimmer task.', 15 | 16 | # RL parameters (all params related to PG, value function, DAPG etc.) 
17 | 18 | 'policy_size' : (32, 32), 19 | 'init_log_std' : -0.5, 20 | 'vf_hidden_size' : (128, 128), 21 | 'vf_batch_size' : 64, 22 | 'vf_epochs' : 2, 23 | 'vf_learn_rate' : 1e-3, 24 | 'rl_step_size' : 0.1, 25 | 'rl_gamma' : 0.995, 26 | 'rl_gae' : 0.97, 27 | 28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used) 29 | 30 | 'alg_hyper_params' : dict(clip_coef=0.2, epochs=10, mb_size=64, learn_rate=5e-4), 31 | 32 | } -------------------------------------------------------------------------------- /env_dependencies/mjrl/examples/linear_nn_comparison.py: -------------------------------------------------------------------------------- 1 | from mjrl.utils.gym_env import GymEnv 2 | from mjrl.policies.gaussian_mlp import MLP 3 | from mjrl.policies.gaussian_linear import LinearPolicy 4 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 5 | from mjrl.baselines.mlp_baseline import MLPBaseline 6 | from mjrl.algos.npg_cg import NPG 7 | from mjrl.utils.train_agent import train_agent 8 | import mjrl.envs 9 | import time as timer 10 | SEED = 500 11 | 12 | # NN policy 13 | # ================================== 14 | e = GymEnv('mjrl_swimmer-v0') 15 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 16 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3) 17 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True) 18 | 19 | ts = timer.time() 20 | train_agent(job_name='swimmer_nn_exp1', 21 | agent=agent, 22 | seed=SEED, 23 | niter=50, 24 | gamma=0.995, 25 | gae_lambda=0.97, 26 | num_cpu=1, 27 | sample_mode='trajectories', 28 | num_traj=10, 29 | save_freq=5, 30 | evaluation_rollouts=5) 31 | print("time taken for NN policy training = %f" % (timer.time()-ts)) 32 | 33 | 34 | # Linear policy 35 | # ================================== 36 | e = GymEnv('mjrl_swimmer-v0') 37 | policy = LinearPolicy(e.spec, seed=SEED) 38 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3) 39 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True) 40 | 41 | ts = timer.time() 42 | train_agent(job_name='swimmer_linear_exp1', 43 | agent=agent, 44 | seed=SEED, 45 | niter=50, 46 | gamma=0.995, 47 | gae_lambda=0.97, 48 | num_cpu=1, 49 | sample_mode='trajectories', 50 | num_traj=10, 51 | save_freq=5, 52 | evaluation_rollouts=5) 53 | print("time taken for linear policy training = %f" % (timer.time()-ts)) 54 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/__init__.py: -------------------------------------------------------------------------------- 1 | import mjrl.envs -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mjrl/mjrl/algos/__init__.py -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/algos/model_accel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mjrl/mjrl/algos/model_accel/__init__.py -------------------------------------------------------------------------------- 
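The `linear_nn_comparison.py` example above trains an NN policy and a linear policy on the same swimmer task with `save_logs=True`, so each job directory ends up with a training log that can be plotted afterwards. The sketch below compares the two runs with `mjrl.utils.make_train_plots` (the plotting utility that appears further down in this tree); the `<job_name>/logs/log.csv` location and the `eval_score`/`rollout_score` keys are assumptions about what `train_agent` writes, not something verified here.

```python
# Hypothetical follow-up to linear_nn_comparison.py (not a repository file):
# plot learning curves for the NN and linear swimmer runs side by side.
# Assumes train_agent wrote <job_name>/logs/log.csv for each job.
from mjrl.utils.make_train_plots import make_train_plots

for job in ('swimmer_nn_exp1', 'swimmer_linear_exp1'):
    make_train_plots(log_path=job + '/logs/log.csv',
                     keys=['eval_score', 'rollout_score'],
                     save_loc=job)  # writes eval_score.png / rollout_score.png into the job dir
```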
/env_dependencies/mjrl/mjrl/algos/model_accel/run_experiments/configs/point_mass.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env_name' : 'mjrl_point_mass-v0', 6 | 'seed' : 123, 7 | 'debug_mode' : False, 8 | 'num_iter' : 5, 9 | 'iter_samples' : 100, 10 | 'eval_rollouts' : 25, 11 | 'num_models' : 3, 12 | 'exp_notes' : 'Toy experiment for initial trial.', 13 | 'save_freq' : 1, 14 | 'device' : 'cpu', 15 | 'learn_reward' : False, 16 | 'reward_file' : 'utils/reward_functions/mjrl_point_mass.py', 17 | 18 | # dynamics learning 19 | 20 | 'hidden_size' : (256, 256), 21 | 'activation' : 'relu', 22 | 'fit_lr' : 1e-3, 23 | 'fit_wd' : 1e-5, 24 | 'buffer_size' : 10000, 25 | 'fit_mb_size' : 16, 26 | 'fit_epochs' : 25, 27 | 'refresh_fit' : False, 28 | 29 | # initial data 30 | 31 | 'init_log_std' : -0.5, 32 | 'min_log_std' : -2.0, 33 | 'init_samples' : 1000, 34 | 35 | # NPG params 36 | 37 | 'policy_size' : (32, 32), 38 | 'inner_steps' : 10, 39 | 'step_size' : 0.05, 40 | 'update_paths' : 250, 41 | 'start_state' : 'init', 42 | 'horizon' : 25, 43 | 44 | } 45 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/algos/model_accel/run_experiments/configs/reacher.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env_name' : 'mjrl_reacher_7dof-v0', 6 | 'seed' : 123, 7 | 'debug_mode' : False, 8 | 'num_iter' : 25, 9 | 'iter_samples' : 500, 10 | 'eval_rollouts' : 10, 11 | 'num_models' : 4, 12 | 'save_freq' : 1, 13 | 'device' : 'cpu', 14 | 15 | # dynamics learning 16 | 17 | 'hidden_size' : (256, 256), 18 | 'activation' : 'relu', 19 | 'fit_lr' : 1e-3, 20 | 'fit_wd' : 0.0, 21 | 'buffer_size' : 20000, 22 | 'fit_mb_size' : 64, 23 | 'fit_epochs' : 20, 24 | 'refresh_fit' : False, 25 | 26 | # initial data 27 | 28 | 'init_log_std' : -0.5, 29 | 'min_log_std' : -2.5, 30 | 'init_samples' : 2500, 31 | 'init_policy' : None, 32 | 33 | 34 | # NPG params 35 | 36 | 'policy_size' : (64, 64), 37 | 'inner_steps' : 5, 38 | 'step_size' : 0.05, 39 | 'update_paths' : 250, 40 | 'start_state' : 'init', 41 | 'horizon' : 50, 42 | 43 | } -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/algos/model_accel/run_experiments/sandbox/example_config_mpc.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env_name' : 'mjrl_point_mass-v0', 6 | 'seed' : 123, 7 | 'debug_mode' : False, 8 | 'num_iter' : 5, 9 | 'paths_per_iter': 5, 10 | 'eval_rollouts' : 10, 11 | 'num_models' : 3, 12 | 'exp_notes' : 'Toy experiment for initial trial.', 13 | 'save_freq' : 5, 14 | 'device' : 'cpu', 15 | 16 | # dynamics learning 17 | 18 | 'hidden_size' : (64, 64), 19 | 'activation' : 'relu', 20 | 'fit_lr' : 1e-3, 21 | 'fit_wd' : 1e-5, 22 | 'max_paths' : 1000, 23 | 'fit_mb_size' : 16, 24 | 'fit_epochs' : 25, 25 | 'refresh_fit' : True, 26 | 27 | # initial data 28 | 29 | 'init_log_std' : -0.5, 30 | 'n_init_paths' : 25, 31 | 'use_demos' : False, 32 | 'demo_file' : None, 33 | 34 | # model predictive control 35 | 36 | 'noisy_mpc' : True, # when collecting data for exploration 37 | 'noise_level' : 0.1, 38 | 'filter_coefs' : {'f1': 0.5, 'f2': 1.0, 'f3': 0.0, 'f4': 0.0}, 39 | 'plan_paths' : 200, 40 | 'plan_horizon' : 10, 41 | 'kappa' : 2.0, 42 | 'omega' : 0.0, 43 | 44 | } 45 | 
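The `model_accel` configs above, like the policy-optimization configs under `examples/example_configs`, are commented Python dict literals stored as `.txt` files rather than YAML or JSON. A minimal, hedged sketch of loading one is shown below; the `eval`-then-`json` fallback mirrors what `visualize_trajectories.py` does further down, and the relative path assumes the script is launched from the `run_experiments` directory.

```python
# Minimal sketch (not a repository file): load one of the dict-literal
# config .txt files above and read a few of its fields.
import json

def load_config(path):
    """Parse a commented Python-dict config such as configs/point_mass.txt."""
    with open(path, 'r') as f:
        text = f.read()
    try:
        return eval(text)        # the configs are Python dict literals with comments
    except Exception:
        return json.loads(text)  # same fallback used by visualize_trajectories.py below

if __name__ == '__main__':
    cfg = load_config('configs/point_mass.txt')  # path relative to run_experiments/
    print(cfg['env_name'], cfg['num_iter'], cfg['hidden_size'], cfg['horizon'])
```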
-------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/algos/model_accel/run_experiments/utils/reward_functions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mjrl/mjrl/algos/model_accel/run_experiments/utils/reward_functions/__init__.py -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/algos/model_accel/run_experiments/utils/reward_functions/mjrl_point_mass.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def reward_function(paths): 4 | # path has two keys: observations and actions 5 | # path["observations"] : (num_traj, horizon, obs_dim) 6 | # return paths that contain rewards in path["rewards"] 7 | # path["rewards"] should have shape (num_traj, horizon) 8 | obs = paths["observations"] 9 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 10 | agent_pos = obs[:, :, :2] 11 | target_pos = obs[:, :, -2:] 12 | l1_dist = np.sum(np.abs(agent_pos - target_pos), axis=-1) 13 | l2_dist = np.linalg.norm(agent_pos - target_pos, axis=-1) 14 | rewards = -1.0 * l1_dist - 0.5 * l2_dist 15 | rewards[..., :-1] = rewards[..., 1:] # shift index by 1 to have r(s,a)=r(s') 16 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 17 | return paths 18 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/algos/model_accel/run_experiments/utils/visualize_policy.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import mjrl.envs 3 | import trajopt.envs 4 | import mj_envs 5 | import click 6 | import os 7 | import gym 8 | import numpy as np 9 | import pickle 10 | import torch 11 | from mjrl.utils.gym_env import GymEnv 12 | from mjrl.policies.gaussian_mlp import MLP 13 | import trajopt.envs 14 | 15 | DESC = ''' 16 | Helper script to visualize policy (in mjrl format).\n 17 | USAGE:\n 18 | Visualizes policy on the env\n 19 | $ python utils/visualize_policy --env_name mjrl_swimmer-v0 --policy my_policy.pickle --mode evaluation --episodes 10 \n 20 | ''' 21 | 22 | # MAIN ========================================================= 23 | @click.command(help=DESC) 24 | @click.option('--env_name', type=str, help='environment to load', required= True) 25 | @click.option('--policy', type=str, help='absolute path of the policy file', default=None) 26 | @click.option('--mode', type=str, help='exploration or evaluation mode for policy', default='evaluation') 27 | @click.option('--seed', type=int, help='seed for generating environment instances', default=123) 28 | @click.option('--episodes', type=int, help='number of episodes to visualize', default=10) 29 | @click.option('--log_std', type=float, default=-0.5) 30 | @click.option('--terminate', type=bool, default=True) 31 | @click.option('--device_path', type=str, default=None) 32 | def main(env_name, policy, mode, seed, episodes, log_std, terminate, device_path): 33 | render = True 34 | 35 | # TODO(Aravind): Map to hardware if device_path is specified 36 | 37 | e = GymEnv(env_name) 38 | e.set_seed(seed) 39 | np.random.seed(seed) 40 | torch.manual_seed(seed) 41 | if policy is not None: 42 | policy = pickle.load(open(policy, 'rb')) 43 | else: 44 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=seed, 
init_log_std=log_std) 45 | 46 | for ep in range(episodes): 47 | o = e.reset() 48 | rew = 0.0 49 | t = 0 50 | done = False 51 | while t < e.horizon and done is False: 52 | o = e.get_obs() 53 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation'] 54 | next_o, r, done, ifo = e.step(a) 55 | if terminate is False: 56 | done = False 57 | rew = rew + r 58 | t = t + 1 59 | if render: 60 | e.render() 61 | if done and t < e.horizon - 1: 62 | print("Episode terminated early") 63 | print("episode score = %f " % rew) 64 | 65 | e.reset() 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/algos/model_accel/run_experiments/utils/visualize_trajectories.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import click 3 | import json 4 | import numpy as np 5 | import torch 6 | import mjrl.envs 7 | import trajopt.envs 8 | import mj_envs 9 | import mjrl.utils.tensor_utils as tensor_utils 10 | 11 | from mjrl.utils.gym_env import GymEnv 12 | from mjrl.algos.model_accel.sampling import evaluate_policy 13 | 14 | DESC = ''' 15 | Helper script to visualize optimized trajectories (list of trajectories in trajopt format).\n 16 | USAGE:\n 17 | $ python viz_trajectories.py --file path_to_file.pickle\n 18 | ''' 19 | @click.command(help=DESC) 20 | @click.option('--file', type=str, help='pickle file with trajectories', required= True) 21 | @click.option('--seed', type=int, default=123) 22 | @click.option('--noise_level', type=float, default=0.0) 23 | @click.option('--num_episodes', type=int, help='number of times to play trajectories', default=5) 24 | @click.option('--config', type=str, help='if provided MPC params from here will be used.', default=None) 25 | @click.option('--device_path', type=str, default=None) 26 | def main(file, seed, noise_level, num_episodes, config, device_path): 27 | exp_data = pickle.load(open(file, 'rb')) 28 | policy = exp_data['policy'] 29 | model = exp_data['fitted_model'] 30 | model = model[-1] if type(model) == list else model 31 | env_id = policy.env.env_id 32 | render = True 33 | 34 | # TODO(Aravind): Map to hardware if device_path is specified 35 | 36 | env = GymEnv(env_id) 37 | policy.env = env 38 | 39 | env.set_seed(seed) 40 | np.random.seed(seed) 41 | torch.manual_seed(seed) 42 | 43 | if config is not None: 44 | try: 45 | with open(config, 'r') as f: 46 | config = eval(f.read()) 47 | except: 48 | with open(config, 'r') as f: 49 | config = json.load(f) 50 | policy.plan_horizon = config['plan_horizon'] 51 | policy.num_traj = config['plan_paths'] 52 | policy.kappa = config['kappa'] 53 | policy.filter_coefs = [config['filter_coefs'][k] for k in ['f1', 'f2', 'f3', 'f4']] 54 | policy.omega = config['omega'] if 'omega' in config.keys() else 0.0 55 | 56 | # TODO(Aravind): Implement capability to set predicted state for rendering purposes 57 | # evaluate_policy(env, policy, model, noise_level, real_step=False, num_episodes=num_episodes, visualize=render) 58 | evaluate_policy(env, policy, model, noise_level, real_step=True, num_episodes=num_episodes, visualize=render) 59 | 60 | # final close out 61 | env.reset() 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/baselines/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mjrl/mjrl/baselines/__init__.py -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/baselines/linear_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | 5 | class LinearBaseline: 6 | def __init__(self, env_spec, inp_dim=None, inp='obs', reg_coeff=1e-5): 7 | self.inp = inp 8 | self._reg_coeff = reg_coeff 9 | self._coeffs = None 10 | 11 | def _features(self, paths): 12 | if self.inp == 'env_features': 13 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths]) 14 | else: 15 | o = np.concatenate([path["observations"] for path in paths]) 16 | o = np.clip(o, -10, 10)/10.0 17 | if o.ndim > 2: 18 | o = o.reshape(o.shape[0], -1) 19 | N, n = o.shape 20 | num_feat = int( n + 1 + 4 ) # linear + bias (1.0) + time till pow 4 21 | feat_mat = np.ones((N, num_feat)) 22 | 23 | # linear features 24 | feat_mat[:,:n] = o 25 | 26 | k = 0 # start from this row 27 | for i in range(len(paths)): 28 | l = len(paths[i]["rewards"]) 29 | al = np.arange(l)/1000.0 30 | for j in range(4): 31 | feat_mat[k:k+l, -4+j] = al**(j+1) 32 | k += l 33 | 34 | return feat_mat 35 | 36 | def fit(self, paths, return_errors=False): 37 | 38 | featmat = self._features(paths) 39 | returns = np.concatenate([path["returns"] for path in paths]) 40 | 41 | if return_errors: 42 | predictions = featmat.dot(self._coeffs) if self._coeffs is not None else np.zeros(returns.shape) 43 | errors = returns - predictions 44 | error_before = np.sum(errors**2)/np.sum(returns**2) 45 | 46 | reg_coeff = copy.deepcopy(self._reg_coeff) 47 | for _ in range(10): 48 | self._coeffs = np.linalg.lstsq( 49 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]), 50 | featmat.T.dot(returns) 51 | )[0] 52 | if not np.any(np.isnan(self._coeffs)): 53 | break 54 | reg_coeff *= 10 55 | 56 | if return_errors: 57 | predictions = featmat.dot(self._coeffs) 58 | errors = returns - predictions 59 | error_after = np.sum(errors**2)/np.sum(returns**2) 60 | return error_before, error_after 61 | 62 | def predict(self, path): 63 | if self._coeffs is None: 64 | return np.zeros(len(path["rewards"])) 65 | return self._features([path]).dot(self._coeffs) 66 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/baselines/mlp_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | from mjrl.utils.optimize_model import fit_data 7 | 8 | import pickle 9 | 10 | class MLPBaseline: 11 | def __init__(self, env_spec, inp_dim=None, inp='obs', learn_rate=1e-3, reg_coef=0.0, 12 | batch_size=64, epochs=1, use_gpu=False, hidden_sizes=(128, 128)): 13 | self.n = inp_dim if inp_dim is not None else env_spec.observation_dim 14 | self.batch_size = batch_size 15 | self.epochs = epochs 16 | self.reg_coef = reg_coef 17 | self.use_gpu = use_gpu 18 | self.inp = inp 19 | self.hidden_sizes = hidden_sizes 20 | 21 | self.model = nn.Sequential() 22 | layer_sizes = (self.n + 4, ) + hidden_sizes + (1, ) 23 | for i in range(len(layer_sizes) - 1): 24 | layer_id = 'fc_' + str(i) 25 | relu_id = 'relu_' + str(i) 26 | self.model.add_module(layer_id, nn.Linear(layer_sizes[i], layer_sizes[i+1])) 27 | if i != 
len(layer_sizes) - 2: 28 | self.model.add_module(relu_id, nn.ReLU()) 29 | 30 | if self.use_gpu: 31 | self.model.cuda() 32 | 33 | self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learn_rate, weight_decay=reg_coef) 34 | self.loss_function = torch.nn.MSELoss() 35 | 36 | def _features(self, paths): 37 | if self.inp == 'env_features': 38 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths]) 39 | else: 40 | o = np.concatenate([path["observations"] for path in paths]) 41 | o = np.clip(o, -10, 10)/10.0 42 | if o.ndim > 2: 43 | o = o.reshape(o.shape[0], -1) 44 | N, n = o.shape 45 | num_feat = int( n + 4 ) # linear + time till pow 4 46 | feat_mat = np.ones((N, num_feat)) # memory allocation 47 | 48 | # linear features 49 | feat_mat[:,:n] = o 50 | 51 | k = 0 # start from this row 52 | for i in range(len(paths)): 53 | l = len(paths[i]["rewards"]) 54 | al = np.arange(l)/1000.0 55 | for j in range(4): 56 | feat_mat[k:k+l, -4+j] = al**(j+1) 57 | k += l 58 | return feat_mat 59 | 60 | 61 | def fit(self, paths, return_errors=False): 62 | 63 | featmat = self._features(paths) 64 | returns = np.concatenate([path["returns"] for path in paths]).reshape(-1, 1) 65 | featmat = featmat.astype('float32') 66 | returns = returns.astype('float32') 67 | num_samples = returns.shape[0] 68 | 69 | # Make variables with the above data 70 | if self.use_gpu: 71 | featmat_var = Variable(torch.from_numpy(featmat).cuda(), requires_grad=False) 72 | returns_var = Variable(torch.from_numpy(returns).cuda(), requires_grad=False) 73 | else: 74 | featmat_var = Variable(torch.from_numpy(featmat), requires_grad=False) 75 | returns_var = Variable(torch.from_numpy(returns), requires_grad=False) 76 | 77 | if return_errors: 78 | if self.use_gpu: 79 | predictions = self.model(featmat_var).cpu().data.numpy().ravel() 80 | else: 81 | predictions = self.model(featmat_var).data.numpy().ravel() 82 | errors = returns.ravel() - predictions 83 | error_before = np.sum(errors**2)/(np.sum(returns**2) + 1e-8) 84 | 85 | epoch_losses = fit_data(self.model, featmat_var, returns_var, self.optimizer, 86 | self.loss_function, self.batch_size, self.epochs) 87 | 88 | if return_errors: 89 | if self.use_gpu: 90 | predictions = self.model(featmat_var).cpu().data.numpy().ravel() 91 | else: 92 | predictions = self.model(featmat_var).data.numpy().ravel() 93 | errors = returns.ravel() - predictions 94 | error_after = np.sum(errors**2)/(np.sum(returns**2) + 1e-8) 95 | return error_before, error_after 96 | 97 | def predict(self, path): 98 | featmat = self._features([path]).astype('float32') 99 | if self.use_gpu: 100 | feat_var = Variable(torch.from_numpy(featmat).float().cuda(), requires_grad=False) 101 | prediction = self.model(feat_var).cpu().data.numpy().ravel() 102 | else: 103 | feat_var = Variable(torch.from_numpy(featmat).float(), requires_grad=False) 104 | prediction = self.model(feat_var).data.numpy().ravel() 105 | return prediction 106 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/baselines/quadratic_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | class QuadraticBaseline: 5 | def __init__(self, env_spec, inp_dim=None, inp='obs', reg_coeff=1e-3): 6 | self.n = inp_dim if inp_dim is not None else env_spec.observation_dim 7 | self.inp = inp 8 | self._reg_coeff = reg_coeff 9 | self._coeffs = None 10 | 11 | def _features(self, paths): 12 | if self.inp == 'env_features': 13 | o = 
np.concatenate([path["env_infos"]["env_features"][0] for path in paths]) 14 | else: 15 | o = np.concatenate([path["observations"] for path in paths]) 16 | o = np.clip(o, -10, 10)/10.0 17 | if o.ndim > 2: 18 | o = o.reshape(o.shape[0], -1) 19 | N, n = o.shape 20 | num_feat = int( n + n*(n+1)/2 + 1 + 4 ) # linear + full quad (symmetric matrix) + bias (1.0) + time till pow 4 21 | feat_mat = np.ones((N, num_feat)) # memory allocation 22 | 23 | # linear features 24 | feat_mat[:,:n] = o 25 | 26 | # quadratic features 27 | k = n # starting from this column in feat_mat 28 | for i in range(n): 29 | for j in range(i, n): 30 | feat_mat[:,k] = o[:,i]*o[:,j] # element-wise product 31 | k += 1 32 | 33 | k = 0 # start from this row 34 | for i in range(len(paths)): 35 | l = len(paths[i]["rewards"]) 36 | al = np.arange(l)/1000.0 37 | for j in range(4): 38 | feat_mat[k:k+l, -4+j] = al**(j+1) 39 | k += l 40 | 41 | return feat_mat 42 | 43 | 44 | def fit(self, paths, return_errors=False): 45 | 46 | #featmat = np.concatenate([self._features(path) for path in paths]) 47 | featmat = self._features(paths) 48 | returns = np.concatenate([path["returns"] for path in paths]) 49 | 50 | if return_errors: 51 | predictions = featmat.dot(self._coeffs) if self._coeffs is not None else np.zeros(returns.shape) 52 | errors = returns - predictions 53 | error_before = np.sum(errors**2)/np.sum(returns**2) 54 | 55 | reg_coeff = copy.deepcopy(self._reg_coeff) 56 | for _ in range(10): 57 | self._coeffs = np.linalg.lstsq( 58 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]), 59 | featmat.T.dot(returns) 60 | )[0] 61 | if not np.any(np.isnan(self._coeffs)): 62 | break 63 | reg_coeff *= 10 64 | 65 | if return_errors: 66 | predictions = featmat.dot(self._coeffs) 67 | errors = returns - predictions 68 | error_after = np.sum(errors**2)/np.sum(returns**2) 69 | return error_before, error_after 70 | 71 | def predict(self, path): 72 | if self._coeffs is None: 73 | return np.zeros(len(path["rewards"])) 74 | return self._features([path]).dot(self._coeffs) 75 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/baselines/zero_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | class ZeroBaseline: 5 | def __init__(self, env_spec, **kwargs): 6 | n = env_spec.observation_dim # number of states 7 | self._coeffs = None 8 | 9 | def fit(self, paths, return_errors=False): 10 | if return_errors: 11 | return 1.0, 1.0 12 | 13 | def predict(self, path): 14 | return np.zeros(len(path["rewards"])) 15 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | # ---------------------------------------- 4 | # mjrl environments 5 | # ---------------------------------------- 6 | 7 | register( 8 | id='mjrl_point_mass-v0', 9 | entry_point='mjrl.envs:PointMassEnv', 10 | max_episode_steps=25, 11 | ) 12 | 13 | register( 14 | id='mjrl_swimmer-v0', 15 | entry_point='mjrl.envs:SwimmerEnv', 16 | max_episode_steps=500, 17 | ) 18 | 19 | register( 20 | id='mjrl_reacher_7dof-v0', 21 | entry_point='mjrl.envs:Reacher7DOFEnv', 22 | max_episode_steps=50, 23 | ) 24 | 25 | register( 26 | id='mjrl_peg_insertion-v0', 27 | entry_point='mjrl.envs:PegEnv', 28 | max_episode_steps=50, 29 | ) 30 | 31 | from 
mjrl.envs.mujoco_env import MujocoEnv 32 | # ^^^^^ so that user gets the correct error 33 | # message if mujoco is not installed correctly 34 | from mjrl.envs.point_mass import PointMassEnv 35 | from mjrl.envs.swimmer import SwimmerEnv 36 | from mjrl.envs.reacher_sawyer import Reacher7DOFEnv 37 | from mjrl.envs.peg_insertion_sawyer import PegEnv 38 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/envs/assets/point_mass.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 44 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/envs/assets/swimmer.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 67 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/envs/point_mass.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | 7 | class PointMassEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | self.agent_bid = 0 10 | self.target_sid = 0 11 | utils.EzPickle.__init__(self) 12 | mujoco_env.MujocoEnv.__init__(self, 'point_mass.xml', 5) 13 | self.agent_bid = self.sim.model.body_name2id('agent') 14 | self.target_sid = self.sim.model.site_name2id('target') 15 | 16 | def step(self, a): 17 | self.do_simulation(a, self.frame_skip) 18 | obs = self.get_obs() 19 | reward = self.get_reward(obs) 20 | return obs, reward, False, dict(solved=(reward > -0.1), state=self.get_env_state()) 21 | 22 | def get_obs(self): 23 | agent_pos = self.data.body_xpos[self.agent_bid].ravel() 24 | target_pos = self.data.site_xpos[self.target_sid].ravel() 25 | return np.concatenate([agent_pos[:2], self.data.qvel.ravel(), target_pos[:2]]) 26 | 27 | def get_reward(self, obs, act=None): 28 | if len(obs.shape) == 1: 29 | # vector obs, called when stepping the env 30 | agent_pos = obs[:2] 31 | target_pos = obs[-2:] 32 | l1_dist = np.sum(np.abs(agent_pos - target_pos)) 33 | l2_dist = np.linalg.norm(agent_pos - target_pos) 34 | else: 35 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 36 | agent_pos = obs[:, :, :2] 37 | target_pos = obs[:, :, -2:] 38 | l1_dist = np.sum(np.abs(agent_pos - target_pos), axis=-1) 39 | l2_dist = np.linalg.norm(agent_pos - target_pos, axis=-1) 40 | reward = -1.0 * l1_dist - 0.5 * l2_dist 41 | return reward 42 | 43 | def compute_path_rewards(self, paths): 44 | # path has two keys: observations and actions 45 | # path["observations"] : (num_traj, horizon, obs_dim) 46 | # path["rewards"] should have shape (num_traj, horizon) 47 | obs = paths["observations"] 48 | rewards = self.get_reward(obs) 49 | rewards[..., :-1] = rewards[..., 1:] # shift index by 1 to have r(s,a)=r(s') 50 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 51 | return paths 52 | 53 | def reset_model(self): 54 | # randomize the agent and goal 55 | agent_x = self.np_random.uniform(low=-1.0, high=1.0) 56 | agent_y = self.np_random.uniform(low=-1.0, high=1.0) 57 | goal_x = self.np_random.uniform(low=-1.0, high=1.0) 58 | goal_y = self.np_random.uniform(low=-1.0, high=1.0) 59 | qp = np.array([agent_x, agent_y]) 60 | qv = self.init_qvel.copy() 61 | self.set_state(qp, qv) 62 | 
self.model.site_pos[self.target_sid][0] = goal_x 63 | self.model.site_pos[self.target_sid][1] = goal_y 64 | self.sim.forward() 65 | return self.get_obs() 66 | 67 | def evaluate_success(self, paths, logger=None): 68 | success = 0.0 69 | for p in paths: 70 | if np.mean(p['env_infos']['solved'][-4:]) > 0.0: 71 | success += 1.0 72 | success_rate = 100.0*success/len(paths) 73 | if logger is None: 74 | # nowhere to log so return the value 75 | return success_rate 76 | else: 77 | # log the success 78 | # can log multiple statistics here if needed 79 | logger.log_kv('success_rate', success_rate) 80 | return None 81 | 82 | # -------------------------------- 83 | # get and set states 84 | # -------------------------------- 85 | 86 | def get_env_state(self): 87 | target_pos = self.model.site_pos[self.target_sid].copy() 88 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(), 89 | target_pos=target_pos) 90 | 91 | def set_env_state(self, state): 92 | self.sim.reset() 93 | qp = state['qp'].copy() 94 | qv = state['qv'].copy() 95 | target_pos = state['target_pos'] 96 | self.set_state(qp, qv) 97 | self.model.site_pos[self.target_sid] = target_pos 98 | self.sim.forward() 99 | 100 | # -------------------------------- 101 | # utility functions 102 | # -------------------------------- 103 | 104 | def get_env_infos(self): 105 | return dict(state=self.get_env_state()) 106 | 107 | def mj_viewer_setup(self): 108 | self.viewer = MjViewer(self.sim) 109 | self.sim.forward() 110 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/envs/reacher_sawyer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | 7 | class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | self.hand_sid = -2 10 | self.target_sid = -1 11 | mujoco_env.MujocoEnv.__init__(self, 'sawyer.xml', 4) 12 | utils.EzPickle.__init__(self) 13 | self.hand_sid = self.model.site_name2id("finger") 14 | self.target_sid = self.model.site_name2id("target") 15 | 16 | def step(self, a): 17 | self.do_simulation(a, self.frame_skip) 18 | obs = self.get_obs() 19 | reward = self.get_reward(obs, a) 20 | return obs, reward, False, self.get_env_infos() 21 | 22 | def get_obs(self): 23 | return np.concatenate([ 24 | self.data.qpos.flat, 25 | self.data.qvel.ravel() * self.dt, # delta_x instead of velocity 26 | self.data.site_xpos[self.hand_sid], 27 | self.data.site_xpos[self.target_sid], 28 | ]) 29 | 30 | def get_reward(self, obs, act=None): 31 | obs = np.clip(obs, -10.0, 10.0) 32 | if len(obs.shape) == 1: 33 | # vector obs, called when stepping the env 34 | hand_pos = obs[-6:-3] 35 | target_pos = obs[-3:] 36 | l1_dist = np.sum(np.abs(hand_pos - target_pos)) 37 | l2_dist = np.linalg.norm(hand_pos - target_pos) 38 | else: 39 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 40 | hand_pos = obs[:, :, -6:-3] 41 | target_pos = obs[:, :, -3:] 42 | l1_dist = np.sum(np.abs(hand_pos - target_pos), axis=-1) 43 | l2_dist = np.linalg.norm(hand_pos - target_pos, axis=-1) 44 | reward = - l1_dist - 5.0 * l2_dist 45 | return reward 46 | 47 | def compute_path_rewards(self, paths): 48 | # path has two keys: observations and actions 49 | # path["observations"] : (num_traj, horizon, obs_dim) 50 | # path["rewards"] should have shape (num_traj, horizon) 51 | obs = paths["observations"] 52 | rewards = self.get_reward(obs) 53 | 
paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 54 | 55 | # -------------------------------- 56 | # resets and randomization 57 | # -------------------------------- 58 | 59 | def robot_reset(self): 60 | self.set_state(self.init_qpos, self.init_qvel) 61 | 62 | def target_reset(self): 63 | target_pos = np.array([0.1, 0.1, 0.1]) 64 | target_pos[0] = self.np_random.uniform(low=-0.3, high=0.3) 65 | target_pos[1] = self.np_random.uniform(low=-0.2, high=0.2) 66 | target_pos[2] = self.np_random.uniform(low=-0.25, high=0.25) 67 | self.model.site_pos[self.target_sid] = target_pos 68 | self.sim.forward() 69 | 70 | def reset_model(self, seed=None): 71 | if seed is not None: 72 | self.seeding = True 73 | self.seed(seed) 74 | self.robot_reset() 75 | self.target_reset() 76 | return self.get_obs() 77 | 78 | # -------------------------------- 79 | # get and set states 80 | # -------------------------------- 81 | 82 | def get_env_state(self): 83 | target_pos = self.model.site_pos[self.target_sid].copy() 84 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(), 85 | target_pos=target_pos) 86 | 87 | def set_env_state(self, state): 88 | self.sim.reset() 89 | qp = state['qp'].copy() 90 | qv = state['qv'].copy() 91 | target_pos = state['target_pos'] 92 | self.model.site_pos[self.target_sid] = target_pos 93 | self.data.qpos[:] = qp 94 | self.data.qvel[:] = qv 95 | self.sim.forward() 96 | 97 | # -------------------------------- 98 | # utility functions 99 | # -------------------------------- 100 | 101 | def get_env_infos(self): 102 | return dict(state=self.get_env_state()) 103 | 104 | def mj_viewer_setup(self): 105 | self.viewer = MjViewer(self.sim) 106 | self.viewer.cam.trackbodyid = 1 107 | self.viewer.cam.type = 1 108 | self.sim.forward() 109 | self.viewer.cam.distance = self.model.stat.extent * 2.0 110 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/envs/swimmer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, 'swimmer.xml', 5) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, a): 12 | xposbefore = self.data.qpos[0] 13 | self.do_simulation(a, self.frame_skip) 14 | xposafter = self.data.qpos[0] 15 | 16 | delta = (xposafter - xposbefore) 17 | # make agent move in the negative x direction 18 | reward = -10.0 * delta 19 | done = False 20 | 21 | ob = self.get_obs() 22 | return ob, reward, done, self.get_env_infos() 23 | 24 | def get_obs(self): 25 | return np.concatenate([ 26 | self.data.qpos.flat[2:], 27 | self.data.qvel.flat, 28 | ]) 29 | 30 | def reset_model(self): 31 | qpos_init = self.init_qpos.copy() 32 | qpos_init[2] = self.np_random.uniform(low=-np.pi, high=np.pi) 33 | self.set_state(qpos_init, self.init_qvel) 34 | self.sim.forward() 35 | return self.get_obs() 36 | 37 | # -------------------------------- 38 | # get and set states 39 | # -------------------------------- 40 | 41 | def get_env_state(self): 42 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy()) 43 | 44 | def set_env_state(self, state): 45 | self.sim.reset() 46 | qp = state['qp'].copy() 47 | qv = state['qv'].copy() 48 | self.set_state(qp, qv) 49 | self.sim.forward() 50 | 51 | # -------------------------------- 52 | # utility functions 53 | # 
-------------------------------- 54 | 55 | def get_env_infos(self): 56 | return dict(state=self.get_env_state()) 57 | 58 | def mj_viewer_setup(self): 59 | self.viewer = MjViewer(self.sim) 60 | self.viewer.cam.trackbodyid = 1 61 | self.viewer.cam.type = 1 62 | self.sim.forward() 63 | self.viewer.cam.distance = self.model.stat.extent*1.2 -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mjrl/mjrl/policies/__init__.py -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/policies/mpc_actor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from trajopt.utils import gather_paths_parallel 3 | 4 | 5 | class MPCActor(object): 6 | def __init__(self, env, H, paths_per_cpu, 7 | num_cpu=1, 8 | kappa=1.0, 9 | gamma=1.0, 10 | mean=None, 11 | filter_coefs=None, 12 | seed=123, 13 | ): 14 | 15 | self.env, self.seed = env, seed 16 | self.n, self.m = env.observation_dim, env.action_dim 17 | self.H, self.paths_per_cpu, self.num_cpu = H, paths_per_cpu, num_cpu 18 | 19 | self.mean, self.filter_coefs, self.kappa, self.gamma = mean, filter_coefs, kappa, gamma 20 | if mean is None: 21 | self.mean = np.zeros(self.m) 22 | if filter_coefs is None: 23 | self.filter_coefs = [np.ones(self.m), 1.0, 0.0, 0.0] 24 | 25 | self.env.reset() 26 | self.env.set_seed(seed) 27 | self.env.reset(seed=seed) 28 | self.act_sequence = np.ones((self.H, self.m)) * self.mean 29 | self.ctr = 1 30 | 31 | def score_trajectory(self, paths): 32 | scores = np.zeros(len(paths)) 33 | for i in range(len(paths)): 34 | scores[i] = 0.0 35 | for t in range(paths[i]["rewards"].shape[0]): 36 | scores[i] += (self.gamma**t)*paths[i]["rewards"][t] 37 | return scores 38 | 39 | def get_action(self, env_state): 40 | # Set to env_state 41 | # Shoot trajectories 42 | # Return optimal action 43 | seed = self.seed + self.ctr * 1000 44 | paths = gather_paths_parallel(self.env.env_id, 45 | env_state, 46 | self.act_sequence, 47 | self.filter_coefs, 48 | seed, 49 | self.paths_per_cpu, 50 | self.num_cpu, 51 | ) 52 | 53 | num_traj = len(paths) 54 | R = self.score_trajectory(paths) 55 | S = np.exp(self.kappa*(R-np.max(R))) 56 | act = np.sum([paths[i]["actions"][0] * S[i] for i in range(num_traj)], axis=0) 57 | act = act / (np.sum(S) + 1e-6) 58 | return act -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mjrl/mjrl/samplers/__init__.py -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TEA-Lab/diffusion_reward/384ec68aa3d590f6266cff3ffb62b6c9135111b9/env_dependencies/mjrl/mjrl/utils/__init__.py -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/utils/cg_solve.py: -------------------------------------------------------------------------------- 1 | import numpy as np 
2 | 3 | def cg_solve(f_Ax, b, x_0=None, cg_iters=10, residual_tol=1e-10): 4 | x = np.zeros_like(b) #if x_0 is None else x_0 5 | r = b.copy() #if x_0 is None else b-f_Ax(x_0) 6 | p = r.copy() 7 | rdotr = r.dot(r) 8 | 9 | for i in range(cg_iters): 10 | z = f_Ax(p) 11 | v = rdotr / p.dot(z) 12 | x += v * p 13 | r -= v * z 14 | newrdotr = r.dot(r) 15 | mu = newrdotr / rdotr 16 | p = r + mu * p 17 | 18 | rdotr = newrdotr 19 | if rdotr < residual_tol: 20 | break 21 | 22 | return x 23 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/utils/fc_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class FCNetwork(nn.Module): 7 | def __init__(self, obs_dim, act_dim, 8 | hidden_sizes=(64,64), 9 | nonlinearity='tanh', # either 'tanh' or 'relu' 10 | in_shift = None, 11 | in_scale = None, 12 | out_shift = None, 13 | out_scale = None): 14 | super(FCNetwork, self).__init__() 15 | 16 | self.obs_dim = obs_dim 17 | self.act_dim = act_dim 18 | assert type(hidden_sizes) == tuple 19 | self.layer_sizes = (obs_dim, ) + hidden_sizes + (act_dim, ) 20 | self.set_transformations(in_shift, in_scale, out_shift, out_scale) 21 | 22 | # hidden layers 23 | self.fc_layers = nn.ModuleList([nn.Linear(self.layer_sizes[i], self.layer_sizes[i+1]) \ 24 | for i in range(len(self.layer_sizes) -1)]) 25 | self.nonlinearity = torch.relu if nonlinearity == 'relu' else torch.tanh 26 | 27 | def set_transformations(self, in_shift=None, in_scale=None, out_shift=None, out_scale=None): 28 | # store native scales that can be used for resets 29 | self.transformations = dict(in_shift=in_shift, 30 | in_scale=in_scale, 31 | out_shift=out_shift, 32 | out_scale=out_scale 33 | ) 34 | self.in_shift = torch.from_numpy(np.float32(in_shift)) if in_shift is not None else torch.zeros(self.obs_dim) 35 | self.in_scale = torch.from_numpy(np.float32(in_scale)) if in_scale is not None else torch.ones(self.obs_dim) 36 | self.out_shift = torch.from_numpy(np.float32(out_shift)) if out_shift is not None else torch.zeros(self.act_dim) 37 | self.out_scale = torch.from_numpy(np.float32(out_scale)) if out_scale is not None else torch.ones(self.act_dim) 38 | 39 | def forward(self, x): 40 | # TODO(Aravind): Remove clamping to CPU 41 | # This is a temp change that should be fixed shortly 42 | if x.is_cuda: 43 | out = x.to('cpu') 44 | else: 45 | out = x 46 | out = (out - self.in_shift)/(self.in_scale + 1e-8) 47 | for i in range(len(self.fc_layers)-1): 48 | out = self.fc_layers[i](out) 49 | out = self.nonlinearity(out) 50 | out = self.fc_layers[-1](out) 51 | out = out * self.out_scale + self.out_shift 52 | return out 53 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/utils/get_environment.py: -------------------------------------------------------------------------------- 1 | """ 2 | convenience function to generate env 3 | useful if we want some procedural env generation 4 | """ 5 | 6 | import gym 7 | from mjrl.utils.gym_env import GymEnv 8 | 9 | def get_environment(env_name=None, **kwargs): 10 | if env_name is None: print("Need to specify environment name") 11 | e = GymEnv(env_name) 12 | # can make procedural modifications here if needed using kwargs 13 | return e 14 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/utils/logger.py: 
-------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import scipy 6 | import pickle 7 | import os 8 | import csv 9 | 10 | class DataLog: 11 | 12 | def __init__(self): 13 | self.log = {} 14 | self.max_len = 0 15 | 16 | def log_kv(self, key, value): 17 | # logs the (key, value) pair 18 | 19 | # TODO: This implementation is error-prone: 20 | # it would be NOT aligned if some keys are missing during one iteration. 21 | if key not in self.log: 22 | self.log[key] = [] 23 | self.log[key].append(value) 24 | if len(self.log[key]) > self.max_len: 25 | self.max_len = self.max_len + 1 26 | 27 | def save_log(self, save_path): 28 | # TODO: Validate all lengths are the same. 29 | pickle.dump(self.log, open(save_path + '/log.pickle', 'wb')) 30 | with open(save_path + '/log.csv', 'w') as csv_file: 31 | fieldnames = list(self.log.keys()) 32 | if 'iteration' not in fieldnames: 33 | fieldnames = ['iteration'] + fieldnames 34 | 35 | writer = csv.DictWriter(csv_file, fieldnames=fieldnames) 36 | writer.writeheader() 37 | for row in range(self.max_len): 38 | row_dict = {'iteration': row} 39 | for key in self.log.keys(): 40 | if row < len(self.log[key]): 41 | row_dict[key] = self.log[key][row] 42 | writer.writerow(row_dict) 43 | 44 | def get_current_log(self): 45 | row_dict = {} 46 | for key in self.log.keys(): 47 | # TODO: this is very error-prone (alignment is not guaranteed) 48 | row_dict[key] = self.log[key][-1] 49 | return row_dict 50 | 51 | def shrink_to(self, num_entries): 52 | for key in self.log.keys(): 53 | self.log[key] = self.log[key][:num_entries] 54 | 55 | self.max_len = num_entries 56 | assert min([len(series) for series in self.log.values()]) == \ 57 | max([len(series) for series in self.log.values()]) 58 | 59 | def read_log(self, log_path): 60 | assert log_path.endswith('log.csv') 61 | 62 | with open(log_path) as csv_file: 63 | reader = csv.DictReader(csv_file) 64 | listr = list(reader) 65 | keys = reader.fieldnames 66 | data = {} 67 | for key in keys: 68 | data[key] = [] 69 | for row, row_dict in enumerate(listr): 70 | for key in keys: 71 | try: 72 | data[key].append(eval(row_dict[key])) 73 | except: 74 | print("ERROR on reading key {}: {}".format(key, row_dict[key])) 75 | 76 | if 'iteration' in data and data['iteration'][-1] != row: 77 | raise RuntimeError("Iteration %d mismatch -- possibly corrupted logfile?" 
% row) 78 | 79 | self.log = data 80 | self.max_len = max(len(v) for k, v in self.log.items()) 81 | print("Log read from {}: had {} entries".format(log_path, self.max_len)) 82 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/utils/make_train_plots.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import scipy 6 | import csv 7 | from mjrl.utils.logger import DataLog 8 | import argparse 9 | 10 | def make_train_plots(log = None, 11 | log_path = None, 12 | keys = None, 13 | save_loc = None, 14 | sample_key = 'num_samples', 15 | x_scale = 1.0, 16 | y_scale = 1.0): 17 | if log is None and log_path is None: 18 | print("Need to provide either the log or path to a log file") 19 | if log is None: 20 | logger = DataLog() 21 | logger.read_log(log_path) 22 | log = logger.log 23 | # make plots for specified keys 24 | for key in keys: 25 | if key in log.keys(): 26 | fig = plt.figure(figsize=(10,6)) 27 | ax1 = fig.add_subplot(111) 28 | try: 29 | cum_samples = [np.sum(log[sample_key][:i]) * x_scale for i in range(len(log[sample_key]))] 30 | ax1.plot(cum_samples, [elem * y_scale for elem in log[key]]) 31 | ax1.set_xlabel('samples') 32 | # mark iteration on the top axis 33 | ax2 = ax1.twiny() 34 | ax2.set_xlabel('iterations', color=(.7,.7,.7)) 35 | ax2.tick_params(axis='x', labelcolor=(.7,.7,.7)) 36 | ax2.set_xlim([0, len(log[key])]) 37 | except: 38 | ax1.plot(log[key]) 39 | ax1.set_xlabel('iterations') 40 | ax1.set_title(key) 41 | plt.savefig(save_loc+'/'+key+'.png', dpi=100) 42 | plt.close() 43 | 44 | # MAIN ========================================================= 45 | # Example: python make_train_plots.py --log_path logs/log.csv --keys eval_score rollout_score save_loc logs 46 | def main(): 47 | # Parse arguments 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument( 50 | '-l', '--log_path', type=str, required=True, help='path file to log.csv') 51 | parser.add_argument( 52 | '-k', '--keys', type=str, action='append', nargs='+', required=True, help='keys to plot') 53 | parser.add_argument( 54 | '-s', '--save_loc', type=str, default='', help='Path for logs') 55 | args = parser.parse_args() 56 | 57 | make_train_plots(log_path=args.log_path, keys=args.keys[0], save_loc=args.save_loc) 58 | 59 | if __name__ == '__main__': 60 | main() 61 | 62 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/utils/optimize_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | def fit_data(model, x, y, optimizer, loss_func, batch_size, epochs): 8 | """ 9 | :param model: pytorch model of form y_hat = f(x) (class) 10 | :param x: inputs to the model (tensor) 11 | :param y: desired outputs or targets (tensor) 12 | :param optimizer: optimizer to be used (class) 13 | :param loss_func: loss criterion (callable) 14 | :param batch_size: mini-batch size for optimization (int) 15 | :param epochs: number of epochs (int) 16 | :return: 17 | """ 18 | 19 | num_samples = x.shape[0] 20 | epoch_losses = [] 21 | for ep in range(epochs): 22 | rand_idx = torch.LongTensor(np.random.permutation(num_samples)) 23 | ep_loss = 0.0 24 | num_steps = int(num_samples / batch_size) - 1 25 | for mb in range(num_steps): 26 | data_idx = 
rand_idx[mb*batch_size:(mb+1)*batch_size] 27 | batch_x = x[data_idx] 28 | batch_y = y[data_idx] 29 | optimizer.zero_grad() 30 | yhat = model(batch_x) 31 | loss = loss_func(yhat, batch_y) 32 | loss.backward() 33 | optimizer.step() 34 | ep_loss += loss.detach() 35 | epoch_losses.append(ep_loss.to('cpu').data.numpy().ravel() / num_steps) 36 | return epoch_losses 37 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/utils/plot_from_logs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pickle 4 | import numpy as np 5 | import matplotlib 6 | matplotlib.use('Agg') 7 | import matplotlib.pyplot as plt 8 | colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] 9 | 10 | parser = argparse.ArgumentParser(description='Script to explore the data generated by an experiment.') 11 | parser.add_argument('--data', '-d', type=str, required=True, help='location of the .pickle log data file') 12 | parser.add_argument('--output', '-o', type=str, required=True, help='location to store results as a png') 13 | parser.add_argument('--xkey', '-x', type=str, default=None, help='the key to use for x axis in plots') 14 | parser.add_argument('--xscale', '-s', type=int, default=1, help='scaling for the x axis (optional)') 15 | args = parser.parse_args() 16 | 17 | # get inputs and setup output file 18 | if '.png' in args.output: 19 | OUT_FILE = args.output 20 | else: 21 | OUT_FILE = args.output + '/plot.png' 22 | data = pickle.load(open(args.data, 'rb')) 23 | xscale = 1 if args.xscale is None else args.xscale 24 | if args.xkey == 'num_samples': 25 | xscale = xscale if 'act_repeat' not in data.keys() else data['act_repeat'][-1] 26 | 27 | dict_keys = list(data.keys()) 28 | for k in dict_keys: 29 | if len(data[k]) == 1: del(data[k]) 30 | 31 | # plot layout 32 | nplt = len(data.keys()) 33 | ncol = 4 34 | nrow = int(np.ceil(nplt/ncol)) 35 | 36 | # plot data 37 | xkey = args.xkey 38 | start_idx = 2 39 | end_idx = max([len(data[k]) for k in data.keys()]) 40 | xdata = np.arange(end_idx) if (xkey is None or xkey == 'None') else \ 41 | [np.sum(data[xkey][:i+1]) * xscale for i in range(len(data[xkey]))] 42 | 43 | # make the plot 44 | plt.figure(figsize=(15,15), dpi=60) 45 | for idx, key in enumerate(data.keys()): 46 | plt.subplot(nrow, ncol, idx+1) 47 | plt.tight_layout() 48 | try: 49 | last_idx = min(end_idx, len(data[key])) 50 | plt.plot(xdata[start_idx:last_idx], data[key][start_idx:last_idx], color=colors[idx%7], linewidth=3) 51 | except: 52 | pass 53 | plt.title(key) 54 | 55 | plt.savefig(OUT_FILE, dpi=100, bbox_inches="tight") 56 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/utils/process_samples.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def compute_returns(paths, gamma): 4 | for path in paths: 5 | path["returns"] = discount_sum(path["rewards"], gamma) 6 | 7 | def compute_advantages(paths, baseline, gamma, gae_lambda=None, normalize=False): 8 | # compute and store returns, advantages, and baseline 9 | # standard mode 10 | if gae_lambda == None or gae_lambda < 0.0 or gae_lambda > 1.0: 11 | for path in paths: 12 | path["baseline"] = baseline.predict(path) 13 | path["advantages"] = path["returns"] - path["baseline"] 14 | if normalize: 15 | alladv = np.concatenate([path["advantages"] for path in paths]) 16 | mean_adv = alladv.mean() 17 | std_adv = alladv.std() 18 | 
for path in paths: 19 | path["advantages"] = (path["advantages"]-mean_adv)/(std_adv+1e-8) 20 | # GAE mode 21 | else: 22 | for path in paths: 23 | b = path["baseline"] = baseline.predict(path) 24 | if b.ndim == 1: 25 | b1 = np.append(path["baseline"], 0.0 if path["terminated"] else b[-1]) 26 | else: 27 | b1 = np.vstack((b, np.zeros(b.shape[1]) if path["terminated"] else b[-1])) 28 | td_deltas = path["rewards"] + gamma*b1[1:] - b1[:-1] 29 | path["advantages"] = discount_sum(td_deltas, gamma*gae_lambda) 30 | if normalize: 31 | alladv = np.concatenate([path["advantages"] for path in paths]) 32 | mean_adv = alladv.mean() 33 | std_adv = alladv.std() 34 | for path in paths: 35 | path["advantages"] = (path["advantages"]-mean_adv)/(std_adv+1e-8) 36 | 37 | def discount_sum(x, gamma, terminal=0.0): 38 | y = [] 39 | run_sum = terminal 40 | for t in range( len(x)-1, -1, -1): 41 | run_sum = x[t] + gamma*run_sum 42 | y.append(run_sum) 43 | 44 | return np.array(y[::-1]) -------------------------------------------------------------------------------- /env_dependencies/mjrl/mjrl/utils/visualize_policy.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import mjrl.envs 3 | import click 4 | import os 5 | import gym 6 | import numpy as np 7 | import pickle 8 | from mjrl.utils.gym_env import GymEnv 9 | from mjrl.policies.gaussian_mlp import MLP 10 | import trajopt.envs 11 | 12 | DESC = ''' 13 | Helper script to visualize policy (in mjrl format).\n 14 | USAGE:\n 15 | Visualizes policy on the env\n 16 | $ python utils/visualize_policy --env_name mjrl_swimmer-v0 --policy my_policy.pickle --mode evaluation --episodes 10 \n 17 | ''' 18 | 19 | # MAIN ========================================================= 20 | @click.command(help=DESC) 21 | @click.option('--env_name', type=str, help='environment to load', required= True) 22 | @click.option('--policy', type=str, help='absolute path of the policy file', default=None) 23 | @click.option('--mode', type=str, help='exploration or evaluation mode for policy', default='evaluation') 24 | @click.option('--seed', type=int, help='seed for generating environment instances', default=123) 25 | @click.option('--episodes', type=int, help='number of episodes to visualize', default=10) 26 | 27 | def main(env_name, policy, mode, seed, episodes): 28 | e = GymEnv(env_name) 29 | e.set_seed(seed) 30 | if policy is not None: 31 | pi = pickle.load(open(policy, 'rb')) 32 | else: 33 | pi = MLP(e.spec, hidden_sizes=(32,32), seed=seed, init_log_std=-1.0) 34 | # render policy 35 | e.visualize_policy(pi, num_episodes=episodes, horizon=e.horizon, mode=mode) 36 | 37 | if __name__ == '__main__': 38 | main() 39 | 40 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from setuptools import setup, find_packages 4 | 5 | print("Installing mjrl. \n Package intended for use with provided conda env. See setup instructions here: https://github.com/aravindr93/mjrl/tree/master/setup") 6 | 7 | if sys.version_info.major != 3: 8 | print("This Python is only compatible with Python 3, but you are running " 9 | "Python {}. 
The installation will likely fail.".format(sys.version_info.major)) 10 | 11 | def read(fname): 12 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 13 | 14 | setup( 15 | name='mjrl', 16 | version='1.0.0', 17 | packages=find_packages(), 18 | description='RL algorithms for environments in MuJoCo', 19 | long_description=read('README.md'), 20 | url='https://github.com/aravindr93/mjrl.git', 21 | author='Aravind Rajeswaran', 22 | ) 23 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/setup/README.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | A short guide to install this package is below. The package relies on `mujoco-py` which might be the trickiest part of the installation. See `known issues` below and also instructions from the mujoco-py [page](https://github.com/openai/mujoco-py) if you are stuck with mujoco-py installation. 4 | 5 | The package can handle both `MuJoCo v1.5` as well as `MuJoCo v2.0`, but the former is not supported for future updates. We encourage you to use v2.0. 6 | 7 | ## Linux 8 | 9 | - Download MuJoCo v2.0 binaries from the official [website](http://www.mujoco.org/) and also obtain the license key. 10 | - Unzip the downloaded `mujoco200` directory into `~/.mujoco/mujoco200`, and place your license key (mjkey.txt) at `~/.mujoco/mjkey.txt`. Note that unzip of the MuJoCo binaries will generate `mujoco200_linux`. You need to rename the directory and place it at `~/.mujoco/mujoco200`. 11 | - Install osmesa related dependencies: 12 | ``` 13 | $ sudo apt-get install libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev build-essential libglfw3 14 | ``` 15 | - Update `bashrc` by adding the following lines and source it 16 | ``` 17 | export LD_LIBRARY_PATH="/mujoco200/bin:$LD_LIBRARY_PATH" 18 | export MUJOCO_PY_FORCE_CPU=True 19 | alias MJPL='LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libGLEW.so:/usr/lib/nvidia-384/libGL.so' 20 | ``` 21 | - Install this package using 22 | ``` 23 | $ conda update conda 24 | $ cd 25 | $ conda env create -f setup/env.yml 26 | $ source activate mjrl-env 27 | $ pip install -e . 28 | ``` 29 | - *NOTE 1:* If there are issues with install of pytorch, please follow instructions from the [pytorch website](https://pytorch.org/) to install it properly based on the specific version of CUDA (or CPU-only) you have. 30 | 31 | - *NOTE 2:* If you encounter a patchelf error in mujoco_py install, you can fix this with the following command when inside the anaconda env: `conda install -c anaconda patchelf`. See this [page](https://github.com/openai/mujoco-py/issues/147) for additional info. 32 | 33 | ## Mac OS 34 | 35 | - Download MuJoCo binaries from the official [website](http://www.mujoco.org/) and also obtain the license key. 36 | - Unzip the downloaded `mujoco200` directory into `~/.mujoco/mujoco200` (rename unzipped directory to this), and place your license key (mjkey.txt) at `~/.mujoco/mjkey.txt`. 37 | - Update `bashrc` by adding the following lines and source it 38 | ``` 39 | export LD_LIBRARY_PATH="/mujoco200/bin:$LD_LIBRARY_PATH" 40 | ``` 41 | - Install this package using 42 | ``` 43 | $ conda update conda 44 | $ cd path/to/mjrl 45 | $ conda env create -f setup/env.yml 46 | $ source activate mjrl-env 47 | $ pip install -e . 48 | ``` 49 | 50 | - *NOTE 1:* If there are issues with install of pytorch, please follow instructions from the [pytorch website](https://pytorch.org/) to install it properly. 
51 | 52 | - *NOTE 2:* If you encounter a patchelf error in mujoco_py install, you can fix this with the following command when inside the anaconda env: `conda install -c anaconda patchelf`. See this [page](https://github.com/openai/mujoco-py/issues/147) for additional info. 53 | 54 | 55 | ## Known Issues 56 | 57 | - Visualization in linux: If the linux system has a GPU, then mujoco-py does not automatically preload the correct drivers. We added an alias `MJPL` in bashrc (see instructions) which stands for mujoco pre-load. When runing any python script that requires rendering, prepend the execution with MJPL. 58 | ``` 59 | $ MJPL python script.py 60 | ``` 61 | 62 | - Errors related to osmesa during installation. This is a `mujoco-py` build error and would likely go away if the following command is used before creating the conda environment. If the problem still persists, please contact the developers of mujoco-py 63 | ``` 64 | $ sudo apt-get install libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev 65 | ``` 66 | 67 | - If conda environment creation gets interrupted for some reason, you can resume it with the following: 68 | ``` 69 | $ conda env update -n mjrl-env -f setup/env.yml 70 | ``` 71 | 72 | - GCC error in Mac OS: If you get a GCC error from mujoco-py, you can get the correct version mujoco-py expects with `brew install gcc --without-multilib`. This may require uninstalling other versions of GCC that may have been previously installed with `brew remove gcc@6` for example. You can see which brew packages were already installed with `brew list`. 73 | 74 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/setup/env.yml: -------------------------------------------------------------------------------- 1 | name: mjrl-env 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - python=3.7 7 | - pip 8 | - ipython 9 | - mkl-service 10 | - pytorch==1.4 11 | - tabulate 12 | - termcolor 13 | - torchvision 14 | - patchelf 15 | - pip: 16 | - click 17 | - cloudpickle 18 | - gym==0.13 19 | - ipdb 20 | - matplotlib 21 | - mujoco-py<2.1,>=2.0 22 | - pip 23 | - pyyaml 24 | - tqdm 25 | - wheel 26 | - scipy 27 | - transforms3d 28 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/tests/hydra/config/hydra_npg_config.yaml: -------------------------------------------------------------------------------- 1 | # general outputs 2 | job_name : 'hydra_npg_test' 3 | 4 | # general inputs 5 | env : Hopper-v3 6 | algorithm : NPG 7 | seed : 123 8 | sample_mode : samples 9 | rl_num_samples : 1000 10 | rl_num_traj : 0 11 | rl_num_iter : 2 12 | num_cpu : 4 13 | save_freq : 5 14 | eval_rollouts : 0 15 | exp_notes : 'Example config for training policy with NPG on the OpenAI gym Hopper-v3 task.' 16 | 17 | # RL parameters (all params related to PG, value function etc.) 
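# policy_size / vf_hidden_size : hidden-layer widths of the Gaussian MLP policy and the
#   MLP value-function baseline; init_log_std is the initial log-std of the policy.
# vf_batch_size / vf_epochs / vf_learn_rate : mini-batch size, number of epochs, and
#   learning rate used when fitting the value-function baseline.
# rl_step_size : normalized step size for the NPG update; rl_gamma and rl_gae are the
#   discount factor and GAE lambda used when computing advantages.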
18 | policy_size : (32, 32) 19 | init_log_std : -0.5 20 | vf_hidden_size : (128, 128) 21 | vf_batch_size : 64 22 | vf_epochs : 2 23 | vf_learn_rate : 1e-3 24 | rl_step_size : 0.05 25 | rl_gamma : 0.995 26 | rl_gae : 0.97 27 | 28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used) 29 | 30 | alg_hyper_params : {} 31 | 32 | hydra: 33 | launcher: 34 | cpus_per_task: 12 35 | gpus_per_node: 0 36 | tasks_per_node: 1 37 | run: 38 | dir: ./outputs/${hydra.job.name}/${now:%Y-%m-%d_%H-%M-%S} 39 | sweep: 40 | dir: /checkpoint/${env:USER}/outputs/${job_name}/${now:%Y-%m-%d}_${now:%H-%M-%S} 41 | subdir: ${hydra.job.num}_${hydra.job.override_dirname} -------------------------------------------------------------------------------- /env_dependencies/mjrl/tests/point_mass_test.py: -------------------------------------------------------------------------------- 1 | from mjrl.utils.gym_env import GymEnv 2 | from mjrl.policies.gaussian_mlp import MLP 3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 4 | from mjrl.baselines.mlp_baseline import MLPBaseline 5 | from mjrl.algos.npg_cg import NPG 6 | from mjrl.utils.train_agent import train_agent 7 | import mjrl.envs 8 | import time as timer 9 | SEED = 500 10 | 11 | e = GymEnv('mjrl_point_mass-v0') 12 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 13 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=10, learn_rate=1e-3) 14 | agent = NPG(e, policy, baseline, normalized_step_size=0.05, seed=SEED, save_logs=True) 15 | 16 | ts = timer.time() 17 | train_agent(job_name='point_mass_exp1', 18 | agent=agent, 19 | seed=SEED, 20 | niter=50, 21 | gamma=0.95, 22 | gae_lambda=0.97, 23 | num_cpu=1, 24 | sample_mode='trajectories', 25 | num_traj=40, # samples = 40*25 = 1000 26 | save_freq=5, 27 | evaluation_rollouts=None, 28 | plot_keys=['stoc_pol_mean', 'running_score']) 29 | print("time taken = %f" % (timer.time()-ts)) 30 | -------------------------------------------------------------------------------- /env_dependencies/mjrl/tests/visualizer_test.py: -------------------------------------------------------------------------------- 1 | from mjrl.utils.gym_env import GymEnv 2 | from mjrl.policies.gaussian_mlp import MLP 3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 4 | from mjrl.baselines.mlp_baseline import MLPBaseline 5 | from mjrl.algos.npg_cg import NPG 6 | from mjrl.utils.train_agent import train_agent 7 | import mjrl.envs 8 | import time as timer 9 | SEED = 500 10 | 11 | e = GymEnv('mjrl_point_mass-v0') 12 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 13 | baseline = QuadraticBaseline(e.spec) 14 | agent = NPG(e, policy, baseline, normalized_step_size=0.5, seed=SEED, save_logs=True) 15 | 16 | ts = timer.time() 17 | train_agent(job_name='vis_exp', 18 | agent=agent, 19 | seed=SEED, 20 | niter=10, 21 | gamma=0.95, 22 | gae_lambda=0.97, 23 | num_cpu=1, 24 | sample_mode='trajectories', 25 | num_traj=100, 26 | save_freq=5, 27 | evaluation_rollouts=None) 28 | print("time taken = %f" % (timer.time()-ts)) 29 | e.visualize_policy(policy, num_episodes=5, horizon=e.horizon, mode='exploration') 30 | -------------------------------------------------------------------------------- /scripts/run/codec_model/vqgan_adroit.sh: -------------------------------------------------------------------------------- 1 | python scripts/train_vqgan.py dataset=adroit -------------------------------------------------------------------------------- 
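The mjrl tests above rely on `train_agent` (with `save_logs=True`) writing a `DataLog` to disk; the plotting helper in `mjrl/utils/make_train_plots.py` can then be pointed at the resulting CSV. A minimal sketch, assuming the point-mass run above wrote its log to `point_mass_exp1/logs/log.csv` (the exact output layout is determined by `train_agent`, which is not shown here):

```
from mjrl.utils.logger import DataLog
from mjrl.utils.make_train_plots import make_train_plots

# Assumed location of the CSV written during the point_mass_test.py run above.
log_path = 'point_mass_exp1/logs/log.csv'

logger = DataLog()
logger.read_log(log_path)            # fills logger.log and logger.max_len from the CSV
print(logger.get_current_log())      # most recent value recorded for every key

make_train_plots(
    log_path=log_path,
    keys=['stoc_pol_mean', 'running_score'],   # keys plotted by the test above
    save_loc='point_mass_exp1/logs',           # writes one <key>.png per plotted key
)
```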
/scripts/run/codec_model/vqgan_metaworld.sh: -------------------------------------------------------------------------------- 1 | python scripts/train_vqgan.py dataset=metaworld -------------------------------------------------------------------------------- /scripts/run/rl/drqv2_adroit_amp.sh: -------------------------------------------------------------------------------- 1 | task=${1} 2 | 3 | python scripts/train_drqv2.py task=${task} reward=amp -------------------------------------------------------------------------------- /scripts/run/rl/drqv2_adroit_diffusion_reward.sh: -------------------------------------------------------------------------------- 1 | task=${1} 2 | 3 | python scripts/train_drqv2.py task=${task} reward=diffusion_reward -------------------------------------------------------------------------------- /scripts/run/rl/drqv2_adroit_raw_sparse_reward.sh: -------------------------------------------------------------------------------- 1 | task=${1} 2 | 3 | python scripts/train_drqv2.py task=${task} use_rm=False -------------------------------------------------------------------------------- /scripts/run/rl/drqv2_adroit_rnd.sh: -------------------------------------------------------------------------------- 1 | task=${1} 2 | 3 | python scripts/train_drqv2.py task=${task} reward=rnd -------------------------------------------------------------------------------- /scripts/run/rl/drqv2_adroit_viper.sh: -------------------------------------------------------------------------------- 1 | task=${1} 2 | 3 | python scripts/train_drqv2.py task=${task} reward=viper reward.expl_std=false -------------------------------------------------------------------------------- /scripts/run/rl/drqv2_adroit_viper_std.sh: -------------------------------------------------------------------------------- 1 | task=${1} 2 | 3 | python scripts/train_drqv2.py task=${task} reward=viper reward.expl_std=true -------------------------------------------------------------------------------- /scripts/run/rl/drqv2_metaworld_amp.sh: -------------------------------------------------------------------------------- 1 | task=${1} 2 | 3 | python scripts/train_drqv2.py task=${task} reward=amp -------------------------------------------------------------------------------- /scripts/run/rl/drqv2_metaworld_diffusion_reward.sh: -------------------------------------------------------------------------------- 1 | task=${1} 2 | 3 | python scripts/train_drqv2.py task=${task} reward=diffusion_reward reward.expl_update_interval=1 -------------------------------------------------------------------------------- /scripts/run/rl/drqv2_metaworld_raw_sparse_reward.sh: -------------------------------------------------------------------------------- 1 | task=${1} 2 | 3 | python scripts/train_drqv2.py task=${task} use_rm=False -------------------------------------------------------------------------------- /scripts/run/rl/drqv2_metaworld_rnd.sh: -------------------------------------------------------------------------------- 1 | task=${1} 2 | 3 | python scripts/train_drqv2.py task=${task} reward=rnd reward.expl_update_interval=1 -------------------------------------------------------------------------------- /scripts/run/rl/drqv2_metaworld_viper.sh: -------------------------------------------------------------------------------- 1 | task=${1} 2 | 3 | python scripts/train_drqv2.py task=${task} reward=viper reward.expl_std=false reward.expl_update_interval=1 -------------------------------------------------------------------------------- 
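Each launcher above is a one-line wrapper that forwards a task name plus a reward override to `scripts/train_drqv2.py`. A minimal sketch of sweeping several tasks under one reward setting from Python, assuming the Adroit task ids `door-v0`, `hammer-v0`, and `pen-v0` (substitute whatever task configs you actually have installed):

```
import subprocess

# Assumed task ids; each must match a task config accepted by train_drqv2.py.
TASKS = ['door-v0', 'hammer-v0', 'pen-v0']
REWARD = 'diffusion_reward'        # or 'viper', 'rnd', 'amp', as in the scripts above

for task in TASKS:
    # Equivalent to: bash scripts/run/rl/drqv2_adroit_diffusion_reward.sh <task>
    subprocess.run(
        ['python', 'scripts/train_drqv2.py', f'task={task}', f'reward={REWARD}'],
        check=True,
    )
```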
/scripts/run/rl/drqv2_metaworld_viper_std.sh: -------------------------------------------------------------------------------- 1 | task=${1} 2 | 3 | python scripts/train_drqv2.py task=${task} reward=viper reward.expl_std=true reward.expl_update_interval=1 -------------------------------------------------------------------------------- /scripts/run/video_model/videogpt_adroit.sh: -------------------------------------------------------------------------------- 1 | python scripts/train_videogpt.py dataset=adroit -------------------------------------------------------------------------------- /scripts/run/video_model/videogpt_metaworld.sh: -------------------------------------------------------------------------------- 1 | python scripts/train_videogpt.py dataset=metaworld -------------------------------------------------------------------------------- /scripts/run/video_model/vqdiffusion_adroit.sh: -------------------------------------------------------------------------------- 1 | python scripts/train_vqdiffusion.py dataset=adroit -------------------------------------------------------------------------------- /scripts/run/video_model/vqdiffusion_metaworld.sh: -------------------------------------------------------------------------------- 1 | python scripts/train_vqdiffusion.py dataset=metaworld -------------------------------------------------------------------------------- /scripts/train_videogpt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import time 4 | 5 | import matplotlib 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from diffusion_reward.models.video_models.videogpt.transformer import \ 11 | VideoGPTTransformer 12 | from diffusion_reward.models.video_models.videogpt.utils import load_video_data 13 | from torchvision import utils as vutils 14 | from tqdm import tqdm 15 | 16 | matplotlib.use('Agg') 17 | import hydra 18 | import matplotlib.pyplot as plt 19 | 20 | 21 | class TrainTransformer: 22 | def __init__(self, args): 23 | self.args = args 24 | self.work_dir = 'results' 25 | self.model = VideoGPTTransformer(args).to(device=args.device) 26 | self.optim = self.configure_optimizers() 27 | 28 | self.prepare_training() 29 | 30 | self.train(args) 31 | 32 | def prepare_training(self): 33 | if os.path.exists(f"{self.work_dir}/results"): 34 | shutil.rmtree(f"{self.work_dir}/results") 35 | os.makedirs(f"{self.work_dir}/results", exist_ok=True) 36 | if os.path.exists(f"{self.work_dir}/checkpoints"): 37 | shutil.rmtree(f"{self.work_dir}/checkpoints") 38 | os.makedirs(f"{self.work_dir}/checkpoints", exist_ok=True) 39 | 40 | def configure_optimizers(self): 41 | decay, no_decay = set(), set() 42 | whitelist_weight_modules = (nn.Linear, ) 43 | blacklist_weight_modules = (nn.LayerNorm, nn.Embedding) 44 | 45 | for mn, m in self.model.transformer.named_modules(): 46 | for pn, p in m.named_parameters(): 47 | fpn = f"{mn}.{pn}" if mn else pn 48 | 49 | if pn.endswith("bias"): 50 | no_decay.add(fpn) 51 | 52 | elif pn.endswith("weight") and isinstance(m, whitelist_weight_modules): 53 | decay.add(fpn) 54 | 55 | elif pn.endswith("weight") and isinstance(m, blacklist_weight_modules): 56 | no_decay.add(fpn) 57 | 58 | no_decay.add("pos_emb") 59 | 60 | param_dict = {pn: p for pn, p in self.model.transformer.named_parameters()} 61 | 62 | optim_groups = [ 63 | {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": 0.01}, 64 | {"params": [param_dict[pn] for 
pn in sorted(list(no_decay))], "weight_decay": 0.0}, 65 | ] 66 | 67 | optimizer = torch.optim.AdamW(optim_groups, lr=4.5e-06, betas=(0.9, 0.95)) 68 | return optimizer 69 | 70 | @torch.no_grad() 71 | def eval(self, val_dataset): 72 | losses = [] 73 | for imgs in val_dataset: 74 | loss = self.compute_loss(imgs) 75 | losses.append(loss.cpu().detach().numpy().item()) 76 | loss = np.array(losses).mean() 77 | return loss 78 | 79 | def train(self, args): 80 | train_dataset, val_dataset = load_video_data(args) 81 | best_loss = float('inf') 82 | for epoch in range(args.epochs): 83 | with tqdm(range(len(train_dataset))) as pbar: 84 | for i, imgs in zip(pbar, train_dataset): 85 | self.optim.zero_grad() 86 | loss = self.compute_loss(imgs) 87 | loss.backward() 88 | self.optim.step() 89 | pbar.set_postfix(Transformer_Loss=np.round(loss.cpu().detach().numpy().item(), 4)) 90 | pbar.update(0) 91 | 92 | val_loss = self.eval(val_dataset) 93 | is_best = val_loss < best_loss 94 | best_loss = min(val_loss, best_loss) 95 | if is_best: 96 | print(f'Checkpoint at epoch {epoch} is saved with eval loss {best_loss} !!!') 97 | torch.save(self.model.state_dict(), os.path.join(f"{self.work_dir}/checkpoints/videogpt.pt")) 98 | 99 | def compute_loss(self, imgs): 100 | imgs = imgs.to(device=self.args.device) 101 | logits, targets = self.model.output(imgs, True) 102 | loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1)) 103 | return loss 104 | 105 | 106 | @hydra.main(config_path="../diffusion_reward/configs/models/video_models/videogpt", config_name="default") 107 | def main(args): 108 | TrainTransformer(args) 109 | 110 | 111 | if __name__ == '__main__': 112 | main() -------------------------------------------------------------------------------- /scripts/train_vqdiffusion.py: -------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | 4 | import hydra 5 | import torch 6 | from diffusion_reward.models.video_models.vqdiffusion.data.build import \ 7 | build_dataloader 8 | from diffusion_reward.models.video_models.vqdiffusion.distributed.launch import launch 9 | from diffusion_reward.models.video_models.vqdiffusion.engine.logger import Logger 10 | from diffusion_reward.models.video_models.vqdiffusion.engine.solver import Solver 11 | from diffusion_reward.models.video_models.vqdiffusion.modeling.build import \ 12 | build_model 13 | from diffusion_reward.models.video_models.vqdiffusion.utils.io import load_yaml_config 14 | from diffusion_reward.models.video_models.vqdiffusion.utils.misc import ( 15 | merge_opts_to_config, modify_config_for_debug, seed_everything) 16 | 17 | # environment variables 18 | NODE_RANK = os.environ['AZ_BATCHAI_TASK_INDEX'] if 'AZ_BATCHAI_TASK_INDEX' in os.environ else 0 19 | NODE_RANK = int(NODE_RANK) 20 | MASTER_ADDR, MASTER_PORT = os.environ['AZ_BATCH_MASTER_NODE'].split(':') if 'AZ_BATCH_MASTER_NODE' in os.environ else ("127.0.0.1", 29500) 21 | MASTER_PORT = int(MASTER_PORT) 22 | DIST_URL = 'tcp://%s:%s' % (MASTER_ADDR, MASTER_PORT) 23 | 24 | 25 | @hydra.main(config_path='../diffusion_reward/configs/models/video_models/vqdiffusion', config_name='default') 26 | def main(args): 27 | args.save_dir = os.path.abspath(os.path.dirname(__file__)) 28 | args.node_rank = NODE_RANK 29 | args.dist_url = DIST_URL 30 | 31 | if args.seed is not None or args.cudnn_deterministic: 32 | seed_everything(args.seed, args.cudnn_deterministic) 33 | 34 | if args.gpu is not None: 35 | warnings.warn('You have chosen a specific GPU. 
This will completely disable ddp.') 36 | torch.cuda.set_device(args.gpu) 37 | args.ngpus_per_node = 1 38 | args.world_size = 1 39 | else: 40 | if args.num_node == 1: 41 | args.dist_url == "auto" 42 | else: 43 | assert args.num_node > 1 44 | args.ngpus_per_node = torch.cuda.device_count() 45 | args.world_size = args.ngpus_per_node * args.num_node 46 | 47 | launch(main_worker, args.ngpus_per_node, args.num_node, args.node_rank, args.dist_url, args=(args,)) 48 | 49 | 50 | def main_worker(local_rank, args): 51 | args.local_rank = local_rank 52 | args.global_rank = args.local_rank + args.node_rank * args.ngpus_per_node 53 | 54 | # load config 55 | config = args 56 | config = merge_opts_to_config(config, args.opts) 57 | if args.debug: 58 | config = modify_config_for_debug(config) 59 | 60 | # get logger 61 | logger = Logger(args) 62 | 63 | # get model 64 | model = build_model(config, args) 65 | # print(model) 66 | if args.sync_bn: 67 | model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) 68 | 69 | # get dataloader 70 | dataloader_info = build_dataloader(config, args) 71 | 72 | # get solver 73 | solver = Solver(config=config, args=args, model=model, dataloader=dataloader_info, logger=logger) 74 | 75 | # resume 76 | if args.load_path is not None: # only load the model paramters 77 | solver.resume(path=args.load_path, 78 | # load_model=True, 79 | load_optimizer_and_scheduler=False, 80 | load_others=False) 81 | if args.auto_resume: 82 | solver.resume() 83 | 84 | solver.train() 85 | 86 | 87 | if __name__ == '__main__': 88 | main() 89 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name = 'diffusion_reward', 5 | packages = find_packages(), 6 | ) --------------------------------------------------------------------------------
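As a closing reference, the rank bookkeeping in `train_vqdiffusion.py` above composes the per-node GPU index with the node index. A small worked example, assuming a hypothetical job of 2 nodes with 4 GPUs each:

```
# Worked example of the world_size / global_rank arithmetic in train_vqdiffusion.py,
# for an assumed setup of num_node=2 and ngpus_per_node=4 (8 worker processes total).
num_node = 2
ngpus_per_node = 4
world_size = ngpus_per_node * num_node                         # as computed in main()

for node_rank in range(num_node):
    for local_rank in range(ngpus_per_node):
        global_rank = local_rank + node_rank * ngpus_per_node  # as in main_worker()
        print(f"node {node_rank}, gpu {local_rank} -> global rank {global_rank} of {world_size}")
```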