├── __init__.py ├── assets ├── stls │ ├── .get │ └── fetch │ │ ├── estop_link.stl │ │ ├── laser_link.stl │ │ ├── gripper_link.stl │ │ ├── torso_fixed_link.stl │ │ ├── base_link_collision.stl │ │ ├── bellows_link_collision.stl │ │ ├── head_pan_link_collision.stl │ │ ├── l_wheel_link_collision.stl │ │ ├── r_wheel_link_collision.stl │ │ ├── elbow_flex_link_collision.stl │ │ ├── head_tilt_link_collision.stl │ │ ├── torso_lift_link_collision.stl │ │ ├── wrist_flex_link_collision.stl │ │ ├── wrist_roll_link_collision.stl │ │ ├── forearm_roll_link_collision.stl │ │ ├── shoulder_lift_link_collision.stl │ │ ├── shoulder_pan_link_collision.stl │ │ └── upperarm_roll_link_collision.stl ├── textures │ ├── block.png │ └── block_hidden.png ├── fetch │ ├── reach.xml │ ├── push.xml │ ├── slide.xml │ ├── pick_and_place.xml │ ├── push_obstacle.xml │ ├── push_wall.xml │ ├── push_wall_obstacle.xml │ ├── push_wall_heavy_obstacle_v2.xml │ ├── push_wall_heavy_obstacle.xml │ ├── push_wall_heavy_obstacle_v5.xml │ ├── generate_xml.py │ ├── pick_and_place_stack3.xml │ ├── pick_and_place_stack.xml │ ├── push_wall_heavy_double_obstacle.xml │ ├── pick_and_place_box.xml │ ├── open_close_box.xml │ └── shared.xml └── masspoint │ ├── maze.xml │ ├── smaze.xml │ ├── single_obstacle.xml │ ├── single_obstacle2.xml │ ├── double_obstacle.xml │ ├── emaze_easy.xml │ └── generate_xml.py ├── plot ├── __init__.py ├── plot_compare_hiro.py ├── debug_plot_value.py ├── plot_experiment_visual.py ├── plot_reuse.py ├── plot_compare.py ├── plot_compare_sac.py ├── plot_walltime.py ├── plot_success_traj.py ├── plot_experiment_success_len.py ├── plot_cl_experiment.py ├── plot_experiment_len.py ├── plot_experiment.py ├── plot_sac_experiment_maze.py ├── visualize_sac_value.py └── plot_sac_experiment.py ├── utils ├── __init__.py ├── eval_stack.py ├── wrapper.py ├── log_utils.py └── replay_buffer.py ├── baselines ├── her │ ├── __init__.py │ ├── utils.py │ └── her.py ├── ppo │ └── __init__.py ├── sac_sir │ └── __init__.py ├── ppo_sir │ └── __init__.py ├── sac_parallel │ └── __init__.py └── __init__.py ├── .gitignore ├── exp_masspoint.sh ├── README.md ├── environment.yml ├── analyse_reduction_trace.py ├── exp_masspoint_3room.sh ├── exp_fetchstack.sh ├── success_len_calculation.py ├── exp_push.sh ├── run_ppo_augment.py ├── run_ppo.py └── run_her_augment.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/stls/.get: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /plot/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/her/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.her.her import HER2 -------------------------------------------------------------------------------- /baselines/ppo/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.ppo.ppo import PPO2 -------------------------------------------------------------------------------- 
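The ``her`` and ``ppo`` packages above (together with ``sac_sir``, ``ppo_sir`` and ``sac_parallel`` below) re-export their algorithm classes at the ``baselines`` package root, which is how the run and analysis scripts import them. A minimal usage sketch, following the pattern in ``success_len_calculation.py``; the checkpoint path is a placeholder, and the exact ``get_env_kwargs``/``make_env`` arguments are assumed to match their use in that script:

```python
# Minimal usage sketch (not part of the repo): restore a trained checkpoint through
# the package-root re-exports and roll out one episode, mirroring success_len_calculation.py.
# The checkpoint path is a placeholder; env kwargs are assumed to follow that script.
from baselines import HER2
from utils.make_env_utils import make_env, get_env_kwargs

env_kwargs = get_env_kwargs('FetchPushWallObstacle-v4', random_ratio=0.0)
env = make_env('FetchPushWallObstacle-v4', rank=0, flatten_dict=True, kwargs=env_kwargs)
model = HER2.load('path/to/model.zip')  # placeholder checkpoint

obs = env.reset()
done = False
while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
```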
/baselines/sac_sir/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.sac_sir.sac_sir import SAC_SIR -------------------------------------------------------------------------------- /baselines/ppo_sir/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.ppo_sir.ppo_sir import PPO2_SIR 2 | -------------------------------------------------------------------------------- /baselines/sac_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.sac_parallel.sac_parallel import SAC_parallel 2 | -------------------------------------------------------------------------------- /assets/textures/block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/textures/block.png -------------------------------------------------------------------------------- /assets/stls/fetch/estop_link.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/estop_link.stl -------------------------------------------------------------------------------- /assets/stls/fetch/laser_link.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/laser_link.stl -------------------------------------------------------------------------------- /assets/textures/block_hidden.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/textures/block_hidden.png -------------------------------------------------------------------------------- /assets/stls/fetch/gripper_link.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/gripper_link.stl -------------------------------------------------------------------------------- /assets/stls/fetch/torso_fixed_link.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/torso_fixed_link.stl -------------------------------------------------------------------------------- /assets/stls/fetch/base_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/base_link_collision.stl -------------------------------------------------------------------------------- /assets/stls/fetch/bellows_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/bellows_link_collision.stl -------------------------------------------------------------------------------- /assets/stls/fetch/head_pan_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/head_pan_link_collision.stl 
-------------------------------------------------------------------------------- /assets/stls/fetch/l_wheel_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/l_wheel_link_collision.stl -------------------------------------------------------------------------------- /assets/stls/fetch/r_wheel_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/r_wheel_link_collision.stl -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .vscode/* 3 | .idea/* 4 | __pycache__/* 5 | logs/* 6 | model/* 7 | *.png 8 | *.gif 9 | MUJOCO_LOG.TXT 10 | *.npy 11 | *.zip 12 | 13 | -------------------------------------------------------------------------------- /assets/stls/fetch/elbow_flex_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/elbow_flex_link_collision.stl -------------------------------------------------------------------------------- /assets/stls/fetch/head_tilt_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/head_tilt_link_collision.stl -------------------------------------------------------------------------------- /assets/stls/fetch/torso_lift_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/torso_lift_link_collision.stl -------------------------------------------------------------------------------- /assets/stls/fetch/wrist_flex_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/wrist_flex_link_collision.stl -------------------------------------------------------------------------------- /assets/stls/fetch/wrist_roll_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/wrist_roll_link_collision.stl -------------------------------------------------------------------------------- /assets/stls/fetch/forearm_roll_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/forearm_roll_link_collision.stl -------------------------------------------------------------------------------- /assets/stls/fetch/shoulder_lift_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/shoulder_lift_link_collision.stl -------------------------------------------------------------------------------- /assets/stls/fetch/shoulder_pan_link_collision.stl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/shoulder_pan_link_collision.stl -------------------------------------------------------------------------------- /assets/stls/fetch/upperarm_roll_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/upperarm_roll_link_collision.stl -------------------------------------------------------------------------------- /baselines/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.ppo_sir import PPO2_SIR 2 | from baselines.ppo import PPO2 3 | from baselines.her import HER2 4 | from baselines.sac_sir import SAC_SIR 5 | from baselines.sac_parallel import SAC_parallel 6 | -------------------------------------------------------------------------------- /exp_masspoint.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env MasspointPushMultiObstacle-v1 --policy AttentionPolicy --n_object 4 --num_timesteps 3e8 --random_ratio 0.25 --log_path logs/MasspointPushMultiObstacle-v1_random0.25/ppo_attention/0 2 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env MasspointPushMultiObstacle-v1 --policy AttentionPolicy --n_object 4 --num_timesteps 3e8 --random_ratio 0.25 --n_subgoal 2 --parallel --aug_clip 0.0 --reuse_times 4 --start_augment 5e7 --log_path logs/MasspointPushMultiObstacle-v1_random0.25/ppo_attention_sir/0 3 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env MasspointPushMultiObstacle-v1 --policy AttentionPolicy --n_object 4 --num_timesteps 3e8 --random_ratio 0.25 --parallel --aug_clip 0.0 --reuse_times 1 --self_imitate --sil_clip 0.15 --log_path logs/MasspointPushMultiObstacle-v1_random0.25/ppo_attention_sil/0 4 | 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Self-Imitation via Reduction 2 | Paper: [Solving Compositional Reinforcement Learning Problems via Task Reduction](https://openreview.net/forum?id=9SS69KwomAM) 3 | 4 | Project website: https://sites.google.com/view/sir-compositional/. 5 | 6 | ### Get Started 7 | Prerequisite: 8 | 9 | * Ubuntu 16.04 10 | * CUDA 10.0 11 | * [MuJoCo](http://www.mujoco.org/) version 2.0. You can obtain a license and download the binaries from its website. 12 | * [Conda](https://docs.conda.io/en/latest/miniconda.html) 13 | 14 | Install: 15 | 16 | Run ``conda env create -f environment.yml``. You may refer to [Troubleshooting](https://github.com/openai/mujoco-py/blob/master/README.md#troubleshooting) if you have problems installing ``mujoco-py``. 17 | 18 | ### How to Run 19 | The scripts ``exp_push.sh``, ``exp_fetchstack.sh``, ``exp_masspoint.sh`` contain the commands for running different algorithms in *Push*, *Stack* and *Maze* scenarios respectively. 
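For example, the SAC-based SIR run for the *Push* scenario (the ``Hard case 30%`` setting, i.e. ``--random_ratio 0.7``) is launched from ``exp_push.sh`` as:

```
CUDA_VISIBLE_DEVICES=0 python run_her_augment.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --num_timesteps 1e6 --num_workers 32 --start_augment 0 --imitation_coef 0.1 --priority --log_path logs/FetchPushWallObstacle-v4_random0.7/her_sac_sir_32workers/0
```

Logs for each run are written under the path passed to ``--log_path``.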
20 | 21 | -------------------------------------------------------------------------------- /assets/fetch/reach.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /assets/fetch/push.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /assets/fetch/slide.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: py36 2 | channels: 3 | - defaults 4 | dependencies: 5 | - _libgcc_mutex=0.1=main 6 | - ca-certificates=2019.8.28=0 7 | - certifi=2019.9.11=py36_0 8 | - libedit=3.1.20181209=hc058e9b_0 9 | - libffi=3.2.1=hd88cf55_4 10 | - libgcc-ng=9.1.0=hdf63c60_0 11 | - libstdcxx-ng=9.1.0=hdf63c60_0 12 | - ncurses=6.1=he6710b0_1 13 | - openssl=1.1.1d=h7b6447c_2 14 | - pip=19.2.3=py36_0 15 | - python=3.6.9=h265db76_0 16 | - readline=7.0=h7b6447c_5 17 | - sqlite=3.30.0=h7b6447c_0 18 | - tk=8.6.8=hbc83047_0 19 | - wheel=0.33.6=py36_0 20 | - xz=5.2.4=h14c3975_4 21 | - zlib=1.2.11=h7b6447c_3 22 | - pip: 23 | - absl-py==0.8.1 24 | - astor==0.8.0 25 | - atari-py==0.2.6 26 | - cffi==1.13.0 27 | - cloudpickle==1.2.2 28 | - cycler==0.10.0 29 | - cython==0.29.13 30 | - fasteners==0.15 31 | - future==0.18.1 32 | - gast==0.3.2 33 | - glfw==1.8.3 34 | - grpcio==1.24.1 35 | - gym==0.14.0 36 | - imageio==2.6.1 37 | - joblib==0.14.0 38 | - kiwisolver==1.1.0 39 | - markdown==3.1.1 40 | - matplotlib==3.1.1 41 | - monotonic==1.5 42 | - mpi4py==3.0.2 43 | - mujoco-py==2.0.2.7 44 | - numpy==1.16.1 45 | - opencv-python==4.1.1.26 46 | - pandas==0.25.2 47 | - pillow==6.2.0 48 | - protobuf==3.10.0 49 | - pycparser==2.19 50 | - pyglet==1.3.2 51 | - pyparsing==2.4.2 52 | - python-dateutil==2.8.0 53 | - pytz==2019.3 54 | - scipy==1.3.1 55 | - setuptools==39.1.0 56 | - six==1.12.0 57 | - stable-baselines==2.8.0 58 | - tensorboard==1.15.0 59 | - tensorflow-gpu==1.15.0 60 | - termcolor==1.1.0 61 | - werkzeug==0.16.0 62 | prefix: /home/lyf/miniconda3/envs/py36 63 | 64 | -------------------------------------------------------------------------------- /assets/fetch/pick_and_place.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /assets/fetch/push_obstacle.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- 
/analyse_reduction_trace.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import sys 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | if __name__ == '__main__': 7 | trace_file = sys.argv[1] 8 | initial_states = [] 9 | with open(trace_file, 'rb') as f: 10 | try: 11 | while True: 12 | initial_states.append(pickle.load(f)) 13 | except EOFError: 14 | pass 15 | print('total number of states', len(initial_states)) 16 | 17 | def n_doors_blocked(obs): 18 | agent_pos = obs[:3] 19 | box_pos = obs[3: 6] 20 | goal_pos = obs[-7: -4] 21 | obstacles_pos = [obs[6 + 3 * i: 9 + 3 * i] for i in range(3)] 22 | # return sum([abs(pos[1] - 2.5) < 0.1 for pos in obstacles_pos]) 23 | 24 | max_x, min_x = max(agent_pos[0], box_pos[0], goal_pos[0]), min(agent_pos[0], box_pos[0], goal_pos[0]) 25 | max_n = int(max_x / 1.7) 26 | min_n = int(min_x / 1.7) 27 | count = 0 28 | for pos_obstacle in obstacles_pos: 29 | if abs(pos_obstacle[1] - 2.5) < 1e-3 and min_n < round(pos_obstacle[0] / 1.7) < max_n + 1: 30 | count += 1 31 | return count 32 | 33 | def smooth(arr, window=100): 34 | smoothed = np.zeros_like(arr) 35 | for i in range(arr.shape[0]): 36 | smoothed[i] = np.mean(arr[max(i - window + 1, 0): i + 1]) 37 | return smoothed 38 | 39 | blocked_doors = list(map(n_doors_blocked, initial_states)) 40 | blocked_doors = np.asarray(blocked_doors) 41 | print(len(blocked_doors)) 42 | reduction_masks = [(blocked_doors == i).astype(np.float32) for i in range(4)] 43 | reduction_percents = [smooth(mask, 1000) for mask in reduction_masks] 44 | for i in range(4): 45 | plt.plot(reduction_percents[i], label='%d blocked' % i) 46 | plt.legend() 47 | plt.show() -------------------------------------------------------------------------------- /utils/eval_stack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # Deprecated 5 | def pp_eval_model(eval_env, model): 6 | env = eval_env 7 | env.unwrapped.random_ratio = 1.0 8 | temp = env.unwrapped.task_array.copy() 9 | env.unwrapped.task_array = [(env.n_object, i) for i in range(env.n_object)] 10 | n_episode = 0 11 | ep_rewards = [] 12 | ep_successes = [] 13 | while n_episode < 50: 14 | ep_reward = 0.0 15 | ep_success = 0.0 16 | obs = env.reset() 17 | while env.current_nobject != env.n_object or env.task_mode != 0: 18 | obs = env.reset() 19 | done = False 20 | while not done: 21 | action, _ = model.predict(obs) 22 | obs, reward, done, info = env.step(action) 23 | ep_reward += reward 24 | ep_success += info['is_success'] 25 | ep_rewards.append(ep_reward) 26 | ep_successes.append(ep_success) 27 | n_episode += 1 28 | # return np.mean(ep_rewards) 29 | env.unwrapped.task_array = temp 30 | return np.mean(ep_successes) 31 | 32 | 33 | def eval_model(env, model, max_nobject, random_ratio, init_on_table=False): 34 | # random_ratio 0: stack only, 1: pick and place only 35 | temp = env.unwrapped.task_array.copy() 36 | if init_on_table: 37 | env.unwrapped.task_array = [(max_nobject, i) for i in range(min(2, max_nobject))] 38 | else: 39 | env.unwrapped.task_array = [(max_nobject, i) for i in range(max_nobject)] 40 | env.unwrapped.random_ratio = random_ratio 41 | n_episode = 0 42 | ep_successes = [] 43 | while n_episode < 50: 44 | ep_reward = 0.0 45 | ep_success = 0.0 46 | obs = env.reset() 47 | done = False 48 | while not done: 49 | action, _ = model.predict(obs) 50 | obs, reward, done, info = env.step(action) 51 | ep_reward += reward 52 | ep_success += 
info['is_success'] 53 | ep_successes.append(ep_success) 54 | n_episode += 1 55 | env.unwrapped.task_array = temp 56 | return np.mean(ep_successes) -------------------------------------------------------------------------------- /assets/fetch/push_wall.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /assets/fetch/push_wall_obstacle.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /exp_masspoint_3room.sh: -------------------------------------------------------------------------------- 1 | # Hard case 30%. 2 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env MasspointPushDoubleObstacle-v1 --policy AttentionPolicy --num_timesteps 3e8 --random_ratio 0.7 --log_path logs/MasspointPushDoubleObstacle-v1_random0.7/ppo_attention/0 3 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env MasspointPushDoubleObstacle-v1 --policy AttentionPolicy --num_timesteps 3e8 --random_ratio 0.7 --reward_type dense --log_path logs/MasspointPushDoubleObstacle-v1_random0.7/ppo_attention_ds/0 4 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env MasspointPushDoubleObstacle-v1 --policy AttentionPolicy --num_timesteps 3e8 --random_ratio 0.7 --parallel --aug_clip 0.0 --reuse_times 1 --self_imitate --sil_clip 0.15 --log_path logs/MasspointPushDoubleObstacle-v1_random0.7/ppo_attention_sil/0 5 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env MasspointPushDoubleObstacle-v1 --policy AttentionPolicy --num_timesteps 3e8 --random_ratio 0.7 --n_subgoal 2 --parallel --aug_clip 0.0 --reuse_times 4 --start_augment 3e7 --log_path logs/MasspointPushDoubleObstacle-v1_random0.7/ppo_attention_sir/0 6 | 7 | # Uniform. 
8 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env MasspointPushDoubleObstacle-v1 --policy AttentionPolicy --num_timesteps 3e8 --random_ratio 1.0 --log_path logs/MasspointPushDoubleObstacle-v1_random1.0/ppo_attention/0 9 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env MasspointPushDoubleObstacle-v1 --policy AttentionPolicy --num_timesteps 3e8 --random_ratio 1.0 --reward_type dense --log_path logs/MasspointPushDoubleObstacle-v1_random1.0/ppo_attention_ds/0 10 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env MasspointPushDoubleObstacle-v1 --policy AttentionPolicy --num_timesteps 3e8 --random_ratio 1.0 --parallel --aug_clip 0.0 --reuse_times 1 --self_imitate --sil_clip 0.15 --log_path logs/MasspointPushDoubleObstacle-v1_random1.0/ppo_attention_sil/0 11 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env MasspointPushDoubleObstacle-v1 --policy AttentionPolicy --num_timesteps 3e8 --random_ratio 1.0 --n_subgoal 2 --parallel --aug_clip 0.0 --reuse_times 4 --start_augment 3e7 --log_path logs/MasspointPushDoubleObstacle-v1_random1.0/ppo_attention_sir/0 12 | -------------------------------------------------------------------------------- /exp_fetchstack.sh: -------------------------------------------------------------------------------- 1 | # 2 boxes 2 | CUDA_VISIBLE_DEVICES=1 python run_her.py --env FetchStack-v1 --num_timesteps 1e6 --curriculum --num_workers 32 --policy AttentionPolicy --reward_type sparse --n_object 2 --priority --log_path logs/FetchStack-v1_adapt/her_sac_32workers/2obj/0 3 | CUDA_VISIBLE_DEVICES=0 python run_her_augment.py --env FetchStack-v1 --num_timesteps 1e6 --curriculum --num_workers 32 --policy AttentionPolicy --reward_type sparse --n_object 2 --imitation_coef 0.1 --priority --log_path logs/FetchStack-v1_adapt/her_sac_sir_32workers/2obj/0 4 | CUDA_VISIBLE_DEVICES=1 python run_her.py --env FetchStack-v1 --num_timesteps 1e6 --curriculum --num_workers 32 --policy AttentionPolicy --reward_type sparse --n_object 2 --sil --sil_coef 0.1 --priority --log_path logs/FetchStack-v1_adapt/her_sac_sil_32workers/2obj/0 5 | CUDA_VISIBLE_DEVICES=1 python run_her.py --env FetchStack-v1 --num_timesteps 1e6 --curriculum --num_workers 32 --policy AttentionPolicy --reward_type dense --n_object 2 --priority --log_path logs/FetchStack-v1_adapt/her_sac_ds_32workers/2obj/0 6 | # 3 boxes 7 | CUDA_VISIBLE_DEVICES=1 python run_her.py --env FetchStack-v1 --num_timesteps 3.5e6 --curriculum --num_workers 32 --policy AttentionPolicy --reward_type sparse --n_object 3 --priority --log_path logs/FetchStack-v1_adapt/her_sac_32workers/3obj/0 8 | CUDA_VISIBLE_DEVICES=0 python run_her_augment.py --env FetchStack-v1 --num_timesteps 3.5e6 --curriculum --num_workers 32 --policy AttentionPolicy --reward_type sparse --n_object 3 --imitation_coef 0.1 --priority --log_path logs/FetchStack-v1_adapt/her_sac_sir_32workers/3obj/0 9 | CUDA_VISIBLE_DEVICES=1 python run_her.py --env FetchStack-v1 --num_timesteps 3.5e6 --curriculum --num_workers 32 --policy AttentionPolicy --reward_type sparse --n_object 3 --sil --sil_coef 0.1 --priority --log_path logs/FetchStack-v1_adapt/her_sac_sil_32workers/3obj/0 10 | CUDA_VISIBLE_DEVICES=1 python run_her.py --env FetchStack-v1 --num_timesteps 3.5e6 --curriculum --num_workers 32 --policy AttentionPolicy --reward_type dense --n_object 3 --priority --log_path logs/FetchStack-v1_adapt/her_sac_ds_32workers/3obj/0 11 | 12 | 13 | # experimental 14 | CUDA_VISIBLE_DEVICES=0 python run_her.py --env FetchStack-v1 --num_timesteps 3.5e6 --curriculum --num_workers 32 --policy 
RelationalPolicy --reward_type sparse --n_object 3 --priority --log_path logs/FetchStack-v1_adapt/her_sac_32workers/relational/3obj/0 -------------------------------------------------------------------------------- /plot/plot_compare_hiro.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import numpy as np 3 | import pandas 4 | import matplotlib.pyplot as plt 5 | from scipy import interpolate 6 | 7 | 8 | if __name__ == '__main__': 9 | option = sys.argv[1] 10 | log_paths = sys.argv[2:] 11 | assert option in ['eval'] 12 | window = 20 13 | def get_item(log_file, label): 14 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True) 15 | return data[label].values 16 | def smooth(array, window): 17 | out = np.zeros(array.shape[0] - window) 18 | for i in range(out.shape[0]): 19 | out[i] = np.mean(array[i:i + window]) 20 | return out 21 | fig, ax = plt.subplots(1, 1, figsize=(5, 5)) 22 | for log_path in log_paths: 23 | progress_file = os.path.join(log_path, 'progress.csv') 24 | eval_file = os.path.join(log_path, 'eval.csv') 25 | if 'hiro' in log_path: 26 | eval_reward = get_item(eval_file, 'Value') 27 | eval_step = get_item(eval_file, 'Step') 28 | elif 'dsc' in log_path: 29 | raw_reward = get_item(eval_file, 'Value') 30 | eval_step = get_item(eval_file, 'Step') 31 | # Mean last 100 32 | eval_reward = np.zeros_like(raw_reward) 33 | for i in range(eval_reward.shape[0]): 34 | eval_reward[i] = np.mean(raw_reward[max(i - 100 + 1, 0): i + 1]) 35 | else: 36 | eval_reward = get_item(eval_file, 'mean_eval_reward') 37 | total_timesteps = get_item(progress_file, 'total timesteps') 38 | try: 39 | original_timesteps = get_item(progress_file, 'original_timesteps') 40 | except KeyError: 41 | original_timesteps = total_timesteps 42 | step_expand_fn = interpolate.interp1d(original_timesteps, total_timesteps, fill_value="extrapolate") 43 | eval_step = get_item(eval_file, 'n_updates') 44 | # else: 45 | # eval_step = get_item(progress_file, 'total_timesteps') 46 | # eval_reward = get_item(progress_file, 'ep_reward_mean') 47 | if option == 'eval': 48 | ax.plot(smooth(eval_step, window), smooth(eval_reward, window), label=log_path) 49 | if option == 'eval': 50 | ax.set_title('success rate') 51 | ax.grid() 52 | plt.legend() 53 | plt.show() 54 | 55 | -------------------------------------------------------------------------------- /assets/fetch/push_wall_heavy_obstacle_v2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /assets/fetch/push_wall_heavy_obstacle.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /assets/fetch/push_wall_heavy_obstacle_v5.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 
| 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /utils/wrapper.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | 4 | class DoneOnSuccessWrapper(gym.Wrapper): 5 | """ 6 | Reset on success and offsets the reward. 7 | Useful for GoalEnv. 8 | """ 9 | def __init__(self, env, reward_offset=1.0): 10 | super(DoneOnSuccessWrapper, self).__init__(env) 11 | self.reward_offset = reward_offset 12 | 13 | def step(self, action): 14 | obs, reward, done, info = self.env.step(action) 15 | done = done or info.get('is_success', False) 16 | reward += self.reward_offset 17 | return obs, reward, done, info 18 | 19 | def compute_reward(self, achieved_goal, desired_goal, info): 20 | reward = self.env.compute_reward(achieved_goal, desired_goal, info) 21 | return reward + self.reward_offset 22 | 23 | 24 | class ScaleRewardWrapper(gym.Wrapper): 25 | def __init__(self, env, reward_scale=1.0): 26 | super(ScaleRewardWrapper, self).__init__(env) 27 | self.reward_scale = reward_scale 28 | 29 | def step(self, action): 30 | obs, reward, done, info = self.env.step(action) 31 | reward /= self.reward_scale 32 | return obs, reward, done, info 33 | 34 | def compute_reward(self, achieved_goal, desired_goal, info): 35 | reward = self.env.compute_reward(achieved_goal, desired_goal, info) 36 | return reward / self.reward_scale 37 | 38 | 39 | class FlexibleTimeLimitWrapper(gym.Wrapper): 40 | ''' 41 | ONLY applicable to Stacking environment! 42 | We can set max_episode_steps = None for gym, (so gym.TimeLimitWrapper is not applied), 43 | then use this class to avoid potential conflict. 44 | ''' 45 | def __init__(self, env, time_limit=None): 46 | super(FlexibleTimeLimitWrapper, self).__init__(env) 47 | self.time_limit = time_limit 48 | assert 'FetchStack' in env.spec.id 49 | assert env.spec.max_episode_steps is None 50 | self._elapsed_steps = None 51 | 52 | def step(self, action): 53 | assert self._elapsed_steps is not None, "Cannot call env.step() before calling reset()" 54 | self.time_limit = self.env.unwrapped.current_nobject * 50 if self.env.unwrapped.current_nobject > 2 else 100 55 | observation, reward, done, info = self.env.step(action) 56 | self._elapsed_steps += 1 57 | if self._elapsed_steps >= self.time_limit: 58 | info['TimeLimit.truncated'] = not done 59 | done = True 60 | return observation, reward, done, info 61 | 62 | def reset(self, **kwargs): 63 | self._elapsed_steps = 0 64 | return self.env.reset(**kwargs) 65 | -------------------------------------------------------------------------------- /assets/masspoint/maze.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /assets/masspoint/smaze.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /assets/fetch/generate_xml.py: -------------------------------------------------------------------------------- 1 | 
BASIC_COLORS = ["0.1 0.1 0.5", "0.1 0.8 0.3", "1.0 0.9 0.0", "0.8 0.2 0.8", "1.0 0.0 0.0", "0 0 0"] 2 | 3 | base = ''' 4 | 5 | 6 | 9 | 10 | 11 | {assets} 12 | 13 | 14 | 15 | 16 | {target_sites} 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | {object_bodies} 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | ''' 34 | 35 | 36 | def generate_xml(num_blocks): 37 | colors = BASIC_COLORS[:num_blocks] 38 | site_base = '' 39 | block_base = ''' 40 | 41 | 42 | 43 | ''' 44 | asset_base = '' 45 | 46 | sites = [] 47 | block_bodies = [] 48 | assets = [] 49 | sites.append(site_base.format(**dict(id=0, color=colors[0]))) 50 | for i in range(num_blocks): 51 | block_bodies.append(block_base.format(**dict(id=i, color=colors[i]))) 52 | assets.append(asset_base.format(**dict(id=i, color=colors[i]))) 53 | 54 | return base.format( 55 | **dict(assets="\n".join(assets), target_sites="\n".join(sites), object_bodies="\n".join(block_bodies))) -------------------------------------------------------------------------------- /assets/fetch/pick_and_place_stack3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /assets/fetch/pick_and_place_stack.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 37 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /plot/debug_plot_value.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import numpy as np 3 | import pandas 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | if __name__ == '__main__': 8 | log_path = sys.argv[1] 9 | window = 10 10 | def get_item(log_file, label): 11 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True) 12 | return data[label].values 13 | def smooth(array, window): 14 | out = np.zeros(array.shape[0] - window) 15 | for i in range(out.shape[0]): 16 | out[i] = np.mean(array[i:i + window]) 17 | return out 18 | # print(get_item(log_path, 'reference_value').shape) 19 | # original_value = get_item(log_path, 'reference_value')[:] 20 | value1 = get_item(log_path, 'value1')[0:] 21 | value2 = get_item(log_path, 'value2')[0:] 22 | min_value = np.min(np.concatenate([np.expand_dims(value1, axis=0), np.expand_dims(value2, axis=0)], axis=0), axis=0) 23 | is_success = get_item(log_path, 'is_success')[0:] 24 | num_timesteps = get_item(log_path, 'num_timesteps')[0:] 25 | print(num_timesteps[20000], num_timesteps[40000], num_timesteps[-1]) 26 | success_idx = np.where(is_success > 0.5)[0] 27 | fail_idx = np.where(is_success < 0.5)[0] 28 | print(value1.shape) 29 | 30 | fig, ax = plt.subplots(1, 1, figsize=(8, 5)) 31 | plt.rcParams.update({'font.size': 22, 'legend.fontsize': 22, 'xtick.labelsize': 18, 'ytick.labelsize': 18, 'axes.labelsize': 18}) 32 | # ax.plot(smooth(original_value, 100), alpha=0.5, label='reference') 33 | # ax.scatter(fail_idx, value1[fail_idx]-original_value[fail_idx], c='tab:orange', s=0.1, label='fail value1') 34 | # ax.scatter(fail_idx, value2[fail_idx]-original_value[fail_idx], c='tab:green', s=0.1, 
label='fail value2') 35 | # ax.scatter(success_idx, value1[success_idx]-original_value[success_idx], c='tab:red', s=0.1, label='success value1') 36 | # ax.scatter(success_idx, value2[success_idx]-original_value[success_idx], c='tab:purple', s=0.1, label='success value2') 37 | # Mean value 38 | ax.scatter(fail_idx, (value1[fail_idx] + value2[fail_idx]) / 2, c='tab:orange', s=1.0, label='fail mean value') 39 | ax.scatter(success_idx, (value1[success_idx] + value2[success_idx]) / 2, c='tab:green', s=4.0, label='success mean value') 40 | # ax.axhline(0.5, linestyle='--', c='tab:blue') 41 | # ax.axhline(1.0, linestyle='--', c='tab:blue') 42 | # ax.plot(smooth(np.arange(len(value1)), 500), smooth((value1 + value2) / 2, 500), c='tab:red', label='smoothed mean value') 43 | # Min value 44 | # ax.scatter(fail_idx, min_value[fail_idx], c='tab:orange', s=1.0, label='fail min value') 45 | # ax.scatter(success_idx, min_value[success_idx], c='tab:green', s=4.0, label='succes min value') 46 | # ax.plot(smooth(np.arange(len(min_value)), 500), smooth(min_value, 500), c='tab:red', label='smoothed min value') 47 | # ax.scatter(fail_idx, original_value[fail_idx], c='tab:orange', s=0.1, label='fail original value') 48 | # ax.scatter(success_idx, original_value[success_idx], c='tab:green', s=4.0, label='success original value') 49 | # ax.set_yscale('log') 50 | plt.legend(loc="upper right", bbox_to_anchor=(1.0, 1.0)) 51 | plt.tight_layout(pad=0.05) 52 | plt.savefig('value_sigma_sac.png') 53 | plt.show() 54 | -------------------------------------------------------------------------------- /success_len_calculation.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import numpy as np 3 | # from run_her import make_env, get_env_kwargs 4 | from baselines import HER2, PPO2 5 | from gym.wrappers import FlattenDictWrapper 6 | 7 | 8 | if __name__ == '__main__': 9 | env_id = sys.argv[1] 10 | algo = sys.argv[2] 11 | assert algo in ['sac', 'ppo'] 12 | model_paths = sys.argv[3:] 13 | from utils.make_env_utils import make_env, get_env_kwargs 14 | env_kwargs = get_env_kwargs(env_id, random_ratio=0.0, n_object=3) 15 | 16 | aug_env_id = env_id.split('-')[0] + 'Unlimit-' + env_id.split('-')[1] 17 | aug_env_kwargs = env_kwargs.copy() 18 | aug_env_kwargs['max_episode_steps'] = None 19 | 20 | aug_env = make_env(aug_env_id, rank=0, flatten_dict=True, kwargs=aug_env_kwargs) 21 | # if algo == 'sac': 22 | # aug_env = FlattenDictWrapper(aug_env, ['observation', 'achieved_goal', 'desired_goal']) 23 | 24 | if env_id == 'FetchStack-v1': 25 | # aug_env.set_task_array([(env_kwargs['n_object'], i) for i in range(env_kwargs['n_object'])]) 26 | aug_env.set_task_array([(3, 0), (3, 1), (3, 2)]) 27 | 28 | goal_dim = aug_env.goal.shape[0] 29 | obs_dim = aug_env.observation_space.shape[0] - 2 * goal_dim 30 | noise_mag = aug_env.size_obstacle[1] 31 | n_object = aug_env.n_object 32 | # model.model.env_id = env_id 33 | # model.model.goal_dim = goal_dim 34 | # model.model.obs_dim = obs_dim 35 | # model.model.noise_mag = noise_mag 36 | # model.model.n_object = n_object 37 | 38 | test_states, test_goals = [], [] 39 | test_selected_objects, test_current_nobject = [], [] 40 | for i in range(500): 41 | obs = aug_env.reset() 42 | goal = obs[-goal_dim:] 43 | initial_state = aug_env.get_state() 44 | test_states.append(initial_state) 45 | test_goals.append(goal) 46 | if env_id == 'FetchStack-v1': 47 | test_selected_objects.append(aug_env.selected_objects) 48 | 
test_current_nobject.append(aug_env.current_nobject) 49 | for model_path in model_paths: 50 | if algo == 'sac': 51 | model = HER2.load(model_path) 52 | elif algo == 'ppo': 53 | model = PPO2.load(model_path) 54 | if 'ds' in model_path: 55 | aug_env.unwrapped.reward_type = 'dense' 56 | else: 57 | aug_env.unwrapped.reward_type = 'sparse' 58 | success_len = [] 59 | for i in range(len(test_states)): 60 | aug_env.set_state(test_states[i]) 61 | aug_env.set_goal(test_goals[i]) 62 | if env_id == 'FetchStack-v1': 63 | aug_env.unwrapped.selected_objects = test_selected_objects[i] 64 | aug_env.unwrapped.current_nobject = test_current_nobject[i] 65 | obs = aug_env.get_obs() 66 | obs = np.concatenate([obs[key] for key in ['observation', 'achieved_goal', 'desired_goal']]) 67 | done = False 68 | step_so_far = 0 69 | while not done: 70 | action, _ = model.predict(obs, deterministic=True) 71 | obs, reward, done, info = aug_env.step(action) 72 | step_so_far += 1 73 | if step_so_far >= env_kwargs['max_episode_steps']: 74 | break 75 | if done: 76 | success_len.append(step_so_far) 77 | print(model_path, 'mean success len:', np.mean(success_len), 'over %d trajs' % len(success_len)) 78 | -------------------------------------------------------------------------------- /assets/masspoint/single_obstacle.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /exp_push.sh: -------------------------------------------------------------------------------- 1 | # SAC-based 2 | # Hard case 30%. 3 | # SAC 4 | CUDA_VISIBLE_DEVICES=0 python run_her.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --num_timesteps 1e6 --num_workers 32 --priority --log_path logs/FetchPushWallObstacle-v4_random0.7/her_sac_32workers/0 5 | # SIR 6 | CUDA_VISIBLE_DEVICES=0 python run_her_augment.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --num_timesteps 1e6 --num_workers 32 --start_augment 0 --imitation_coef 0.1 --priority --log_path logs/FetchPushWallObstacle-v4_random0.7/her_sac_sir_32workers/0 7 | # SIL 8 | CUDA_VISIBLE_DEVICES=0 python run_her.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --sil --num_timesteps 1e6 --num_workers 32 --sil_coef 0.1 --priority --log_path logs/FetchPushWallObstacle-v4_random0.7/her_sac_sil_32workers/0 9 | # DS 10 | CUDA_VISIBLE_DEVICES=0 python run_her.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --reward_type dense --num_timesteps 1e6 --num_workers 32 --priority --log_path logs/FetchPushWallObstacle-v4_random0.7/her_sac_ds_32workers/0 11 | 12 | # Uniform. 
13 | CUDA_VISIBLE_DEVICES=0 python run_her.py --env FetchPushWallObstacle-v4 --random_ratio 1.0 --num_timesteps 1e6 --num_workers 32 --priority --log_path logs/FetchPushWallObstacle-v4_random1.0/her_sac_32workers/0 14 | CUDA_VISIBLE_DEVICES=0 python run_her_augment.py --env FetchPushWallObstacle-v4 --random_ratio 1.0 --num_timesteps 1e6 --num_workers 32 --start_augment 0 --imitation_coef 0.1 --priority --log_path logs/FetchPushWallObstacle-v4_random1.0/her_sac_sir_32workers/0 15 | CUDA_VISIBLE_DEVICES=0 python run_her.py --env FetchPushWallObstacle-v4 --random_ratio 1.0 --sil --num_timesteps 1e6 --num_workers 32 --sil_coef 0.1 --priority --log_path logs/FetchPushWallObstacle-v4_random1.0/her_sac_sil_32workers/0 16 | CUDA_VISIBLE_DEVICES=0 python run_her.py --env FetchPushWallObstacle-v4 --random_ratio 1.0 --reward_type dense --num_timesteps 1e6 --num_workers 32 --priority --log_path logs/FetchPushWallObstacle-v4_random1.0/her_sac_ds_32workers/0 17 | 18 | # PPO-based 19 | # Hard 30%. 20 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --num_timesteps 5e7 --log_path logs/FetchPushWallObstacle-v4_random0.7/ppo/0 21 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --num_timesteps 5e7 --n_subgoal 2 --parallel --aug_clip 0.0 --reuse_times 8 --log_path logs/FetchPushWallObstacle-v4_random0.7/ppo_sir/0 22 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --num_timesteps 5e7 --n_subgoal 2 --parallel --self_imitate --aug_clip 0.0 --reuse_times 1 --log_path logs/FetchPushWallObstacle-v4_random0.7/ppo_sil/0 23 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --num_timesteps 5e7 --reward_type dense --log_path logs/FetchPushWallObstacle-v4_random0.7/ppo_ds/0 24 | 25 | # Uniform. 
26 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env FetchPushWallObstacle-v4 --random_ratio 1.0 --num_timesteps 5e7 --log_path logs/FetchPushWallObstacle-v4_random1.0/ppo/0 27 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env FetchPushWallObstacle-v4 --random_ratio 1.0 --num_timesteps 5e7 --n_subgoal 2 --parallel --aug_clip 0.0 --reuse_times 8 --log_path logs/FetchPushWallObstacle-v4_random1.0/ppo_sir/0 28 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env FetchPushWallObstacle-v4 --random_ratio 1.0 --num_timesteps 5e7 --n_subgoal 2 --parallel --self_imitate --aug_clip 0.0 --reuse_times 1 --log_path logs/FetchPushWallObstacle-v4_random1.0/ppo_sil/0 29 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env FetchPushWallObstacle-v4 --random_ratio 1.0 --num_timesteps 5e7 --reward_type dense --log_path logs/FetchPushWallObstacle-v4_random1.0/ppo_ds/0 30 | -------------------------------------------------------------------------------- /plot/plot_experiment_visual.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas 3 | import numpy as np 4 | import sys, os 5 | from scipy import interpolate 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def get_item(log_file, label): 10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True) 11 | return data[label].values 12 | 13 | 14 | def smooth(array, window): 15 | out = np.zeros(array.shape[0] - window) 16 | for i in range(out.shape[0]): 17 | out[i] = np.mean(array[i:i + window]) 18 | return out 19 | 20 | 21 | if __name__ == '__main__': 22 | folder_name = sys.argv[1] 23 | env_name = sys.argv[2] 24 | assert env_name in ['uwall'] 25 | # assert mode in ['train', 'hard', 'iteration'] 26 | max_timesteps = {'uwall': 2.1e6} 27 | df_timesteps, df_sr, df_legend= [], [], [] 28 | subfolders = ['ppo', 'sir', 'sil'] 29 | # subfolders = ['ppo_attention_new', 'ppo_attention_sir_new', 'ppo_attention_sil_new'] 30 | 31 | for subfolder in subfolders: 32 | last_sr = [] 33 | for i in range(3): 34 | if not os.path.exists(os.path.join(folder_name, subfolder, str(i), 'progress.csv')): 35 | continue 36 | progress_file = os.path.join(folder_name, subfolder, str(i), 'progress.csv') 37 | raw_success_rate = get_item(progress_file, 'ep_success_rate') 38 | raw_total_timesteps = get_item(progress_file, 'total_timesteps') 39 | print(raw_total_timesteps.shape) 40 | sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, fill_value="extrapolate") 41 | timesteps = np.arange(0, max_timesteps[env_name], max_timesteps[env_name] // 70) 42 | print(timesteps[0], timesteps[-1], raw_total_timesteps[0], raw_total_timesteps[-1]) 43 | success_rate = sr_f(timesteps) 44 | timesteps = smooth(timesteps, 5) 45 | success_rate = smooth(success_rate, 5) 46 | df_timesteps.append(timesteps) 47 | df_sr.append(success_rate) 48 | last_sr.append(success_rate[-1]) 49 | df_legend.append(np.array([subfolder.upper()] * len(timesteps))) 50 | 51 | print(subfolder, 'sr', np.mean(last_sr)) 52 | df_timesteps = np.concatenate(df_timesteps, axis=0).tolist() 53 | df_sr = np.concatenate(df_sr, axis=0).tolist() 54 | df_legend = np.concatenate(df_legend, axis=0).tolist() 55 | data = {'samples': df_timesteps, 'success_rate': df_sr, 'algo': df_legend} 56 | sr_timesteps = pandas.DataFrame(data) 57 | 58 | wspace = .3 59 | bottom = .3 60 | margin = .1 61 | # left = .08 62 | left = .1 63 | width = 1.25 / ((1. - left) / (2 + wspace + margin / 2)) 64 | height = 1.5 / ((1. 
- bottom) / (1 + margin / 2)) 65 | 66 | plt.style.use("ggplot") 67 | # plt.rcParams.update({'legend.fontsize': 14}) 68 | p = sns.color_palette() 69 | sns.set_palette([p[i] for i in range(len(subfolders))]) 70 | f, axes = plt.subplots(1, 1, figsize=(width, height)) 71 | sns.lineplot(x='samples', y='success_rate', hue='algo', ax=axes, data=sr_timesteps) 72 | axes.set_xlabel('samples') 73 | axes.set_ylabel('avg. succ. rate') 74 | axes.get_legend().remove() 75 | 76 | handles, labels = axes.get_legend_handles_labels() 77 | f.legend(handles[:], ['PPO', 'SIR', 'SIL'], loc="lower right", ncol=1, bbox_to_anchor=(0.99, 0.18), title='') 78 | f.subplots_adjust(top=1. - margin / height, bottom=0.21, wspace=wspace, left=left, right=1. - margin / width) 79 | plt.savefig(os.path.join(folder_name, '../', os.path.basename(folder_name) + '.pdf')) 80 | print(os.path.join(folder_name, '../', os.path.basename(folder_name) + '.pdf')) 81 | plt.show() 82 | -------------------------------------------------------------------------------- /plot/plot_reuse.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas 3 | import numpy as np 4 | import sys, os 5 | from scipy import interpolate 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def get_item(log_file, label): 10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True) 11 | return data[label].values 12 | 13 | 14 | def smooth(array, window): 15 | out = np.zeros(array.shape[0] - window) 16 | for i in range(out.shape[0]): 17 | out[i] = np.mean(array[i:i + window]) 18 | return out 19 | 20 | if __name__ == '__main__': 21 | folder_name = sys.argv[1] 22 | # mode = sys.argv[2] 23 | # assert mode in ['success_rate', 'augment', 'eval'] 24 | plt.style.use("ggplot") 25 | # plt.rcParams.update({'font.size': 20, 'legend.fontsize': 20, 26 | # 'axes.formatter.limits': [-5, 3]}) 27 | wspace = .3 28 | bottom = .3 29 | margin = .1 30 | left = .08 31 | width = 3.5 / ((1. - left) / (2 + wspace + margin / 2)) 32 | height = 1.5 / ((1. 
- bottom) / (1 + margin / 2)) 33 | 34 | fig, axes = plt.subplots(1, 3, figsize=(width, height)) 35 | for subfolder in ['sir_re1', 'sir_re4', 'sir_re8', 'sir_re16']: 36 | progress_file = os.path.join(folder_name, subfolder, '0', 'progress.csv') 37 | eval_file = os.path.join(folder_name, subfolder, '0', 'eval.csv') 38 | success_rate = get_item(progress_file, 'ep_reward_mean') 39 | total_timesteps = get_item(progress_file, 'total_timesteps') 40 | original_steps_per_iter = get_item(progress_file, 'original_timesteps')[0] 41 | augment_steps = get_item(progress_file, 'augment_steps') 42 | augment_ratio = augment_steps / (augment_steps + original_steps_per_iter) 43 | eval_reward = get_item(eval_file, 'mean_eval_reward') 44 | L = np.sum(total_timesteps < 3e7) 45 | total_timesteps = smooth(total_timesteps[:L], 20) 46 | success_rate = smooth(success_rate[:L], 20) 47 | augment_ratio = smooth(augment_ratio[:L], 20) 48 | augment_number = smooth(augment_steps[:L], 20) 49 | eval_reward = smooth(eval_reward[:L], 20) 50 | # if mode == 'success_rate': 51 | # ax.plot(total_timesteps, success_rate, label=subfolder.upper()) 52 | # ax.set_ylabel('success rate') 53 | # elif mode == 'augment': 54 | # ax.plot(total_timesteps, augment_number, label=subfolder.upper()) 55 | # ax.set_ylabel('number of augmented data') 56 | # elif mode == 'eval': 57 | # ax.plot(total_timesteps, eval_reward, label=subfolder.upper()) 58 | # ax.set_ylabel('success rate') 59 | # ax.set_xlabel('samples') 60 | axes[0].plot(total_timesteps, success_rate, label=subfolder.upper()) 61 | axes[0].set_xlabel('samples') 62 | axes[0].set_ylabel('success rate') 63 | axes[1].set_xlabel('samples') 64 | axes[1].plot(total_timesteps, eval_reward, label=subfolder.upper()) 65 | axes[1].set_ylabel('success rate') 66 | axes[2].plot(total_timesteps, augment_ratio, label=subfolder.upper()) 67 | axes[2].set_xlabel('samples') 68 | axes[2].set_ylabel('ratio of aug. data') 69 | # axes[0].get_legend().remove() 70 | # axes[1].get_legend().remove() 71 | # axes[2].get_legend().remove() 72 | 73 | # if mode == 'augment': 74 | # plt.legend(loc="lower right", bbox_to_anchor=(1.0, 0.0), ncol=1) 75 | fig.legend(labels=['RE1', 'RE4', 'RE8', 'RE16'], loc="lower center", bbox_to_anchor=(0.5, -0.03), ncol=4) 76 | fig.subplots_adjust(top=1. - margin / height, bottom=0.31, wspace=wspace, left=left, right=1. 
- margin / width) 77 | plt.savefig('reuse_ablation' + '.pdf') 78 | 79 | 80 | -------------------------------------------------------------------------------- /plot/plot_compare.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import numpy as np 3 | import pandas 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | if __name__ == '__main__': 8 | option = sys.argv[1] 9 | log_paths = sys.argv[2:] 10 | assert option in ['success_rate', 'eval', 'entropy', 'aug_ratio', 'self_aug_ratio'] 11 | window = 20 12 | def get_item(log_file, label): 13 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True) 14 | return data[label].values 15 | def smooth(array, window): 16 | out = np.zeros(array.shape[0] - window) 17 | for i in range(out.shape[0]): 18 | out[i] = np.mean(array[i:i + window]) 19 | return out 20 | fig, ax = plt.subplots(1, 2, figsize=(10, 5)) 21 | for log_path in log_paths: 22 | progress_file = os.path.join(log_path, 'progress.csv') 23 | eval_file = os.path.join(log_path, 'eval.csv') 24 | if 'ds' in log_path: 25 | success_rate = get_item(progress_file, 'ep_success_rate') 26 | else: 27 | success_rate = get_item(progress_file, 'ep_reward_mean') 28 | total_timesteps = get_item(progress_file, 'total_timesteps') 29 | entropy = get_item(progress_file, 'policy_entropy') 30 | try: 31 | eval_reward = get_item(eval_file, 'mean_eval_reward') 32 | n_updates = get_item(eval_file, 'n_updates') 33 | except: 34 | pass 35 | # success_rate = smooth(success_rate, window) 36 | # total_timesteps = smooth(total_timesteps, window) 37 | if option == 'success_rate': 38 | ax[0].plot(smooth(total_timesteps, window), smooth(success_rate, window), label=log_path) 39 | elif option == 'eval': 40 | # ax[0].plot(n_updates*65536, eval_reward, label=log_path) 41 | 42 | ax[0].plot(smooth(total_timesteps[n_updates-1], window), smooth(eval_reward, window), label=log_path) 43 | elif option == 'entropy': 44 | ax[0].plot(smooth(total_timesteps, window), smooth(entropy, window), label=log_path) 45 | elif option == 'aug_ratio': 46 | original_success = get_item(progress_file, 'original_success') 47 | total_success = get_item(progress_file, 'total_success') 48 | aug_ratio = (total_success - original_success) / (total_success + 1e-8) 49 | print(total_timesteps.shape, aug_ratio.shape) 50 | ax[0].plot(smooth(total_timesteps, 2), smooth(aug_ratio, 2), label=log_path) 51 | elif option == 'self_aug_ratio': 52 | self_aug_ratio = get_item(progress_file, 'self_aug_ratio') 53 | ax[0].plot(smooth(total_timesteps, window), smooth(self_aug_ratio, window), label=log_path) 54 | try: 55 | original_steps = get_item(progress_file, 'original_timesteps')[0] 56 | augment_steps = get_item(progress_file, 'augment_steps') / original_steps 57 | # augment_steps = smooth(augment_steps, window) 58 | except: 59 | augment_steps = np.zeros(total_timesteps.shape) 60 | ax[1].plot(smooth(total_timesteps, window), smooth(augment_steps, window), label=log_path) 61 | if option == 'success_rate': 62 | ax[0].set_title('ep reward mean') 63 | elif option == 'eval': 64 | ax[0].set_title('eval success rate') 65 | elif option == 'entropy': 66 | ax[0].set_title('entropy') 67 | elif option == 'aug_ratio': 68 | ax[0].set_title('aug success episode / total success episode') 69 | elif option == 'self_aug_ratio': 70 | ax[0].set_title('self_aug_ratio') 71 | ax[1].set_title('augment steps / original rollout steps') 72 | ax[0].grid() 73 | ax[1].grid() 74 | plt.legend() 75 | plt.show() 76 | 77 | 
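The plotting scripts above read per-run ``progress.csv`` and ``eval.csv`` files. A minimal sketch of the two-column ``eval.csv`` layout they expect (the header matches what ``utils/log_utils.log_eval`` writes further below); the numeric rows here are illustrative only:

```python
# Illustrative sketch (values are made up): eval.csv as written by
# utils/log_utils.log_eval and read back by plot_compare.py / plot_compare_sac.py.
import pandas

with open('eval.csv', 'w') as f:
    f.write('n_updates,mean_eval_reward\n')  # header row written by log_eval
    f.write('1,0.05\n')                      # one row per evaluation
    f.write('2,0.10\n')

data = pandas.read_csv('eval.csv', index_col=None, comment='#')
n_updates = data['n_updates'].values                 # x-axis used by the plot scripts
mean_eval_reward = data['mean_eval_reward'].values   # y-axis (success rate)
```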
-------------------------------------------------------------------------------- /assets/fetch/push_wall_heavy_double_obstacle.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 42 | 43 | 44 | 45 | 46 | 47 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /plot/plot_compare_sac.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import numpy as np 3 | import pandas 4 | import matplotlib.pyplot as plt 5 | from scipy import interpolate 6 | 7 | 8 | if __name__ == '__main__': 9 | option = sys.argv[1] 10 | log_paths = sys.argv[2:] 11 | assert option in ['success_rate', 'eval', 'entropy', 'aug_ratio', 'self_aug_ratio'] 12 | window = 20 if option == 'eval' else 100 13 | def get_item(log_file, label): 14 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True) 15 | return data[label].values 16 | def smooth(array, window): 17 | out = np.zeros(array.shape[0] - window) 18 | for i in range(out.shape[0]): 19 | out[i] = np.mean(array[i:i + window]) 20 | return out 21 | fig, ax = plt.subplots(1, 1, figsize=(5, 5)) 22 | for log_path in log_paths: 23 | progress_file = os.path.join(log_path, 'progress.csv') 24 | eval_file = os.path.join(log_path, 'eval.csv') 25 | # success_rate = get_item(progress_file, 'ep_rewmean') 26 | success_rate = get_item(progress_file, 'success rate') 27 | total_timesteps = get_item(progress_file, 'total timesteps') 28 | entropy = get_item(progress_file, 'entropy') 29 | try: 30 | eval_reward = get_item(eval_file, 'mean_eval_reward') 31 | n_updates = get_item(eval_file, 'n_updates') 32 | except: 33 | pass 34 | # success_rate = smooth(success_rate, window) 35 | # total_timesteps = smooth(total_timesteps, window) 36 | if option == 'success_rate': 37 | ax.plot(smooth(total_timesteps, window), smooth(success_rate, window), label=log_path) 38 | elif option == 'eval': 39 | # ax[0].plot(n_updates*65536, eval_reward, label=log_path) 40 | try: 41 | original_steps = get_item(progress_file, 'original_timesteps') 42 | step_expand_fn = interpolate.interp1d(original_steps, total_timesteps, fill_value="extrapolate") 43 | n_updates = step_expand_fn(n_updates) 44 | except: 45 | pass 46 | ax.plot(smooth(n_updates, window), smooth(eval_reward, window), label=log_path) 47 | # ax[0].plot(smooth(total_timesteps[n_updates-1], window), smooth(eval_reward, window), label=log_path) 48 | elif option == 'entropy': 49 | ax.plot(smooth(total_timesteps, window), smooth(entropy, window), label=log_path) 50 | elif option == 'aug_ratio': 51 | original_success = get_item(progress_file, 'original_success') 52 | total_success = get_item(progress_file, 'total_success') 53 | aug_ratio = (total_success - original_success) / (total_success + 1e-8) 54 | print(total_timesteps.shape, aug_ratio.shape) 55 | ax.plot(smooth(total_timesteps, 2), smooth(aug_ratio, 2), label=log_path) 56 | elif option == 'self_aug_ratio': 57 | self_aug_ratio = get_item(progress_file, 'self_aug_ratio') 58 | ax.plot(smooth(total_timesteps, window), smooth(self_aug_ratio, window), label=log_path) 59 | ''' 60 | try: 61 | original_steps = get_item(progress_file, 'original_timesteps')[0] 62 | augment_steps = get_item(progress_file, 'augment_steps') / original_steps 63 | # 
augment_steps = smooth(augment_steps, window) 64 | except: 65 | augment_steps = np.zeros(total_timesteps.shape) 66 | ax[1].plot(smooth(total_timesteps, window), smooth(augment_steps, window), label=log_path) 67 | ''' 68 | if option == 'success_rate': 69 | ax.set_title('ep reward mean') 70 | elif option == 'eval': 71 | ax.set_title('eval success rate') 72 | elif option == 'entropy': 73 | ax.set_title('entropy') 74 | elif option == 'aug_ratio': 75 | ax.set_title('aug success episode / total success episode') 76 | elif option == 'self_aug_ratio': 77 | ax.set_title('self_aug_ratio') 78 | # ax[1].set_title('augment steps / original rollout steps') 79 | ax.grid() 80 | # ax[1].grid() 81 | plt.legend() 82 | plt.show() 83 | 84 | -------------------------------------------------------------------------------- /assets/fetch/pick_and_place_box.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /utils/log_utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | 4 | import numpy as np 5 | from stable_baselines import logger 6 | 7 | 8 | def eval_model(eval_env, model): 9 | env = eval_env 10 | if hasattr(env.unwrapped, 'random_ratio'): 11 | assert abs(env.unwrapped.random_ratio) < 1e-4 12 | n_episode = 0 13 | ep_rewards = [] 14 | ep_successes = [] 15 | while n_episode < 20: 16 | ep_reward = 0.0 17 | ep_success = 0.0 18 | obs = env.reset() 19 | goal_dim = env.goal.shape[0] 20 | if goal_dim > 3: 21 | while (np.argmax(obs[-goal_dim + 3:]) != 0): 22 | obs = env.reset() 23 | done = False 24 | while not done: 25 | action, _ = model.predict(obs) 26 | obs, reward, done, info = env.step(action) 27 | ep_reward += reward 28 | ep_success += info['is_success'] 29 | ep_rewards.append(ep_reward) 30 | ep_successes.append(ep_success) 31 | n_episode += 1 32 | return np.mean(ep_successes) 33 | 34 | 35 | def log_eval(num_update, mean_eval_reward, file_name='eval.csv'): 36 | if not os.path.exists(os.path.join(logger.get_dir(), file_name)): 37 | with open(os.path.join(logger.get_dir(), file_name), 'a', newline='') as csvfile: 38 | csvwriter = csv.writer(csvfile, delimiter=',', quotechar=',', quoting=csv.QUOTE_MINIMAL) 39 | title = ['n_updates', 'mean_eval_reward'] 40 | csvwriter.writerow(title) 41 | with open(os.path.join(logger.get_dir(), file_name), 'a', newline='') as csvfile: 42 | csvwriter = csv.writer(csvfile, delimiter=',', quotechar=',', quoting=csv.QUOTE_MINIMAL) 43 | data = [num_update, mean_eval_reward] 44 | csvwriter.writerow(data) 45 | 46 | 47 | def stack_eval_model(eval_env, model, init_on_table=False): 48 | env = eval_env 49 | env.unwrapped.random_ratio = 0.0 50 | if init_on_table: 51 | env.unwrapped.task_array = [(env.n_object, i) for i in range(min(2, env.n_object))] 52 | else: 53 | env.unwrapped.task_array = [(env.n_object, i) for i in range(env.n_object)] 54 | assert abs(env.unwrapped.random_ratio) < 1e-4 55 | n_episode = 0 56 | ep_rewards = [] 57 | ep_successes = [] 58 | while n_episode < 20: 59 | ep_reward = 0.0 60 | ep_success = 0.0 61 | obs = env.reset() 62 | while env.current_nobject != env.n_object or (hasattr(env, 'task_mode') and env.task_mode != 1): 63 | obs = env.reset() 
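# The while-loop above keeps resetting until the eval env spawns all n_object
# objects and, when the attribute exists, task_mode == 1, so the success rate
# returned below is measured only on the full stacking configuration,
# presumably the hardest evaluation case.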
64 | done = False 65 | while not done: 66 | action, _ = model.predict(obs) 67 | obs, reward, done, info = env.step(action) 68 | ep_reward += reward 69 | ep_success += info['is_success'] 70 | ep_rewards.append(ep_reward) 71 | ep_successes.append(ep_success) 72 | n_episode += 1 73 | return np.mean(ep_successes) 74 | 75 | 76 | def egonav_eval_model(eval_env, model, random_ratio=0.0, goal_idx=3, fixed_goal=None): 77 | env = eval_env 78 | if hasattr(env.unwrapped, 'random_ratio'): 79 | env.unwrapped.random_ratio = random_ratio 80 | n_episode = 0 81 | ep_rewards = [] 82 | ep_successes = [] 83 | while n_episode < 20: 84 | ep_reward = 0.0 85 | ep_success = 0.0 86 | obs = env.reset() 87 | goal_dim = env.goal.shape[0] 88 | if fixed_goal is not None: 89 | env.unwrapped.goal = fixed_goal.copy() 90 | obs = env.get_obs() 91 | obs = np.concatenate([obs[key] for key in ['observation', 'achieved_goal', 'desired_goal']]) 92 | else: 93 | if goal_dim > 3: 94 | while np.argmax(obs[-goal_dim + 3:]) != goal_idx: 95 | obs = env.reset() 96 | done = False 97 | while not done: 98 | action, _ = model.predict(obs) 99 | obs, reward, done, info = env.step(action) 100 | ep_reward += reward 101 | ep_success += info['is_success'] 102 | ep_rewards.append(ep_reward) 103 | ep_successes.append(ep_success) 104 | n_episode += 1 105 | return np.mean(ep_successes) 106 | -------------------------------------------------------------------------------- /assets/masspoint/single_obstacle2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /assets/fetch/open_close_box.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /plot/plot_walltime.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas 3 | import numpy as np 4 | import sys, os 5 | from scipy import interpolate 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def get_item(log_file, label): 10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True) 11 | return data[label].values 12 | 13 | 14 | def smooth(array, window): 15 | out = np.zeros(array.shape[0] - window) 16 | for i in range(out.shape[0]): 17 | out[i] = np.nanmean(array[i:i + window]) 18 | return out 19 | 20 | 21 | if __name__ == '__main__': 22 | folder_name = sys.argv[1] 23 | env_name = sys.argv[2] 24 | assert env_name in ['particle'] 25 | max_timesteps = {'umaze': 1e5, 'maze_ego': 2.5e7, 'maze_box': 4.9e7} 26 | df_walltime, df_sr, df_eval, df_legend = [], [], [], [] 27 | # df_timesteps, df_sr, df_eval, df_legend, df_iteration, df_success_rate_iteration, df_legend_iteration = [], [], [], [], [], [], [] 28 | subfolders = ['sac', 'ppo'] 29 | if env_name == "particle": 30 | for subfolder in subfolders: 31 | last_sr = [] 32 | for i in range(3): 33 | if not 
os.path.exists(os.path.join(folder_name, subfolder, str(i), 'progress.csv')): 34 | continue 35 | progress_file = os.path.join(folder_name, subfolder, str(i), 'progress.csv') 36 | eval_file = os.path.join(folder_name, subfolder, str(i), 'eval.csv') 37 | raw_walltime = get_item(progress_file, 'time_elapsed') 38 | raw_success_rate = get_item(progress_file, 'ep_success_rate') if subfolder == "ppo" else get_item(progress_file, 'success rate') 39 | if subfolder == "ppo": 40 | raw_walltime = raw_walltime[:560] 41 | raw_success_rate = raw_success_rate[:560] 42 | sr_f = interpolate.interp1d(raw_walltime, raw_success_rate, bounds_error=False) 43 | wall_time = np.arange(0, 3.3e5, 3.3e5 // 250) 44 | success_rate = sr_f(wall_time) 45 | print(wall_time[0], wall_time[-1], raw_walltime[0], raw_walltime[-1]) 46 | 47 | if subfolder == "ppo": 48 | print(len(wall_time)) 49 | print(success_rate[-10:]) 50 | wall_time = smooth(wall_time, 10) 51 | success_rate = smooth(success_rate, 10) 52 | # eval_reward = smooth(eval_reward, 20) 53 | df_walltime.append(wall_time) 54 | df_sr.append(success_rate) 55 | last_sr.append(success_rate[-1]) 56 | # df_eval.append(eval_reward) 57 | df_legend.append(np.array([subfolder.upper()] * len(wall_time))) 58 | 59 | print(subfolder, np.mean(last_sr)) 60 | 61 | # df_timesteps = np.concatenate(df_timesteps, axis=0).tolist() 62 | df_walltime = np.concatenate(df_walltime, axis=0).tolist() 63 | df_sr = np.concatenate(df_sr, axis=0).tolist() 64 | # df_eval = np.concatenate(df_eval, axis=0).tolist() 65 | df_legend = np.concatenate(df_legend, axis=0).tolist() 66 | data = {'wall time': df_walltime, 'success_rate': df_sr, 'algo': df_legend} 67 | sr_walltime = pandas.DataFrame(data) 68 | 69 | wspace = .3 70 | bottom = .3 71 | margin = .1 72 | left = .15 73 | width = 1.5 / ((1. - left) / (2 + wspace + margin / 2)) 74 | height = 1.5 / ((1. - bottom) / (1 + margin / 2)) 75 | 76 | plt.style.use("ggplot") 77 | # plt.rcParams.update({'legend.fontsize': 14}) 78 | p = sns.color_palette() 79 | sns.set_palette([p[i] for i in range(len(subfolders))]) 80 | f, axes = plt.subplots(1, 1, figsize=(width, height)) 81 | sns.lineplot(x='wall time', y='success_rate', hue='algo', ax=axes, data=sr_walltime) 82 | axes.set_xlabel('wall time') 83 | axes.set_ylabel('success_rate') 84 | axes.xaxis.get_major_formatter().set_powerlimits((0, 1)) 85 | axes.get_legend().remove() 86 | 87 | handles, labels = axes.get_legend_handles_labels() 88 | 89 | f.legend(handles[:], ['SAC', 'PPO'], loc="lower right", ncol=1, bbox_to_anchor=(0.99, 0.18), title='') 90 | f.subplots_adjust(top=1. - margin / height, bottom=0.21, wspace=wspace, left=left, right=1. 
- margin / width) 91 | plt.savefig(os.path.join(folder_name, '../', os.path.basename(folder_name) + env_name + 'walltime.pdf')) 92 | plt.show() 93 | -------------------------------------------------------------------------------- /plot/plot_success_traj.py: -------------------------------------------------------------------------------- 1 | import sys, pandas, os, imageio 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from mpl_toolkits.mplot3d import Axes3D 5 | 6 | 7 | def get_item(log_file, label): 8 | data = pandas.read_csv(log_file, index_col=None, comment='#') 9 | return data[label].values 10 | 11 | if __name__ == '__main__': 12 | if len(sys.argv) < 2: 13 | print('Usage: python plot_success_traj.py [csv_name]') 14 | exit() 15 | fname = sys.argv[1] 16 | dones = get_item(fname, 'done') 17 | gripper_xs = get_item(fname, 'gripper_x') 18 | gripper_ys = get_item(fname, 'gripper_y') 19 | gripper_zs = get_item(fname, 'gripper_z') 20 | box_xs = get_item(fname, 'box_x') 21 | box_ys = get_item(fname, 'box_y') 22 | box_zs = get_item(fname, 'box_z') 23 | obstacle_xs = get_item(fname, 'obstacle_x') 24 | obstacle_ys = get_item(fname, 'obstacle_y') 25 | obstacle_zs = get_item(fname, 'obstacle_z') 26 | obstacle1_xs = get_item(fname, 'obstacle1_x') 27 | obstacle1_ys = get_item(fname, 'obstacle1_y') 28 | obstacle1_zs = get_item(fname, 'obstacle1_z') 29 | goals = [] 30 | for i in range(6): 31 | goals.append(get_item(fname, 'goal_' + str(i))) 32 | goals = np.asarray(goals) 33 | goals = np.swapaxes(goals, 0, 1) 34 | end_points = np.where(dones > 0.5)[0] 35 | print('#episodes', len(end_points)) 36 | for i in end_points: 37 | assert np.argmax(goals[i][3:]) == 0 38 | # print(goals[i]) 39 | ''' 40 | _print_end_points = np.random.choice(end_points[:len(end_points) // 100], size=20) 41 | _print_end_points2 = np.random.choice(end_points[len(end_points) // 100 * 99:], size=20) 42 | _print_end_points3 = np.random.choice(end_points[len(end_points) // 100 * 50: len(end_points) // 10 * 51], size=20) 43 | print('first percentile') 44 | for i in _print_end_points: 45 | print(i, goals[i]) 46 | print('50 percentile') 47 | for i in _print_end_points2: 48 | print(i, goals[i]) 49 | print('last percentile') 50 | for i in _print_end_points3: 51 | print(i, goals[i]) 52 | ''' 53 | # print(goals[:, 3:]) 54 | 55 | ep_idx = 0 56 | step = 0 57 | has_switch = False 58 | fig = plt.figure() 59 | ax = fig.add_subplot(111) 60 | for i in range(end_points[5]): 61 | ax.cla() 62 | # ax.set_xlim(1.0, 1.6) 63 | # ax.set_ylim(0.4, 1.1) 64 | ax.set_xlim(0.0, 5.0) 65 | ax.set_ylim(0.0, 5.0) 66 | # ax.set_zlim(0, 1.2) 67 | ax.set_xlabel('x') 68 | ax.set_ylabel('y') 69 | ax.scatter(gripper_xs[i], gripper_ys[i], c='tab:gray') 70 | ax.scatter(box_xs[i], box_ys[i], c='tab:blue') 71 | ax.scatter(obstacle_xs[i], obstacle_ys[i], c='tab:brown') 72 | ax.scatter(obstacle1_xs[i], obstacle1_ys[i], c='#ff00ff') 73 | ax.plot([1.5, 1.5, 1.8, 1.8], [0.0, 2.0, 2.0, 0.0], 'tab:gray') 74 | ax.plot([1.5, 1.5, 1.8, 1.8], [5.0, 3.0, 3.0, 5.0], 'tab:gray') 75 | ax.plot([3.5, 3.5, 3.2, 3.2], [0.0, 2.0, 2.0, 0.0], 'tab:gray') 76 | ax.plot([3.5, 3.5, 3.2, 3.2], [5.0, 3.0, 3.0, 5.0], 'tab:gray') 77 | if not has_switch and np.argmax(goals[i][3:]) == 1: 78 | print('episode %d switch step %d' % (ep_idx, step)) 79 | print('restart box', box_xs[i], box_ys[i], 'subgoal', goals[i]) 80 | has_switch = True 81 | if np.argmax(goals[i][3:]) == 0: 82 | marker = '*' 83 | else: 84 | marker = '^' 85 | ax.scatter(goals[i][0], goals[i][1], c='tab:red', marker=marker) 86 | 
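# Marker convention in the plot above: '*' is drawn when the goal one-hot
# (goals[i][3:]) selects index 0, i.e. the ultimate goal asserted at episode
# ends; '^' marks any other index, which the 'switch' printout treats as a
# subgoal.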
ax.set_title('episode %d step %d' % (ep_idx, step)) 87 | step += 1 88 | if dones[i] > 0.5: 89 | assert np.argmax(goals[i][3:]) == 0 90 | print('ultimate goal', goals[i]) 91 | ep_idx += 1 92 | step = 0 93 | has_switch = False 94 | plt.savefig('tempimg' + str(i) + '.png') 95 | plt.pause(0.1) 96 | os.system('ffmpeg -r 2 -start_number 0 -i tempimg%d.png -c:v libx264 -pix_fmt yuv420p ' + 97 | os.path.join(os.path.dirname(fname), 'augment_data.mp4')) 98 | # images = [] 99 | for i in range(end_points[5]): 100 | # images.append(plt.imread('tempimg' + str(i) + '.png')) 101 | os.remove('tempimg' + str(i) + '.png') 102 | # imageio.mimsave('augment_data.gif', images) 103 | -------------------------------------------------------------------------------- /assets/fetch/shared.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /plot/plot_experiment_success_len.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas 3 | import numpy as np 4 | import sys, os 5 | from scipy import interpolate 6 | import matplotlib.pyplot as plt 7 | from stable_baselines.bench.monitor import load_results 8 | 9 | def get_item(log_file, label): 10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True) 11 | return data[label].values 12 | 13 | 14 | def smooth(array, window): 15 | out = np.zeros(array.shape[0] - window) 16 | for i in range(out.shape[0]): 17 | out[i] = np.mean(array[i:i + window]) 18 | return out 19 | 20 | 21 | if __name__ == '__main__': 22 | folder_name = sys.argv[1] 23 | env_name = sys.argv[2] 24 | assert env_name in ['push', 'particle', 'maze', 'stacking'] 25 | alg = sys.argv[3] 26 | assert alg in ['ppo', 'sac'] 27 | # assert mode in ['train', 'hard', 'iteration'] 28 | if alg == 'ppo': 29 | max_timesteps = {'push': 4.99e7, 30 | 'particle': 2.5e8, 31 | 'maze': 1.5e6, 32 | 'stacking': 2e8,} 33 | elif alg == 'sac': 34 | max_timesteps = {'push': 1.36e7,} 35 | # max_iterationss = {'push': 750, 36 | # 'particle': 510, 37 | # 'maze': 245,} 38 | # df_timesteps, df_sr, df_eval, df_legend, df_iteration, df_eval_iteration, df_legend_iteration = [], [], [], [], [], [], [] 39 | df_iteration, df_len_mean, df_legend_iteration = [], [], [] 40 | subfolders = [alg, 'sir', 'sil'] 41 | if 'particle_random0.7' in folder_name: 42 | subfolders = ['ppo', 'sir', 'sil'] 43 | elif 'particle_random1.0' in folder_name: 44 | subfolders = ['ppo', 'sir', 'sil'] 45 | elif 'maze' in folder_name: 46 | subfolders = ['ppo', 'sir_re2'] 47 | for subfolder in subfolders: 48 | last_success_len = [] 49 | for i in range(3): 50 | if not os.path.exists(os.path.join(folder_name, subfolder, str(i), '0.monitor.csv')): 51 | continue 52 | monitor_df = load_results(os.path.join(folder_name, subfolder, str(i))) 53 | raw_len = monitor_df.l 54 | raw_success = monitor_df.is_success 55 | cum_len = raw_len.cumsum() 56 | masked_len = smooth(raw_len[raw_success > 0.5].values, 100) 57 | masked_cum_len = smooth(cum_len[raw_success > 0.5].values, 100) 58 | success_len_f = interpolate.interp1d(masked_cum_len, masked_len, 
fill_value="extrapolate") 59 | print(masked_cum_len[-1], max_timesteps[env_name]) 60 | timesteps = np.arange(0, max_timesteps[env_name], max_timesteps[env_name] // 500) 61 | success_len = success_len_f(timesteps) 62 | # iterations = timesteps / timesteps[-1] * max_iterationss[env_name] 63 | # iterations = smooth(iterations, 20) 64 | timesteps = smooth(timesteps, 20) 65 | success_len = smooth(success_len, 20) 66 | 67 | last_success_len.append(success_len[-1]) 68 | 69 | df_iteration.append(timesteps) 70 | df_len_mean.append(success_len) 71 | df_legend_iteration.append(np.array([subfolder.upper()] * len(timesteps))) 72 | assert len(timesteps) == len(success_len) 73 | print(subfolder, np.mean(last_success_len)) 74 | df_iteration = np.concatenate(df_iteration, axis=0).tolist() 75 | df_len_mean = np.concatenate(df_len_mean, axis=0).tolist() 76 | df_legend_iteration = np.concatenate(df_legend_iteration, axis=0).tolist() 77 | data = {'timesteps': df_iteration, 'len_mean': df_len_mean, 'algo': df_legend_iteration} 78 | len_mean_iteration = pandas.DataFrame(data) 79 | 80 | wspace = .3 81 | bottom = .3 82 | margin = .1 83 | left = .18 84 | width = 1.2 / ((1. - left) / (2. + wspace + margin / 2)) 85 | height = 1.5 / ((1. - bottom) / (1 + margin / 2)) 86 | 87 | plt.style.use("ggplot") 88 | # plt.rcParams.update({'legend.fontsize': 14}) 89 | p = sns.color_palette() 90 | sns.set_palette([p[i] for i in range(len(subfolders))]) 91 | f, axes = plt.subplots(1, 1, figsize=(width, height)) 92 | sns.lineplot(x='timesteps', y='len_mean', hue='algo', ax=axes, data=len_mean_iteration) 93 | axes.set_xlabel('timesteps') 94 | axes.set_ylabel('episode length') 95 | axes.get_legend().remove() 96 | handles, labels = axes.get_legend_handles_labels() 97 | f.legend(handles[1:], [alg.upper(), 'SIR', 'SIL'], loc="upper right", ncol=1, title='') 98 | f.subplots_adjust(top=1. - margin / height, bottom=0.2, wspace=wspace, left=left, right=1. - margin / width) 99 | plt.savefig(os.path.join(folder_name, '../', os.path.basename(folder_name) + '_successlen.pdf')) 100 | # plt.show() 101 | -------------------------------------------------------------------------------- /assets/masspoint/double_obstacle.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /baselines/her/utils.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import numpy as np 4 | from gym import spaces 5 | from stable_baselines.common.vec_env import VecEnv 6 | 7 | # Important: gym mixes up ordered and unordered keys 8 | # and the Dict space may return a different order of keys that the actual one 9 | KEY_ORDER = ['observation', 'achieved_goal', 'desired_goal'] 10 | 11 | 12 | class HERGoalEnvWrapper(object): 13 | """ 14 | A wrapper that allow to use dict observation space (coming from GoalEnv) with 15 | the RL algorithms. 16 | It assumes that all the spaces of the dict space are of the same type. 
17 | 18 | :param env: (gym.GoalEnv) 19 | """ 20 | 21 | def __init__(self, env): 22 | super(HERGoalEnvWrapper, self).__init__() 23 | self.env = env 24 | self.metadata = self.env.metadata 25 | self.action_space = env.action_space 26 | self.spaces = list(env.observation_space.spaces.values()) 27 | # Check that all spaces are of the same type 28 | # (current limitation of the wrapper) 29 | space_types = [type(env.observation_space.spaces[key]) for key in KEY_ORDER] 30 | assert len(set(space_types)) == 1, "The spaces for goal and observation"\ 31 | " must be of the same type" 32 | 33 | if isinstance(self.spaces[0], spaces.Discrete): 34 | self.obs_dim = 1 35 | self.goal_dim = 1 36 | else: 37 | goal_space_shape = env.observation_space.spaces['achieved_goal'].shape 38 | self.obs_dim = env.observation_space.spaces['observation'].shape[0] 39 | self.goal_dim = goal_space_shape[0] 40 | 41 | if len(goal_space_shape) == 2: 42 | assert goal_space_shape[1] == 1, "Only 1D observation spaces are supported yet" 43 | else: 44 | assert len(goal_space_shape) == 1, "Only 1D observation spaces are supported yet" 45 | 46 | 47 | if isinstance(self.spaces[0], spaces.MultiBinary): 48 | total_dim = self.obs_dim + 2 * self.goal_dim 49 | self.observation_space = spaces.MultiBinary(total_dim) 50 | 51 | elif isinstance(self.spaces[0], spaces.Box): 52 | lows = np.concatenate([space.low for space in self.spaces]) 53 | highs = np.concatenate([space.high for space in self.spaces]) 54 | self.observation_space = spaces.Box(lows, highs, dtype=np.float32) 55 | 56 | elif isinstance(self.spaces[0], spaces.Discrete): 57 | dimensions = [env.observation_space.spaces[key].n for key in KEY_ORDER] 58 | self.observation_space = spaces.MultiDiscrete(dimensions) 59 | 60 | else: 61 | raise NotImplementedError("{} space is not supported".format(type(self.spaces[0]))) 62 | 63 | if isinstance(self.env, VecEnv): 64 | self.reward_type = self.env.get_attr('reward_type')[0] 65 | else: 66 | self.reward_type = self.env.reward_type 67 | 68 | 69 | def convert_dict_to_obs(self, obs_dict): 70 | """ 71 | :param obs_dict: (dict) 72 | :return: (np.ndarray) 73 | """ 74 | # Note: achieved goal is not removed from the observation 75 | # this is helpful to have a revertible transformation 76 | if isinstance(self.observation_space, spaces.MultiDiscrete): 77 | # Special case for multidiscrete 78 | return np.concatenate([[int(obs_dict[key])] for key in KEY_ORDER]) 79 | return np.concatenate([obs_dict[key] for key in KEY_ORDER], axis=-1) 80 | 81 | def convert_obs_to_dict(self, observations): 82 | """ 83 | Inverse operation of convert_dict_to_obs 84 | 85 | :param observations: (np.ndarray) 86 | :return: (OrderedDict) 87 | """ 88 | return OrderedDict([ 89 | ('observation', observations[...,:self.obs_dim]), 90 | ('achieved_goal', observations[...,self.obs_dim:self.obs_dim + self.goal_dim]), 91 | ('desired_goal', observations[...,self.obs_dim + self.goal_dim:]), 92 | ]) 93 | 94 | def step(self, action): 95 | obs, reward, done, info = self.env.step(action) 96 | return self.convert_dict_to_obs(obs), reward, done, info 97 | 98 | def seed(self, seed=None): 99 | return self.env.seed(seed) 100 | 101 | def reset(self): 102 | return self.convert_dict_to_obs(self.env.reset()) 103 | 104 | def compute_reward(self, achieved_goal, desired_goal, info, indices=None): 105 | if isinstance(self.env, VecEnv): 106 | return self.env.env_method('compute_reward', achieved_goal, desired_goal, info, indices=indices) 107 | return self.env.compute_reward(achieved_goal, desired_goal, info) 108 | 
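    # Layout of the flattened observation used throughout this wrapper:
    #   [observation | achieved_goal | desired_goal]
    # e.g. with obs_dim=10 and goal_dim=3 (illustrative numbers only, not tied
    # to any particular env), convert_obs_to_dict(x) returns an OrderedDict
    #   {'observation': x[..., :10],
    #    'achieved_goal': x[..., 10:13],
    #    'desired_goal': x[..., 13:]}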
109 | def compute_reward_and_success(self, achieved_goal, desired_goal, info, indices=None): 110 | if isinstance(self.env, VecEnv): 111 | return self.env.env_method('compute_reward_and_success', achieved_goal, desired_goal, info, indices=indices) 112 | return self.env.compute_reward_and_success(achieved_goal, desired_goal, info) 113 | 114 | def render(self, mode='human'): 115 | return self.env.render(mode) 116 | 117 | def close(self): 118 | return self.env.close() 119 | -------------------------------------------------------------------------------- /assets/masspoint/emaze_easy.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /assets/masspoint/generate_xml.py: -------------------------------------------------------------------------------- 1 | template = ''' 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | {scene} 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | {obstacles} 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 47 | 48 | 49 | ''' 50 | 51 | scene_template = ''' 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | {walls} 62 | ''' 63 | 64 | 65 | def generate_xml(num_obstacles): 66 | wall_template = '' 67 | walls = [] 68 | for i in range(num_obstacles): 69 | walls.append(wall_template.format(**dict(id=2 * i, pos=str(1.7 * (i + 1)) + ' 1.0 0.25'))) 70 | walls.append(wall_template.format(**dict(id=2 * i + 1, pos=str(1.7 * (i + 1)) + ' 4.0 0.25'))) 71 | scene = scene_template.format(**dict(bound0_pos=str(1.7 * (num_obstacles + 1) / 2) + ' -0.2 0.25', 72 | bound1_pos=str(1.7 * (num_obstacles + 1) / 2) + ' 5.2 0.25', 73 | bound2_pos='-0.2 2.5 0.25', 74 | bound3_pos=str(1.7 * (num_obstacles + 1) + 0.2) + ' 2.5 0.25', 75 | bound_v_size=str(1.7 * (num_obstacles + 1) / 2) + ' 0.2 0.25', 76 | bound_h_size='0.2 2.9 0.25', 77 | walls="\n".join(walls))) 78 | obstacle_template = ''' 79 | 80 | 81 | 82 | 83 | 84 | 85 | ''' 86 | obstacles = [obstacle_template.format(**dict(id=i + 1)) for i in range(num_obstacles)] 87 | xml = template.format(**dict(scene=scene, obstacles="\n".join(obstacles))) 88 | return xml 89 | -------------------------------------------------------------------------------- /run_ppo_augment.py: -------------------------------------------------------------------------------- 1 | from baselines import PPO2_SIR 2 | from stable_baselines import logger 3 | from stable_baselines.common import set_global_seeds 4 | from stable_baselines.common.vec_env import SubprocVecEnv 5 | 6 | from utils.log_utils import eval_model, log_eval, stack_eval_model 7 | from utils.parallel_subproc_vec_env import ParallelSubprocVecEnv 8 | from stable_baselines.common.policies import register_policy 9 | 10 | from utils.make_env_utils import configure_logger, make_env, get_num_workers, get_env_kwargs, get_train_kwargs, \ 11 | get_policy_kwargs 12 | 13 | import os, time, argparse 14 | 15 | 16 | def arg_parse(): 17 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 18 | parser.add_argument('--env', default='FetchPushWallObstacle-v4') 19 | parser.add_argument('--policy', type=str, default='MlpPolicy') 20 | parser.add_argument('--seed', type=int, default=42) 21 | parser.add_argument('--num_timesteps', type=float, default=1e8) 22 | parser.add_argument('--log_path', default=None, type=str) 23 | parser.add_argument('--load_path', default=None, type=str) 24 | parser.add_argument('--random_ratio', 
default=1.0, type=float) 25 | parser.add_argument('--aug_clip', default=0.1, type=float) 26 | parser.add_argument('--aug_adv_weight', default=1.0, type=float) 27 | parser.add_argument('--n_subgoal', default=4, type=int) 28 | parser.add_argument('--parallel', action="store_true", default=False) 29 | parser.add_argument('--self_imitate', action="store_true", default=False) 30 | parser.add_argument('--sil_clip', default=0.2, type=float) 31 | parser.add_argument('--start_augment', type=float, default=0) 32 | parser.add_argument('--reuse_times', default=1, type=int) 33 | parser.add_argument('--reward_type', default="sparse", type=str) 34 | parser.add_argument('--n_object', default=2, type=int) 35 | parser.add_argument('--curriculum', action="store_true", default=False) 36 | parser.add_argument('--sequential', action="store_true", default=False) 37 | parser.add_argument('--play', action="store_true", default=False) 38 | parser.add_argument('--export_gif', action="store_true", default=False) 39 | parser.add_argument('--log_trace', action="store_true", default=False) 40 | args = parser.parse_args() 41 | return args 42 | 43 | 44 | def main(args): 45 | log_dir = args.log_path if (args.log_path is not None) else \ 46 | "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S') 47 | configure_logger(log_dir) 48 | 49 | set_global_seeds(args.seed) 50 | 51 | n_cpu = get_num_workers(args.env) if not args.play else 1 52 | 53 | env_kwargs = get_env_kwargs(args.env, args.random_ratio, args.sequential, args.reward_type, 54 | args.n_object, args.curriculum) 55 | 56 | def make_thunk(rank): 57 | return lambda: make_env(env_id=args.env, rank=rank, log_dir=log_dir, flatten_dict=True, kwargs=env_kwargs) 58 | 59 | env = SubprocVecEnv([make_thunk(i) for i in range(n_cpu)]) 60 | 61 | aug_env_name = args.env.split('-')[0] + 'Unlimit-' + args.env.split('-')[1] 62 | aug_env_kwargs = env_kwargs.copy() 63 | aug_env_kwargs['max_episode_steps'] = None 64 | 65 | def make_thunk_aug(rank): 66 | return lambda: make_env(env_id=aug_env_name, rank=rank, flatten_dict=True, kwargs=aug_env_kwargs) 67 | 68 | if not args.parallel: 69 | aug_env = make_env(env_id=aug_env_name, rank=0, flatten_dict=True, kwargs=aug_env_kwargs) 70 | else: 71 | aug_env = ParallelSubprocVecEnv([make_thunk_aug(i) for i in range(min(32, n_cpu))], reset_when_done=False) 72 | print(aug_env) 73 | 74 | if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')): 75 | os.remove(os.path.join(logger.get_dir(), 'eval.csv')) 76 | print('Remove existing eval.csv') 77 | eval_env_kwargs = env_kwargs.copy() 78 | eval_env_kwargs['random_ratio'] = 0.0 79 | if "use_cu" in eval_env_kwargs: 80 | eval_env_kwargs['use_cu'] = False 81 | eval_env = make_env(env_id=args.env, rank=0, flatten_dict=True, kwargs=eval_env_kwargs) 82 | print(eval_env) 83 | 84 | if not args.play: 85 | os.makedirs(log_dir, exist_ok=True) 86 | 87 | from utils.attention_policy import AttentionPolicy 88 | register_policy('AttentionPolicy', AttentionPolicy) 89 | 90 | policy_kwargs = get_policy_kwargs("ppo_sir", args) 91 | 92 | train_kwargs = get_train_kwargs("ppo_sir", args, parsed_action_noise=None, eval_env=eval_env, aug_env=aug_env) 93 | 94 | model = PPO2_SIR(args.policy, env, verbose=1, nminibatches=32, lam=0.95, gamma=0.99, noptepochs=10, 95 | ent_coef=0.01, learning_rate=3e-4, cliprange=0.2, policy_kwargs=policy_kwargs, 96 | horizon=env_kwargs['max_episode_steps'], **train_kwargs) 97 | 98 | def callback(_locals, _globals): 99 | num_update = _locals["update"] 100 | if 'FetchStack' in args.env: 101 | 
mean_eval_reward = stack_eval_model(eval_env, _locals["self"]) 102 | else: 103 | mean_eval_reward = eval_model(eval_env, _locals["self"]) 104 | log_eval(num_update, mean_eval_reward) 105 | if num_update % 10 == 0: 106 | model_path = os.path.join(log_dir, 'model_' + str(num_update // 10)) 107 | model.save(model_path) 108 | print('model saved to', model_path) 109 | return True 110 | 111 | model.learn(total_timesteps=int(args.num_timesteps), callback=callback, seed=args.seed, log_interval=1) 112 | model.save(os.path.join(log_dir, 'final')) 113 | 114 | 115 | if __name__ == '__main__': 116 | args = arg_parse() 117 | print('arg parsed') 118 | main(args) 119 | -------------------------------------------------------------------------------- /utils/replay_buffer.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.deepq import ReplayBuffer, PrioritizedReplayBuffer 2 | import numpy as np 3 | import random 4 | 5 | 6 | class DoublePrioritizedReplayWrapper(object): 7 | def __init__(self, buffer1, buffer2): 8 | assert isinstance(buffer1, PrioritizedReplayBuffer) 9 | assert isinstance(buffer2, PrioritizedReplayBuffer) 10 | self.buffer1 = buffer1 11 | self.buffer2 = buffer2 12 | self.min_tree_operation = buffer1._it_min._operation 13 | self.sum_tree_operation = buffer1._it_sum._operation 14 | 15 | def _sample_proportional(self, batch_size): 16 | res1, res2 = [], [] 17 | _sum1 = self.buffer1._it_sum.sum() 18 | _sum2 = self.buffer2._it_sum.sum() 19 | for i in range(batch_size): 20 | mass = random.random() * (_sum1 + _sum2) 21 | if mass < _sum1: 22 | idx = self.buffer1._it_sum.find_prefixsum_idx(mass) 23 | res1.append(idx) 24 | else: 25 | idx = self.buffer2._it_sum.find_prefixsum_idx(mass - _sum1) 26 | res2.append(idx) 27 | return res1, res2 28 | 29 | def sample(self, batch_size, beta=0): 30 | assert beta > 0 31 | 32 | idxes1, idxes2 = self._sample_proportional(batch_size) 33 | 34 | weights1, weights2 = [], [] 35 | p_min = self.min_tree_operation(self.buffer1._it_min.min(), self.buffer2._it_min.min()) / self.sum_tree_operation(self.buffer1._it_sum.sum(), self.buffer2._it_sum.sum()) 36 | max_weight = (p_min * (len(self.buffer1._storage) + len(self.buffer2._storage))) ** (-beta) 37 | 38 | for idx in idxes1: 39 | p_sample = self.buffer1._it_sum[idx] / (self.buffer1._it_sum.sum() + self.buffer2._it_sum.sum()) 40 | weight = (p_sample * (len(self.buffer1._storage) + len(self.buffer2._storage))) ** (-beta) 41 | weights1.append(weight / max_weight) 42 | for idx in idxes2: 43 | p_sample = self.buffer2._it_sum[idx] / (self.buffer1._it_sum.sum() + self.buffer2._it_sum.sum()) 44 | weight = (p_sample * (len(self.buffer1._storage) + len(self.buffer2._storage))) ** (-beta) 45 | weights2.append(weight / max_weight) 46 | 47 | weights1 = np.array(weights1) 48 | weights2 = np.array(weights2) 49 | encoded_sample1 = self.buffer1._encode_sample(idxes1) 50 | encoded_sample2 = self.buffer2._encode_sample(idxes2) 51 | return tuple(list(encoded_sample1) + [weights1, idxes1]), tuple(list(encoded_sample2) + [weights2, idxes2]) 52 | 53 | 54 | class MultiWorkerReplayBuffer(ReplayBuffer): 55 | def __init__(self, size, num_workers=1, gamma=0.99): 56 | super(MultiWorkerReplayBuffer, self).__init__(size) 57 | self.num_workers = num_workers 58 | self.gamma = gamma 59 | self.local_transitions = [[] for _ in range(self.num_workers)] 60 | 61 | def add(self, obs_t, action, reward, obs_tp1, done): 62 | assert obs_t.shape[0] == self.num_workers 63 | for i in range(self.num_workers): 64 | 
self.local_transitions[i].append([obs_t[i], action[i], reward[i], obs_tp1[i], done[i]]) 65 | if done[i]: 66 | for j in range(len(self.local_transitions[i])): 67 | super().add(*(self.local_transitions[i][j])) 68 | self.local_transitions[i] = [] 69 | 70 | 71 | class PrioritizedMultiWorkerReplayBuffer(PrioritizedReplayBuffer): 72 | def __init__(self, size, alpha, num_workers=1, gamma=0.99): 73 | super(PrioritizedMultiWorkerReplayBuffer, self).__init__(size, alpha) 74 | self.num_workers = num_workers 75 | self.gamma = gamma 76 | self.local_transitions = [[] for _ in range(self.num_workers)] 77 | self.model = None 78 | 79 | def set_model(self, model): 80 | self.model = model 81 | 82 | def add(self, obs_t, action, reward, obs_tp1, done): 83 | assert obs_t.shape[0] == self.num_workers 84 | for i in range(self.num_workers): 85 | self.local_transitions[i].append([obs_t[i], action[i], reward[i], obs_tp1[i], done[i]]) 86 | # assert len(self.local_priorities[i]) == len(self.local_transitions[i]) 87 | if done[i]: 88 | batch_obs, batch_act, batch_reward, batch_next_obs, batch_done = zip(*(self.local_transitions[i])) 89 | batch_obs, batch_act, batch_reward, batch_next_obs, batch_done = \ 90 | map(lambda v: np.asarray(v),[batch_obs, batch_act, batch_reward, batch_next_obs, batch_done]) 91 | priorities = compute_priority(self.model, batch_obs, batch_act, 92 | batch_next_obs, batch_reward, batch_done) 93 | for j in range(len(self.local_transitions[i])): 94 | p_idx = self._next_idx # The add call will change self._next_idx 95 | super().add(*(self.local_transitions[i][j])) 96 | self.update_priorities([p_idx], [priorities[j]]) 97 | self.local_transitions[i] = [] 98 | 99 | 100 | def discounted_sum(arr, gamma): 101 | arr = np.asarray(arr) 102 | return np.sum(arr * np.power(gamma, np.arange(arr.shape[0]))) 103 | 104 | 105 | def compute_priority(sac_model, batch_obs, batch_act, batch_next_obs, batch_reward, batch_done): 106 | q1, value = sac_model.sess.run([sac_model.step_ops[4], sac_model.value_target], feed_dict={ 107 | sac_model.observations_ph: batch_obs, 108 | sac_model.actions_ph: batch_act, 109 | sac_model.next_observations_ph: batch_next_obs, 110 | }) 111 | priorities = np.reshape(batch_reward, q1.shape) + ( 112 | 1 - np.reshape(batch_done, q1.shape)) * sac_model.gamma * value - q1 113 | priorities = np.squeeze(np.abs(priorities) + 1e-4, axis=-1).tolist() 114 | return priorities 115 | -------------------------------------------------------------------------------- /plot/plot_cl_experiment.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas 3 | import numpy as np 4 | import sys, os 5 | from scipy import interpolate 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def get_item(log_file, label): 10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True) 11 | return data[label].values 12 | 13 | 14 | def smooth(array, window): 15 | out = np.zeros(array.shape[0] - window) 16 | for i in range(out.shape[0]): 17 | out[i] = np.mean(array[i:i + window]) 18 | return out 19 | 20 | 21 | if __name__ == '__main__': 22 | folder_name = sys.argv[1] 23 | env_name = sys.argv[2] 24 | assert env_name in ['maze'] 25 | # assert mode in ['train', 'hard', 'iteration'] 26 | max_timesteps = {'maze': 4e6, 27 | } 28 | max_iterationss = {'maze': 245,} 29 | df_timesteps, df_sr, df_eval, df_legend, df_iteration, df_success_rate_iteration, df_legend_iteration = [], [], [], [], [], [], [] 30 | # subfolders = ['ppo_sir', 'ppo_cu', 
'goal_gan_b1000', ] # 'goal_gan_b10000' 31 | subfolders = ['goal_gan_b1000', 'goal_gan_b10000'] 32 | last_sr = [] 33 | for subfolder in subfolders: 34 | if subfolder == "ppo_cu": 35 | for i in range(3): 36 | eval_file = os.path.join(folder_name, subfolder, str(i), 'eval.csv') 37 | raw_success_rate = get_item(eval_file, 'mean_eval_reward') 38 | raw_total_timesteps = get_item(eval_file, 'n_updates') * 1000 39 | sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, bounds_error=False) 40 | timesteps = np.arange(0, max_timesteps[env_name], max_timesteps[env_name] // 245) 41 | print(timesteps[0], timesteps[-1], raw_total_timesteps[0], raw_total_timesteps[-1]) 42 | success_rate = sr_f(timesteps) 43 | # eval_reward = eval_f(timesteps) 44 | timesteps = smooth(timesteps, 10) 45 | success_rate = smooth(success_rate, 10) 46 | # eval_reward = smooth(eval_reward, 20) 47 | df_timesteps.append(timesteps) 48 | df_sr.append(success_rate) 49 | last_sr.append(success_rate[-1]) 50 | # df_eval.append(eval_reward) 51 | df_legend.append(np.array([subfolder.upper()] * len(timesteps))) 52 | else: 53 | for i in range(3): 54 | if not os.path.exists(os.path.join(folder_name, subfolder, str(i), 'progress.csv')): 55 | continue 56 | progress_file = os.path.join(folder_name, subfolder, str(i), 'progress.csv') 57 | raw_success_rate = get_item(progress_file, 'ep_reward_mean' if subfolder == "ppo_sir" or subfolder == "ppo" else 'Outer_MeanRewards') 58 | raw_total_timesteps = get_item(progress_file, 'total_timesteps' if subfolder == "ppo_sir" or subfolder == "ppo" else 'Outer_timesteps') 59 | sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, bounds_error=False) 60 | timesteps = np.arange(0, max_timesteps[env_name], max_timesteps[env_name] // 245) 61 | print(timesteps[0], timesteps[-1], raw_total_timesteps[0], raw_total_timesteps[-1]) 62 | success_rate = sr_f(timesteps) 63 | # eval_reward = eval_f(timesteps) 64 | timesteps = smooth(timesteps, 5) 65 | success_rate = smooth(success_rate, 5) 66 | # eval_reward = smooth(eval_reward, 20) 67 | df_timesteps.append(timesteps) 68 | df_sr.append(success_rate) 69 | last_sr.append(success_rate[-1]) 70 | # df_eval.append(eval_reward) 71 | df_legend.append(np.array([subfolder.upper()] * len(timesteps))) 72 | 73 | print(subfolder, np.mean(last_sr)) 74 | df_timesteps = np.concatenate(df_timesteps, axis=0).tolist() 75 | df_sr = np.concatenate(df_sr, axis=0).tolist() 76 | # df_eval = np.concatenate(df_eval, axis=0).tolist() 77 | df_legend = np.concatenate(df_legend, axis=0).tolist() 78 | 79 | data = {'samples': df_timesteps, 'success_rate': df_sr, 'algo': df_legend} 80 | sr_timesteps = pandas.DataFrame(data) 81 | # data = {'samples': df_timesteps, 'eval': df_eval, 'algo': df_legend} 82 | # eval_timesteps = pandas.DataFrame(data) 83 | 84 | wspace = .3 85 | bottom = .3 86 | margin = .1 87 | left = .15 88 | width = 1.7 / ((1. - left) / (2 + wspace + margin / 2)) 89 | height = 1.5 / ((1. 
- bottom) / (1 + margin / 2)) 90 | 91 | plt.style.use("ggplot") 92 | # plt.rcParams.update({'legend.fontsize': 14}) 93 | p = sns.color_palette() 94 | sns.set_palette([p[0], p[1], p[2], p[3]]) 95 | f, axes = plt.subplots(1, 1, figsize=(width, height)) 96 | sns.lineplot(x='samples', y='success_rate', hue='algo', ax=axes, data=sr_timesteps) 97 | axes.set_xlabel('samples') 98 | axes.set_ylabel('success_rate') 99 | axes.get_legend().remove() 100 | # sns.lineplot(x='samples', y='eval', hue='algo', ax=axes[1], data=eval_timesteps) 101 | # axes[1].set_xlabel('samples') 102 | # axes[1].set_ylabel('') 103 | # axes[1].get_legend().remove() 104 | handles, labels = axes.get_legend_handles_labels() 105 | # if mode == 'train': 106 | # sns.lineplot(x='samples', y='success_rate', hue='algo', data=sr_timesteps) 107 | # axes.set_xlabel('samples') 108 | # elif mode == 'hard': 109 | # sns.lineplot(x='samples', y='eval', hue='algo', data=eval_timesteps) 110 | # axes.set_xlabel('samples') 111 | # elif mode == 'iteration': 112 | # sns.lineplot(x='iterations', y='eval', hue='algo', ax=axes, data=eval_iteration) 113 | # axes.set_xlabel('iterations') 114 | # axes.set_ylabel('success rate') 115 | # axes.get_legend().remove() 116 | # handles, labels = axes.get_legend_handles_labels() 117 | # f.legend(handles[:], ['SIR', 'Manual CL', 'GoalGAN', 'GoalGAN_10k'], loc="lower right", ncol=1, bbox_to_anchor=(0.99, 0.18), title='') 118 | f.legend(handles[:], ['GoalGAN_1k', 'GoalGAN_10k'], loc="lower right", ncol=1, 119 | bbox_to_anchor=(0.99, 0.18), title='') 120 | f.subplots_adjust(top=1. - margin / height, bottom=0.21, wspace=wspace, left=left, right=1. - margin / width) 121 | plt.savefig(os.path.join(folder_name, '../', os.path.basename(folder_name) + '.pdf')) 122 | # plt.show() 123 | -------------------------------------------------------------------------------- /plot/plot_experiment_len.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas 3 | import numpy as np 4 | import sys, os 5 | from scipy import interpolate 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def get_item(log_file, label): 10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True) 11 | return data[label].values 12 | 13 | 14 | def smooth(array, window): 15 | out = np.zeros(array.shape[0] - window) 16 | for i in range(out.shape[0]): 17 | out[i] = np.mean(array[i:i + window]) 18 | return out 19 | 20 | 21 | if __name__ == '__main__': 22 | folder_name = sys.argv[1] 23 | env_name = sys.argv[2] 24 | assert env_name in ['push', 'particle', 'maze', 'stacking'] 25 | # assert mode in ['train', 'hard', 'iteration'] 26 | max_timesteps = {'push': 4.99e7, 27 | 'particle': 2.8e8, 28 | 'maze': 1.5e6, 29 | 'stacking': 2e8,} 30 | max_iterationss = {'push': 750, 31 | 'particle': 510, 32 | 'maze': 245,} 33 | # df_timesteps, df_sr, df_eval, df_legend, df_iteration, df_eval_iteration, df_legend_iteration = [], [], [], [], [], [], [] 34 | df_iteration, df_len_mean, df_legend_iteration = [], [], [] 35 | subfolders = ['ppo', 'sir_re8'] 36 | if 'particle_random0.7' in folder_name: 37 | subfolders = ['ppo', 'sir_re1-8'] 38 | elif 'particle_random1.0' in folder_name: 39 | subfolders = ['ppo', 'sir_re1-8'] 40 | elif 'maze' in folder_name: 41 | subfolders = ['ppo', 'sir_re2'] 42 | for subfolder in subfolders: 43 | for i in range(3): 44 | if not os.path.exists(os.path.join(folder_name, subfolder, str(i), 'progress.csv')): 45 | continue 46 | progress_file = 
os.path.join(folder_name, subfolder, str(i), 'progress.csv') 47 | # eval_file = os.path.join(folder_name, subfolder, str(i), 'eval.csv') 48 | # raw_success_rate = get_item(progress_file, 'ep_reward_mean') 49 | # raw_total_timesteps = get_item(progress_file, 'total_timesteps') 50 | raw_len_mean = get_item(progress_file, 'ep_len_mean') 51 | raw_iterations = get_item(progress_file, 'n_updates') 52 | # raw_eval_reward = get_item(eval_file, 'mean_eval_reward') 53 | # sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, fill_value="extrapolate") 54 | # eval_f = interpolate.interp1d(raw_total_timesteps, raw_eval_reward, fill_value="extrapolate") 55 | # timesteps = np.arange(0, max_timesteps[env_name], max_timesteps[env_name] // 500) 56 | # print(timesteps[0], timesteps[-1], raw_total_timesteps[0], raw_total_timesteps[-1]) 57 | # success_rate = sr_f(timesteps) 58 | # eval_reward = eval_f(timesteps) 59 | L = max_iterationss[env_name] 60 | iterations = smooth(raw_iterations[:L], 20) 61 | len_mean = smooth(raw_len_mean[:L], 20) 62 | # timesteps = smooth(timesteps, 20) 63 | # success_rate = smooth(success_rate, 20) 64 | # eval_reward = smooth(eval_reward, 20) 65 | df_iteration.append(iterations) 66 | df_len_mean.append(len_mean) 67 | # df_timesteps.append(timesteps) 68 | # df_sr.append(success_rate) 69 | # df_eval.append(eval_reward) 70 | # df_legend.append(np.array([subfolder.upper()] * len(timesteps))) 71 | 72 | 73 | # eval_iteration = smooth(raw_eval_reward[:L], 20) 74 | # df_iteration.append(iterations) 75 | # df_eval_iteration.append(eval_iteration) 76 | df_legend_iteration.append(np.array([subfolder.upper()] * len(iterations))) 77 | # df_timesteps = np.concatenate(df_timesteps, axis=0).tolist() 78 | # df_sr = np.concatenate(df_sr, axis=0).tolist() 79 | # df_eval = np.concatenate(df_eval, axis=0).tolist() 80 | # df_legend = np.concatenate(df_legend, axis=0).tolist() 81 | df_iteration = np.concatenate(df_iteration, axis=0).tolist() 82 | df_len_mean = np.concatenate(df_len_mean, axis=0).tolist() 83 | # df_eval_iteration = np.concatenate(df_eval_iteration, axis=0).tolist() 84 | df_legend_iteration = np.concatenate(df_legend_iteration, axis=0).tolist() 85 | # data = {'samples': df_timesteps, 'success_rate': df_sr, 'algo': df_legend} 86 | # sr_timesteps = pandas.DataFrame(data) 87 | # data = {'samples': df_timesteps, 'eval': df_eval, 'algo': df_legend} 88 | # eval_timesteps = pandas.DataFrame(data) 89 | data = {'iterations': df_iteration, 'len_mean': df_len_mean, 'algo': df_legend_iteration} 90 | len_mean_iteration = pandas.DataFrame(data) 91 | 92 | wspace = .3 93 | bottom = .3 94 | margin = .1 95 | left = .18 96 | width = 1.2 / ((1. - left) / (2. + wspace + margin / 2)) 97 | height = 1.5 / ((1. 
- bottom) / (1 + margin / 2)) 98 | 99 | plt.style.use("ggplot") 100 | # plt.rcParams.update({'legend.fontsize': 14}) 101 | p = sns.color_palette() 102 | sns.set_palette([p[0], p[1]]) 103 | f, axes = plt.subplots(1, 1, figsize=(width, height)) 104 | # sns.lineplot(x='samples', y='success_rate', hue='algo', ax=axes[0], data=sr_timesteps) 105 | # axes[0].set_xlabel('samples') 106 | # axes[0].set_ylabel('success_rate') 107 | # axes[0].get_legend().remove() 108 | # sns.lineplot(x='samples', y='eval', hue='algo', ax=axes[1], data=eval_timesteps) 109 | # axes[1].set_xlabel('samples') 110 | # axes[1].set_ylabel('') 111 | # axes[1].get_legend().remove() 112 | sns.lineplot(x='iterations', y='len_mean', hue='algo', ax=axes, data=len_mean_iteration) 113 | axes.set_xlabel('iterations') 114 | axes.set_ylabel('episode length') 115 | axes.get_legend().remove() 116 | handles, labels = axes.get_legend_handles_labels() 117 | # if mode == 'train': 118 | # sns.lineplot(x='samples', y='success_rate', hue='algo', data=sr_timesteps) 119 | # axes.set_xlabel('samples') 120 | # elif mode == 'hard': 121 | # sns.lineplot(x='samples', y='eval', hue='algo', data=eval_timesteps) 122 | # axes.set_xlabel('samples') 123 | # elif mode == 'iteration': 124 | # sns.lineplot(x='iterations', y='eval', hue='algo', ax=axes, data=eval_iteration) 125 | # axes.set_xlabel('iterations') 126 | # axes.set_ylabel('success rate') 127 | # axes.get_legend().remove() 128 | # handles, labels = axes.get_legend_handles_labels() 129 | f.legend(handles[1:], ['PPO', 'SIR'], loc="upper right", ncol=1, title='') 130 | f.subplots_adjust(top=1. - margin / height, bottom=0.2, wspace=wspace, left=left, right=1. - margin / width) 131 | plt.savefig(os.path.join(folder_name, '../', os.path.basename(folder_name) + '_len.pdf')) 132 | # plt.show() 133 | -------------------------------------------------------------------------------- /plot/plot_experiment.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas 3 | import numpy as np 4 | import sys, os 5 | from scipy import interpolate 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def get_item(log_file, label): 10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True) 11 | return data[label].values 12 | 13 | 14 | def smooth(array, window): 15 | out = np.zeros(array.shape[0] - window) 16 | for i in range(out.shape[0]): 17 | out[i] = np.mean(array[i:i + window]) 18 | return out 19 | 20 | 21 | if __name__ == '__main__': 22 | folder_name = sys.argv[1] 23 | env_name = sys.argv[2] 24 | assert env_name in ['push', 'particle', 'maze', 'stacking'] 25 | # assert mode in ['train', 'hard', 'iteration'] 26 | max_timesteps = {'push': 3e7, 27 | 'particle': 3.0e8, 28 | 'maze': 1.5e6, 29 | 'stacking': 2e8,} 30 | max_iterationss = {'push': 750, 31 | 'particle': 550, 32 | 'maze': 245,} 33 | df_timesteps, df_sr, df_eval, df_legend, df_iteration, df_eval_iteration, df_legend_iteration = [], [], [], [], [], [], [] 34 | subfolders = ['ppo', 'sir', 'sil', 'ds'] 35 | for subfolder in subfolders: 36 | last_sr = [] 37 | last_eval = [] 38 | for i in range(3): 39 | if not os.path.exists(os.path.join(folder_name, subfolder, str(i), 'progress.csv')): 40 | continue 41 | progress_file = os.path.join(folder_name, subfolder, str(i), 'progress.csv') 42 | eval_file = os.path.join(folder_name, subfolder, str(i), 'eval.csv') 43 | if 'ds' in subfolder: 44 | raw_success_rate = get_item(progress_file, 'ep_success_rate') 45 | else: 46 | 
raw_success_rate = get_item(progress_file, 'ep_reward_mean') 47 | raw_total_timesteps = get_item(progress_file, 'total_timesteps') 48 | raw_eval_reward = get_item(eval_file, 'mean_eval_reward') 49 | print(raw_total_timesteps.shape, raw_eval_reward.shape) 50 | sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, fill_value="extrapolate") 51 | eval_f = interpolate.interp1d(raw_total_timesteps, raw_eval_reward, fill_value="extrapolate") 52 | timesteps = np.arange(0, max_timesteps[env_name], max_timesteps[env_name] // 500) 53 | print(timesteps[0], timesteps[-1], raw_total_timesteps[0], raw_total_timesteps[-1]) 54 | success_rate = sr_f(timesteps) 55 | eval_reward = eval_f(timesteps) 56 | timesteps = smooth(timesteps, 20) 57 | success_rate = smooth(success_rate, 20) 58 | eval_reward = smooth(eval_reward, 20) 59 | df_timesteps.append(timesteps) 60 | df_sr.append(success_rate) 61 | df_eval.append(eval_reward) 62 | last_sr.append(success_rate[-1]) 63 | last_eval.append(eval_reward[-1]) 64 | df_legend.append(np.array([subfolder.upper()] * len(timesteps))) 65 | 66 | raw_iterations = get_item(progress_file, 'n_updates') 67 | L = max_iterationss[env_name] 68 | iterations = smooth(raw_iterations[:L], 20) 69 | eval_iteration = smooth(raw_eval_reward[:L], 20) 70 | df_iteration.append(iterations) 71 | df_eval_iteration.append(eval_iteration) 72 | df_legend_iteration.append(np.array([subfolder.upper()] * len(iterations))) 73 | print(subfolder, 'sr', np.mean(last_sr), 'eval', np.mean(last_eval)) 74 | df_timesteps = np.concatenate(df_timesteps, axis=0).tolist() 75 | df_sr = np.concatenate(df_sr, axis=0).tolist() 76 | df_eval = np.concatenate(df_eval, axis=0).tolist() 77 | df_legend = np.concatenate(df_legend, axis=0).tolist() 78 | df_iteration = np.concatenate(df_iteration, axis=0).tolist() 79 | df_eval_iteration = np.concatenate(df_eval_iteration, axis=0).tolist() 80 | df_legend_iteration = np.concatenate(df_legend_iteration, axis=0).tolist() 81 | data = {'samples': df_timesteps, 'success_rate': df_sr, 'algo': df_legend} 82 | sr_timesteps = pandas.DataFrame(data) 83 | data = {'samples': df_timesteps, 'eval': df_eval, 'algo': df_legend} 84 | eval_timesteps = pandas.DataFrame(data) 85 | data = {'iterations': df_iteration, 'eval': df_eval_iteration, 'algo': df_legend_iteration} 86 | eval_iteration = pandas.DataFrame(data) 87 | 88 | wspace = .3 89 | bottom = .3 90 | margin = .1 91 | # left = .08 92 | left = .1 93 | width = 2.15 / ((1. - left) / (2 + wspace + margin / 2)) 94 | height = 1.5 / ((1. - bottom) / (1 + margin / 2)) 95 | 96 | plt.style.use("ggplot") 97 | # plt.rcParams.update({'legend.fontsize': 14}) 98 | p = sns.color_palette() 99 | sns.set_palette([p[i] for i in range(len(subfolders))]) 100 | f, axes = plt.subplots(1, 2, figsize=(width, height)) 101 | sns.lineplot(x='samples', y='success_rate', hue='algo', ax=axes[0], data=sr_timesteps) 102 | axes[0].set_xlabel('samples') 103 | axes[0].set_ylabel('avg. succ. rate') 104 | axes[0].get_legend().remove() 105 | sns.lineplot(x='samples', y='eval', hue='algo', ax=axes[1], data=eval_timesteps) 106 | axes[1].set_xlabel('samples') 107 | axes[1].set_ylabel('hard succ. 
rate') 108 | axes[1].get_legend().remove() 109 | # sns.lineplot(x='iterations', y='eval', hue='algo', ax=axes[2], data=eval_iteration) 110 | # axes[2].set_xlabel('iterations') 111 | # axes[2].set_ylabel('') 112 | # axes[2].get_legend().remove() 113 | handles, labels = axes[1].get_legend_handles_labels() 114 | # if mode == 'train': 115 | # sns.lineplot(x='samples', y='success_rate', hue='algo', data=sr_timesteps) 116 | # axes.set_xlabel('samples') 117 | # elif mode == 'hard': 118 | # sns.lineplot(x='samples', y='eval', hue='algo', data=eval_timesteps) 119 | # axes.set_xlabel('samples') 120 | # elif mode == 'iteration': 121 | # sns.lineplot(x='iterations', y='eval', hue='algo', ax=axes, data=eval_iteration) 122 | # axes.set_xlabel('iterations') 123 | # axes.set_ylabel('success rate') 124 | # axes.get_legend().remove() 125 | # handles, labels = axes.get_legend_handles_labels() 126 | # f.legend(handles[:], ['PPO', 'SIR', 'SIL', 'DS'], loc="lower right", ncol=1, bbox_to_anchor=(0.49, 0.18), title='') 127 | f.legend(handles[:], ['PPO', 'SIR', 'SIL'], loc="lower right", ncol=1, bbox_to_anchor=(0.49, 0.18), title='') 128 | f.subplots_adjust(top=1. - margin / height, bottom=0.21, wspace=wspace, left=left, right=1. - margin / width) 129 | plt.savefig(os.path.join(folder_name, '../', os.path.basename(folder_name) + '.pdf')) 130 | print(os.path.join(folder_name, '../', os.path.basename(folder_name) + '.pdf')) 131 | plt.show() 132 | -------------------------------------------------------------------------------- /plot/plot_sac_experiment_maze.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas 3 | import numpy as np 4 | import sys, os 5 | from scipy import interpolate 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def get_item(log_file, label): 10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True) 11 | return data[label].values 12 | 13 | 14 | def smooth(array, window): 15 | out = np.zeros(array.shape[0] - window) 16 | for i in range(out.shape[0]): 17 | out[i] = np.mean(array[i:i + window]) 18 | return out 19 | 20 | 21 | if __name__ == '__main__': 22 | folder_name = sys.argv[1] 23 | env_name = sys.argv[2] 24 | assert env_name in ['umaze', 'maze_ego', 'maze_box'] 25 | max_timesteps = {'umaze': 1e5, 'maze_ego': 2.5e7, 'maze_box': 4.9e7} 26 | df_timesteps, df_sr, df_eval, df_legend, df_iteration, df_success_rate_iteration, df_legend_iteration = [], [], [], [], [], [], [] 27 | subfolders = ['sir', 'hiro', 'dsc'] 28 | if env_name == 'umaze': 29 | for subfolder in subfolders: 30 | last_sr = [] 31 | for i in range(3): 32 | if not os.path.exists(os.path.join(folder_name, subfolder, str(i), 'progress.csv')): 33 | continue 34 | progress_file = os.path.join(folder_name, subfolder, str(i), 'progress.csv') 35 | if subfolder == 'hiro': 36 | raw_success_rate = get_item(progress_file, 'Value') 37 | raw_total_timesteps = get_item(progress_file, 'Step') 38 | elif subfolder == 'dsc': 39 | presmooth_success_rate = get_item(progress_file, 'Value') 40 | raw_success_rate = np.zeros_like(presmooth_success_rate) 41 | for j in range(presmooth_success_rate.shape[0]): 42 | raw_success_rate[j] = np.mean(presmooth_success_rate[max(j - 100 + 1, 0): j + 1]) 43 | raw_total_timesteps = get_item(progress_file, 'Step') 44 | else: 45 | raw_success_rate = get_item(progress_file, 'ep_rewmean') 46 | raw_total_timesteps = get_item(progress_file, 'total timesteps') 47 | sr_f = interpolate.interp1d(raw_total_timesteps, 
raw_success_rate, fill_value="extrapolate") 48 | timesteps = np.arange(600, max_timesteps[env_name], max_timesteps[env_name] // 50) 49 | print(timesteps[0], timesteps[-1], raw_total_timesteps[0], raw_total_timesteps[-1]) 50 | success_rate = sr_f(timesteps) 51 | # timesteps = smooth(timesteps, 20) 52 | # success_rate = smooth(success_rate, 20) 53 | # eval_reward = smooth(eval_reward, 20) 54 | df_timesteps.append(timesteps) 55 | df_sr.append(success_rate) 56 | last_sr.append(success_rate[-1]) 57 | # df_eval.append(eval_reward) 58 | df_legend.append(np.array([subfolder.upper()] * len(timesteps))) 59 | 60 | print(subfolder, np.mean(last_sr)) 61 | else: 62 | for subfolder in subfolders: 63 | last_sr = [] 64 | for i in range(3): 65 | if not os.path.exists(os.path.join(folder_name, subfolder, str(i), 'eval_box.csv')): 66 | continue 67 | eval_file = os.path.join(folder_name, subfolder, str(i), 'eval.csv' if env_name == 'maze_ego' else 'eval_box.csv') 68 | if subfolder == 'hiro': 69 | raw_success_rate = get_item(eval_file, 'Value') 70 | raw_total_timesteps = get_item(eval_file, 'Step') 71 | sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, fill_value="extrapolate") 72 | elif subfolder == 'dsc': 73 | raw_success_rate = get_item(eval_file, 'success_rate') 74 | raw_total_timesteps = get_item(eval_file, 'timesteps') 75 | sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, bounds_error=False) 76 | else: 77 | progress_file = os.path.join(folder_name, subfolder, str(i), 'progress.csv') 78 | raw_total_timesteps = get_item(progress_file, 'total timesteps') 79 | if subfolder == 'sir': 80 | original_timesteps = get_item(progress_file, 'original_timesteps') 81 | else: 82 | original_timesteps = raw_total_timesteps 83 | expand_fn = interpolate.interp1d(original_timesteps, raw_total_timesteps, fill_value="extrapolate") 84 | success_rate = get_item(eval_file, 'mean_eval_reward') 85 | eval_steps = get_item(eval_file, 'n_updates') 86 | eval_steps = expand_fn(eval_steps) 87 | sr_f = interpolate.interp1d(eval_steps, success_rate, fill_value="extrapolate") 88 | timesteps = np.arange(0, max_timesteps[env_name], max_timesteps[env_name] // 250) 89 | print(timesteps[0], timesteps[-1], raw_total_timesteps[0], raw_total_timesteps[-1]) 90 | success_rate = sr_f(timesteps) 91 | timesteps = smooth(timesteps, 20) 92 | success_rate = smooth(success_rate, 20) 93 | # eval_reward = smooth(eval_reward, 20) 94 | df_timesteps.append(timesteps) 95 | df_sr.append(success_rate) 96 | last_sr.append(success_rate[-1]) 97 | # df_eval.append(eval_reward) 98 | df_legend.append(np.array([subfolder.upper()] * len(timesteps))) 99 | 100 | print(subfolder, np.mean(last_sr)) 101 | 102 | df_timesteps = np.concatenate(df_timesteps, axis=0).tolist() 103 | df_sr = np.concatenate(df_sr, axis=0).tolist() 104 | # df_eval = np.concatenate(df_eval, axis=0).tolist() 105 | df_legend = np.concatenate(df_legend, axis=0).tolist() 106 | data = {'samples': df_timesteps, 'success_rate': df_sr, 'algo': df_legend} 107 | sr_timesteps = pandas.DataFrame(data) 108 | 109 | wspace = .3 110 | bottom = .3 111 | margin = .1 112 | left = .1 113 | width = 1.5 / ((1. - left) / (2 + wspace + margin / 2)) 114 | height = 1.5 / ((1. 
- bottom) / (1 + margin / 2)) 115 | 116 | plt.style.use("ggplot") 117 | # plt.rcParams.update({'legend.fontsize': 14}) 118 | p = sns.color_palette() 119 | sns.set_palette([p[i] for i in range(len(subfolders))]) 120 | f, axes = plt.subplots(1, 1, figsize=(width, height)) 121 | sns.lineplot(x='samples', y='success_rate', hue='algo', ax=axes, data=sr_timesteps) 122 | axes.set_xlabel('samples') 123 | axes.set_ylabel('success_rate') 124 | axes.xaxis.get_major_formatter().set_powerlimits((0, 1)) 125 | axes.get_legend().remove() 126 | 127 | handles, labels = axes.get_legend_handles_labels() 128 | 129 | f.legend(handles[:], ['SIR', 'HIRO', 'DSC'], loc="lower right", ncol=1, bbox_to_anchor=(0.99, 0.18), title='') 130 | f.subplots_adjust(top=1. - margin / height, bottom=0.21, wspace=wspace, left=left, right=1. - margin / width) 131 | plt.savefig(os.path.join(folder_name, '../', os.path.basename(folder_name) + env_name + '.pdf')) 132 | plt.show() 133 | -------------------------------------------------------------------------------- /plot/visualize_sac_value.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from matplotlib import cm 5 | from utils.make_env_utils import make_env, get_env_kwargs 6 | from baselines import HER2 7 | 8 | 9 | def gen_value_with_obstacle(obs, model, env_hyperparam): 10 | obstacle_xpos, obstacle_ypos = np.meshgrid(np.linspace(env_hyperparam['xlim'][0], env_hyperparam['xlim'][1], 21), 11 | np.linspace(env_hyperparam['ylim'][0], env_hyperparam['ylim'][1], 21)) 12 | grid_shape = obstacle_xpos.shape 13 | _obstacle_xpos = np.reshape(obstacle_xpos, (-1, 1)) 14 | _obstacle_ypos = np.reshape(obstacle_ypos, (-1, 1)) 15 | batch_obs = np.tile(obs, (_obstacle_xpos.shape[0], 1)) 16 | batch_obs[:, 6] = _obstacle_xpos[:, 0] 17 | batch_obs[:, 7] = _obstacle_ypos[:, 0] 18 | batch_obs[:, 12] = batch_obs[:, 6] - batch_obs[:, 0] 19 | batch_obs[:, 13] = batch_obs[:, 7] - batch_obs[:, 1] 20 | # Compute value2 21 | batch_value = model.model.sess.run(model.model.step_ops[6], 22 | {model.model.observations_ph: batch_obs}) 23 | grid_value = np.reshape(batch_value, grid_shape) 24 | 25 | # Compute value1 26 | subgoal_obs = np.tile(obs, (_obstacle_xpos.shape[0], 1)) 27 | # Achieved goal (current obstacle pos) 28 | subgoal_obs[:, -10: -7] = subgoal_obs[:, 6: 9] 29 | subgoal_obs[:, -7: -5] = np.array([[0., 1.]]) 30 | # Desired goal (sampled perturbed obstacle pos) 31 | obstacle_xy = np.concatenate([_obstacle_xpos, _obstacle_ypos, subgoal_obs[:, 8:9]], axis=-1) 32 | subgoal_obs[:, -5: -2] = obstacle_xy 33 | subgoal_obs[:, -2: ] = np.array([[0., 1.]]) 34 | # Value1 aim to answer if the subgoal is easy to achieve 35 | value1 = model.model.sess.run(model.model.step_ops[6], 36 | {model.model.observations_ph: subgoal_obs}) 37 | grid_value1 = np.reshape(value1, grid_shape) 38 | 39 | # min_value = np.min(np.concatenate([np.expand_dims(value1, 1), np.expand_dims(batch_value,1)], axis=1), axis=1) 40 | # grid_value_min = np.reshape(min_value, grid_shape) 41 | normalized_value1 = (value1 - np.min(value1)) / (np.max(value1) - np.min(value1)) 42 | normalized_value2 = (batch_value - np.min(batch_value)) / (np.max(batch_value) - np.min(batch_value)) 43 | value_prod = normalized_value1 * normalized_value2 44 | grid_value_prod = np.reshape(value_prod, grid_shape) 45 | 46 | return obstacle_xpos, obstacle_ypos, grid_value, grid_value1, grid_value_prod 47 | 48 | 49 | if __name__ == '__main__': 50 | if 
len(sys.argv) < 2: 51 | print('Usage: python -m plot.visualize_sac_value [load_path]') 52 | load_path = sys.argv[1] 53 | env_name = 'FetchPushWallObstacle-v4' 54 | env_kwargs = get_env_kwargs(env_name, random_ratio=0.0, reward_type="sparse") 55 | env_hyperparam = dict(xlim=(1.05, 1.55), ylim=(0.4, 1.1)) 56 | n_cpu = 1 57 | env = make_env(env_id=env_name, rank=0, log_dir=None,flatten_dict=True, kwargs=env_kwargs) 58 | 59 | model = HER2.load(load_path) 60 | fig, ax = plt.subplots(1, 1, figsize=(8, 8)) 61 | plt.rcParams.update({'font.size': 20, 'xtick.labelsize': 20, 'ytick.labelsize': 20, 62 | 'axes.labelsize': 20}) 63 | obs = env.reset() 64 | while not (obs[4] > 0.70 and obs[4] < 0.80 and obs[3] < 1.5): 65 | obs = env.reset() 66 | env.set_goal(np.array([1.195, 0.75, 0.425, 1, 0])) 67 | obs = env.get_obs() 68 | obs = np.concatenate([obs[key] for key in ['observation', 'achieved_goal', 'desired_goal']]) 69 | img = env.render(mode='rgb_array') 70 | xs, ys, zs, value1s, value_prods = gen_value_with_obstacle(obs, model, env_hyperparam) 71 | print('gripper', obs[:3], 'box', obs[3:6], 'obstacle', obs[6:9], ) 72 | np.save('xs.npy', xs) 73 | np.save('ys.npy', ys) 74 | np.save('value1.npy', value1s) 75 | np.save('value2.npy', zs) 76 | np.save('value_prod.npy', value_prods) 77 | plt.imsave(os.path.join(os.path.dirname(load_path), 'obs.png'), img) 78 | 79 | ax.cla() 80 | surf = ax.contourf((xs - 1.05) / 0.5, (ys - 0.4) / 0.7, value_prods, 15, cmap=cm.coolwarm, vmin=-0.0, vmax=1) 81 | ax.set_xlabel('x', fontsize=24) 82 | ax.set_ylabel('y', fontsize=24) 83 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [0, (0.65 - 0.4) / 0.7], 'k', linestyle='--') 84 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [(0.85 - 0.4) / 0.7, (1.1 - 0.4) / 0.7], 'k', linestyle='--') 85 | ax.axis([0., 1., 0., 1.]) 86 | cb = plt.colorbar(surf) 87 | plt.tight_layout() 88 | plt.savefig('value_prod.png') 89 | ax.set_title("Value prod") 90 | plt.show() 91 | 92 | fig, ax = plt.subplots(1, 1, figsize=(8, 8)) 93 | surf = ax.contourf((xs - 1.05) / 0.5, (ys - 0.4) / 0.7, value1s, 15, cmap=cm.coolwarm, vmin=-0.0, vmax=1) 94 | ax.set_xlabel('x', fontsize=24) 95 | ax.set_ylabel('y', fontsize=24) 96 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [0, (0.65 - 0.4) / 0.7], 'k', linestyle='--') 97 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [(0.85 - 0.4) / 0.7, (1.1 - 0.4) / 0.7], 'k', linestyle='--') 98 | ax.axis([0., 1., 0., 1.]) 99 | cb = plt.colorbar(surf) 100 | plt.tight_layout() 101 | plt.savefig('value1.png') 102 | ax.set_title("Value 1") 103 | plt.show() 104 | 105 | fig, ax = plt.subplots(1, 1, figsize=(8, 8)) 106 | surf = ax.contourf((xs - 1.05) / 0.5, (ys - 0.4) / 0.7, zs, 15, cmap=cm.coolwarm, vmin=-0.0, vmax=1) 107 | ax.set_xlabel('x', fontsize=24) 108 | ax.set_ylabel('y', fontsize=24) 109 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [0, (0.65 - 0.4) / 0.7], 'k', linestyle='--') 110 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [(0.85 - 0.4) / 0.7, (1.1 - 0.4) / 0.7], 'k', linestyle='--') 111 | ax.axis([0., 1., 0., 1.]) 112 | cb = plt.colorbar(surf) 113 | plt.tight_layout() 114 | plt.savefig('value2.png') 115 | ax.set_title("Value 2") 116 | plt.show() 117 | 118 | fig, ax = plt.subplots(1, 1, figsize=(8, 8)) 119 | surf = ax.contourf((xs - 1.05) / 0.5, (ys - 0.4) / 0.7, (value1s + zs) / 2, 15, cmap=cm.coolwarm, vmin=-0.0, vmax=1) 120 | ax.set_xlabel('x', fontsize=24) 121 | ax.set_ylabel('y', fontsize=24) 122 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [0, (0.65 - 0.4) / 0.7], 'k', 
linestyle='--') 123 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [(0.85 - 0.4) / 0.7, (1.1 - 0.4) / 0.7], 'k', linestyle='--') 124 | ax.axis([0., 1., 0., 1.]) 125 | cb = plt.colorbar(surf) 126 | plt.tight_layout() 127 | plt.savefig('value_ave.png') 128 | ax.set_title("Value average") 129 | plt.show() 130 | 131 | fig, ax = plt.subplots(1, 1, figsize=(8, 8)) 132 | surf = ax.contourf((xs - 1.05) / 0.5, (ys - 0.4) / 0.7, np.min(np.array([value1s, zs]), axis=0), 15, cmap=cm.coolwarm, vmin=-0.0, vmax=1) 133 | ax.set_xlabel('x', fontsize=24) 134 | ax.set_ylabel('y', fontsize=24) 135 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [0, (0.65 - 0.4) / 0.7], 'k', linestyle='--') 136 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [(0.85 - 0.4) / 0.7, (1.1 - 0.4) / 0.7], 'k', linestyle='--') 137 | ax.axis([0., 1., 0., 1.]) 138 | cb = plt.colorbar(surf) 139 | plt.tight_layout() 140 | plt.savefig('value_min.png') 141 | ax.set_title("Value min") 142 | plt.show() 143 | -------------------------------------------------------------------------------- /plot/plot_sac_experiment.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas 3 | import numpy as np 4 | import sys, os 5 | from scipy import interpolate 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def get_item(log_file, label): 10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True) 11 | return data[label].values 12 | 13 | 14 | def smooth(array, window): 15 | out = np.zeros(array.shape[0] - window) 16 | for i in range(out.shape[0]): 17 | out[i] = np.mean(array[i:i + window]) 18 | return out 19 | 20 | 21 | if __name__ == '__main__': 22 | folder_name = sys.argv[1] 23 | env_name = sys.argv[2] 24 | assert env_name in ['push', 'stack2', 'stack3', 'particle'] 25 | # assert mode in ['train', 'hard', 'iteration'] 26 | max_timesteps = {'push': 1.45e7, 27 | # 'stack2': 2.8e7, 28 | 'stack2': 2.3e7, 29 | 'stack3': 1e8, 30 | 'particle': 9.5e7, 31 | } 32 | max_iterationss = {'push': 440000, 33 | 'stack2': 8.9e5, 34 | 'stack3': 2.5e6, 35 | } 36 | df_timesteps, df_sr, df_eval, df_legend, df_iteration, df_eval_iteration, df_legend_iteration = [], [], [], [], [], [], [] 37 | subfolders = ['sac', 'sir', 'sil', 'ds'] 38 | if 'particle' in folder_name: 39 | subfolders = ['sac', 'sir', 'sil'] 40 | elif 'push_random0.7' in folder_name: 41 | subfolders = ['sac', 'sir', 'sil', 'ds2'] 42 | elif 'push_random1.0' in folder_name: 43 | subfolders = ['sac', 'sir', 'sil', 'ds'] 44 | elif 'stack_2obj' in folder_name or 'stack_3obj' in folder_name: 45 | subfolders = ['sac', 'sir_noknowledge', 'sil', 'ds'] 46 | for subfolder in subfolders: 47 | last_sr = [] 48 | last_eval = [] 49 | for i in range(4): 50 | if not os.path.exists(os.path.join(folder_name, subfolder, str(i), 'progress.csv')): 51 | continue 52 | progress_file = os.path.join(folder_name, subfolder, str(i), 'progress.csv') 53 | eval_file = os.path.join(folder_name, subfolder, str(i), 'eval.csv') 54 | if subfolder is 'ds' or subfolder is 'ds2': 55 | raw_success_rate = get_item(progress_file, 'success rate') 56 | else: 57 | raw_success_rate = get_item(progress_file, 'ep_rewmean') 58 | raw_total_timesteps = get_item(progress_file, 'total timesteps') 59 | try: 60 | raw_original_timesteps = get_item(progress_file, 'original_timesteps') 61 | except KeyError: 62 | raw_original_timesteps = raw_total_timesteps 63 | raw_eval_timesteps = get_item(eval_file, 'n_updates') 64 | raw_eval_reward = get_item(eval_file, 
'mean_eval_reward') 65 | print(raw_total_timesteps.shape, raw_eval_reward.shape) 66 | sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, fill_value="extrapolate") 67 | eval_f = interpolate.interp1d(raw_eval_timesteps, raw_eval_reward, fill_value="extrapolate") 68 | step_shrink_fn = interpolate.interp1d(raw_total_timesteps, raw_original_timesteps, fill_value="extrapolate") 69 | 70 | timesteps = np.arange(0, max_timesteps[env_name], max_timesteps[env_name] // 500) 71 | print(timesteps[0], timesteps[-1], raw_total_timesteps[0], raw_total_timesteps[-1]) 72 | success_rate = sr_f(timesteps) 73 | eval_reward = eval_f(step_shrink_fn(timesteps)) 74 | timesteps = smooth(timesteps, 50) 75 | success_rate = smooth(success_rate, 50) 76 | eval_reward = smooth(eval_reward, 50) 77 | df_timesteps.append(timesteps) 78 | df_sr.append(success_rate) 79 | df_eval.append(eval_reward) 80 | last_sr.append(success_rate[-1]) 81 | last_eval.append(eval_reward[-1]) 82 | df_legend.append(np.array([subfolder.upper()] * len(timesteps))) 83 | 84 | # raw_iterations = get_item(progress_file, 'n_updates') 85 | # iter_step_convert_fn = interpolate.interp1d(raw_iterations, raw_original_timesteps, fill_value="extrapolate") 86 | # iterations = np.arange(0, max_iterationss[env_name], max_iterationss[env_name] // 500) 87 | # eval_iteration = eval_f(iter_step_convert_fn(iterations)) 88 | # iterations = smooth(iterations, 50) 89 | # eval_iteration = smooth(eval_iteration, 50) 90 | # df_iteration.append(iterations) 91 | # df_eval_iteration.append(eval_iteration) 92 | # df_legend_iteration.append(np.array([subfolder.upper()] * len(iterations))) 93 | print(subfolder, 'sr', np.mean(last_sr), 'eval', np.mean(last_eval)) 94 | df_timesteps = np.concatenate(df_timesteps, axis=0).tolist() 95 | df_sr = np.concatenate(df_sr, axis=0).tolist() 96 | df_eval = np.concatenate(df_eval, axis=0).tolist() 97 | df_legend = np.concatenate(df_legend, axis=0).tolist() 98 | # df_iteration = np.concatenate(df_iteration, axis=0).tolist() 99 | # df_eval_iteration = np.concatenate(df_eval_iteration, axis=0).tolist() 100 | # df_legend_iteration = np.concatenate(df_legend_iteration, axis=0).tolist() 101 | data = {'samples': df_timesteps, 'success_rate': df_sr, 'algo': df_legend} 102 | sr_timesteps = pandas.DataFrame(data) 103 | data = {'samples': df_timesteps, 'eval': df_eval, 'algo': df_legend} 104 | eval_timesteps = pandas.DataFrame(data) 105 | # data = {'iterations': df_iteration, 'eval': df_eval_iteration, 'algo': df_legend_iteration} 106 | # eval_iteration = pandas.DataFrame(data) 107 | 108 | wspace = .3 109 | bottom = .3 110 | margin = .1 111 | # left = .08 112 | left = .1 113 | # width = 3.5 / ((1. - left) / (2 + wspace + margin / 2)) 114 | width = 2.15 / ((1. - left) / (2 + wspace + margin / 2)) 115 | height = 1.5 / ((1. - bottom) / (1 + margin / 2)) 116 | 117 | plt.style.use("ggplot") 118 | # plt.rcParams.update({'legend.fontsize': 14}) 119 | p = sns.color_palette() 120 | sns.set_palette([p[i] for i in range(len(subfolders))]) 121 | # f, axes = plt.subplots(1, 3, figsize=(width, height)) 122 | f, axes = plt.subplots(1, 2, figsize=(width, height)) 123 | sns.lineplot(x='samples', y='success_rate', hue='algo', ax=axes[0], data=sr_timesteps) 124 | axes[0].set_xlabel('samples') 125 | axes[0].set_ylabel('avg. succ. rate') 126 | axes[0].get_legend().remove() 127 | sns.lineplot(x='samples', y='eval', hue='algo', ax=axes[1], data=eval_timesteps) 128 | axes[1].set_xlabel('samples') 129 | axes[1].set_ylabel('hard succ. 
rate') 130 | axes[1].get_legend().remove() 131 | # sns.lineplot(x='iterations', y='eval', hue='algo', ax=axes[2], data=eval_iteration) 132 | # axes[2].xaxis.get_major_formatter().set_powerlimits((0, 1)) 133 | # axes[2].set_xlabel('iterations') 134 | # axes[2].set_ylabel('') 135 | # axes[2].get_legend().remove() 136 | handles, labels = axes[1].get_legend_handles_labels() 137 | print(handles) 138 | 139 | f.legend(handles[:], ['SAC', 'SIR', 'SIL', 'DS'][:len(subfolders)], loc="lower right", ncol=1, bbox_to_anchor=(0.49, 0.18), title='') 140 | f.subplots_adjust(top=1. - margin / height, bottom=0.21, wspace=wspace, left=left, right=1. - margin / width) 141 | plt.savefig(os.path.join(folder_name, '../', os.path.basename(folder_name) + 'clean.pdf')) 142 | plt.show() 143 | -------------------------------------------------------------------------------- /run_ppo.py: -------------------------------------------------------------------------------- 1 | from baselines import PPO2 2 | from stable_baselines.common.policies import register_policy 3 | from stable_baselines.common import set_global_seeds 4 | from stable_baselines.common.vec_env import SubprocVecEnv 5 | from utils.log_utils import eval_model, log_eval, stack_eval_model 6 | 7 | from utils.make_env_utils import make_env, configure_logger, get_env_kwargs, get_policy_kwargs, get_train_kwargs, \ 8 | get_num_workers 9 | import numpy as np 10 | 11 | import os, time, argparse 12 | import matplotlib.pyplot as plt 13 | 14 | 15 | def arg_parse(): 16 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 17 | parser.add_argument('--env', default='FetchPushWallObstacle-v4') 18 | parser.add_argument('--policy', type=str, default='MlpPolicy') 19 | parser.add_argument('--seed', type=int, default=42) 20 | parser.add_argument('--num_timesteps', type=float, default=1e8) 21 | parser.add_argument('--reward_type', type=str, default='sparse') 22 | parser.add_argument('--n_object', type=int, default=2) # Only used for stacking 23 | parser.add_argument('--log_path', default=None, type=str) 24 | parser.add_argument('--load_path', default=None, type=str) 25 | parser.add_argument('--random_ratio', default=1.0, type=float) 26 | parser.add_argument('--curriculum', action="store_true", default=False) 27 | parser.add_argument('--sequential', action="store_true", default=False) 28 | parser.add_argument('--gamma', default=0.99, type=float) 29 | parser.add_argument('--play', action="store_true", default=False) 30 | parser.add_argument('--export_video', action="store_true", default=False) 31 | args = parser.parse_args() 32 | return args 33 | 34 | 35 | def main(args): 36 | log_dir = args.log_path if (args.log_path is not None) else \ 37 | "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S') 38 | configure_logger(log_dir) 39 | 40 | set_global_seeds(args.seed) 41 | 42 | n_cpu = get_num_workers(args.env) if not args.play else 1 43 | env_kwargs = get_env_kwargs(args.env, args.random_ratio, args.sequential, args.reward_type, 44 | args.n_object, args.curriculum) 45 | def make_thunk(rank): 46 | return lambda: make_env(env_id=args.env, rank=rank, log_dir=log_dir, flatten_dict=True, kwargs=env_kwargs) 47 | env = SubprocVecEnv([make_thunk(i) for i in range(n_cpu)]) 48 | 49 | eval_env_kwargs = env_kwargs.copy() 50 | eval_env_kwargs['random_ratio'] = 0.0 51 | if "use_cu" in eval_env_kwargs: 52 | eval_env_kwargs['use_cu'] = False 53 | eval_env = make_env(env_id=args.env, rank=0, flatten_dict=True, kwargs=eval_env_kwargs) 54 | print(eval_env) 55 | 
if not args.play: 56 | os.makedirs(log_dir, exist_ok=True) 57 | train_kwargs = get_train_kwargs("ppo", args, parsed_action_noise=None, eval_env=eval_env) 58 | 59 | # policy = 'MlpPolicy' 60 | from utils.attention_policy import AttentionPolicy 61 | register_policy('AttentionPolicy', AttentionPolicy) 62 | policy_kwargs = get_policy_kwargs("ppo", args) 63 | print(policy_kwargs) 64 | 65 | model = PPO2(args.policy, env, verbose=1, nminibatches=32, lam=0.95, noptepochs=10, 66 | ent_coef=0.01, learning_rate=3e-4, cliprange=0.2, policy_kwargs=policy_kwargs, **train_kwargs) 67 | print(model.get_parameter_list()) 68 | 69 | def callback(_locals, _globals): 70 | num_update = _locals["update"] 71 | if 'FetchStack' in args.env: 72 | mean_eval_reward = stack_eval_model(eval_env, _locals["self"]) 73 | else: 74 | mean_eval_reward = eval_model(eval_env, _locals["self"]) 75 | log_eval(num_update, mean_eval_reward) 76 | if num_update % 10 == 0: 77 | model_path = os.path.join(log_dir, 'model_' + str(num_update // 10)) 78 | model.save(model_path) 79 | print('model saved to', model_path) 80 | return True 81 | 82 | model.learn(total_timesteps=int(args.num_timesteps), callback=callback, seed=args.seed, log_interval=1) 83 | model.save(os.path.join(log_dir, 'final')) 84 | 85 | else: 86 | assert args.load_path is not None 87 | model = PPO2.load(args.load_path) 88 | fig, ax = plt.subplots(1, 1, figsize=(8, 8)) 89 | obs = env.reset() 90 | goal_dim = env.get_attr('goal')[0].shape[0] 91 | if 'FetchStack' in args.env: 92 | while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \ 93 | env.get_attr('task_mode')[0] != 1: 94 | obs = env.reset() 95 | elif 'FetchPush' in args.env: 96 | while not (1.25 < obs[0][6] < 1.33 and obs[0][7] < 0.61 and 0.7 < obs[0][4] < 0.8): 97 | obs = env.reset() 98 | env.env_method('set_goal', np.array([1.2, 0.75, 0.425, 1, 0])) 99 | obs = env.env_method('get_obs') 100 | obs[0] = np.concatenate([obs[0][key] for key in ['observation', 'achieved_goal', 'desired_goal']]) 101 | else: 102 | while np.argmax(obs[0][-goal_dim+3:]) != 0: 103 | obs = env.reset() 104 | print('achieved_goal', obs[0][-2*goal_dim: -goal_dim], 'goal', obs[0][-goal_dim:]) 105 | episode_reward = 0.0 106 | num_episode = 0 107 | frame_idx = 0 108 | images = [] 109 | if 'max_episode_steps' not in env_kwargs.keys(): 110 | env_kwargs['max_episode_steps'] = 100 111 | for i in range(env_kwargs['max_episode_steps'] * 10): 112 | img = env.render(mode='rgb_array') 113 | ax.cla() 114 | ax.imshow(img) 115 | if env.get_attr('goal')[0].shape[0] <= 3: 116 | ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx)) 117 | else: 118 | ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx) + 119 | ', goal idx ' + str(np.argmax(env.get_attr('goal')[0][3:]))) 120 | if 'FetchStack' in args.env: 121 | tasks = ['pick and place', 'stack'] 122 | ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx) 123 | + ', task: ' + tasks[np.argmax(obs[0][-2*goal_dim-2:-2*goal_dim])]) 124 | images.append(img) 125 | action, _ = model.predict(obs) 126 | obs, reward, done, _ = env.step(action) 127 | episode_reward += reward 128 | frame_idx += 1 129 | if not args.export_video: 130 | plt.pause(0.1) 131 | else: 132 | plt.imsave(os.path.join(os.path.dirname(args.load_path), 'tempimg%d.png' % i), img) 133 | if done: 134 | print('episode_reward', episode_reward) 135 | if 'FetchStack' in args.env: 136 | while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \ 137 | 
env.get_attr('task_mode')[0] != 1: 138 | obs = env.reset() 139 | else: 140 | while np.argmax(obs[0][-goal_dim + 3:]) != 0: 141 | obs = env.reset() 142 | print('goal', obs[0][-goal_dim:]) 143 | episode_reward = 0.0 144 | frame_idx = 0 145 | num_episode += 1 146 | if num_episode >= 10: 147 | break 148 | if args.export_video: 149 | os.system('ffmpeg -r 5 -start_number 0 -i ' + os.path.dirname(args.load_path) + 150 | '/tempimg%d.png -c:v libx264 -pix_fmt yuv420p ' + 151 | os.path.join(os.path.dirname(args.load_path), args.env + '.mp4')) 152 | for i in range(env_kwargs['max_episode_steps'] * 10): 153 | try: 154 | os.remove(os.path.join(os.path.dirname(args.load_path), 'tempimg' + str(i) + '.png')) 155 | except: 156 | pass 157 | 158 | 159 | if __name__ == '__main__': 160 | args = arg_parse() 161 | main(args) 162 | -------------------------------------------------------------------------------- /baselines/her/her.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | from stable_baselines.common import BaseRLModel 4 | from stable_baselines.common import OffPolicyRLModel 5 | from stable_baselines.common.base_class import _UnvecWrapper 6 | from stable_baselines.common.vec_env import VecEnvWrapper 7 | from .replay_buffer import HindsightExperienceReplayWrapper, KEY_TO_GOAL_STRATEGY 8 | from .utils import HERGoalEnvWrapper 9 | 10 | 11 | class HER2(BaseRLModel): 12 | """ 13 | Hindsight Experience Replay (HER) https://arxiv.org/abs/1707.01495 14 | :param policy: (BasePolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...) 15 | :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) 16 | :param model_class: (OffPolicyRLModel) The off policy RL model to apply Hindsight Experience Replay 17 | currently supported: DQN, DDPG, SAC 18 | :param n_sampled_goal: (int) 19 | :param goal_selection_strategy: (GoalSelectionStrategy or str) 20 | """ 21 | 22 | def __init__(self, policy, env, model_class, n_sampled_goal=4, 23 | goal_selection_strategy='future', num_workers=1, *args, **kwargs): 24 | 25 | assert not isinstance(env, VecEnvWrapper), "HER does not support VecEnvWrapper" 26 | 27 | super().__init__(policy=policy, env=env, verbose=kwargs.get('verbose', 0), 28 | # policy_base=None, requires_vec_env=(num_workers > 1)) 29 | policy_base=None, requires_vec_env=kwargs.get('requires_vec_env', True)) 30 | 31 | self.model_class = model_class 32 | self.replay_wrapper = None 33 | self.n_workers = num_workers 34 | # Save dict observation space (used for checks at loading time) 35 | if env is not None: 36 | self.observation_space = env.observation_space 37 | self.action_space = env.action_space 38 | 39 | # Convert string to GoalSelectionStrategy object 40 | if isinstance(goal_selection_strategy, str): 41 | assert goal_selection_strategy in KEY_TO_GOAL_STRATEGY.keys(), "Unknown goal selection strategy" 42 | goal_selection_strategy = KEY_TO_GOAL_STRATEGY[goal_selection_strategy] 43 | 44 | self.n_sampled_goal = n_sampled_goal 45 | self.goal_selection_strategy = goal_selection_strategy 46 | 47 | if self.env is not None: 48 | self._create_replay_wrapper(self.env) 49 | 50 | assert issubclass(model_class, OffPolicyRLModel), \ 51 | "Error: HER only works with Off policy model (such as DDPG, SAC, TD3 and DQN)." 
52 | 53 | self.model = self.model_class(policy, self.env, *args, **kwargs) 54 | # Patch to support saving/loading 55 | self.model._save_to_file = self._save_to_file 56 | 57 | def _create_replay_wrapper(self, env): 58 | """ 59 | Wrap the environment in a HERGoalEnvWrapper 60 | if needed and create the replay buffer wrapper. 61 | """ 62 | if not isinstance(env, HERGoalEnvWrapper): 63 | env = HERGoalEnvWrapper(env) 64 | 65 | self.env = env 66 | # NOTE: we cannot do that check directly with VecEnv 67 | # maybe we can try calling `compute_reward()` ? 68 | # assert isinstance(self.env, gym.GoalEnv), "HER only supports gym.GoalEnv" 69 | 70 | # if self.n_workers > 1: 71 | # replay_wrapper = HindsightExperienceReplayWrapper 72 | # else: 73 | # replay_wrapper = SingleHindsightExperienceReplayWrapper 74 | replay_wrapper = HindsightExperienceReplayWrapper 75 | self.replay_wrapper = functools.partial(replay_wrapper, 76 | n_sampled_goal=self.n_sampled_goal, 77 | goal_selection_strategy=self.goal_selection_strategy, 78 | wrapped_env=self.env) 79 | 80 | def set_env(self, env): 81 | assert not isinstance(env, VecEnvWrapper), "HER does not support VecEnvWrapper" 82 | super().set_env(env) 83 | self._create_replay_wrapper(self.env) 84 | self.model.set_env(self.env) 85 | 86 | def get_env(self): 87 | return self.env 88 | 89 | def get_parameter_list(self): 90 | return self.model.get_parameter_list() 91 | 92 | def __getattr__(self, attr): 93 | """ 94 | Wrap the RL model. 95 | :param attr: (str) 96 | :return: (Any) 97 | """ 98 | if attr in self.__dict__: 99 | return getattr(self, attr) 100 | return getattr(self.model, attr) 101 | 102 | def __set_attr__(self, attr, value): 103 | if attr in self.__dict__: 104 | setattr(self, attr, value) 105 | else: 106 | setattr(self.model, attr, value) 107 | 108 | def _get_pretrain_placeholders(self): 109 | return self.model._get_pretrain_placeholders() 110 | 111 | def setup_model(self): 112 | pass 113 | 114 | def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="HER", 115 | reset_num_timesteps=True): 116 | return self.model.learn(total_timesteps, callback=callback, seed=seed, log_interval=log_interval, 117 | tb_log_name=tb_log_name, reset_num_timesteps=reset_num_timesteps, 118 | replay_wrapper=self.replay_wrapper) 119 | 120 | def _check_obs(self, observation): 121 | if isinstance(observation, dict): 122 | if self.env is not None: 123 | if len(observation['observation'].shape) > 1: 124 | observation = _UnvecWrapper.unvec_obs(observation) 125 | return [self.env.convert_dict_to_obs(observation)] 126 | return self.env.convert_dict_to_obs(observation) 127 | else: 128 | raise ValueError("You must either pass an env to HER or wrap your env using HERGoalEnvWrapper") 129 | return observation 130 | 131 | def predict(self, observation, state=None, mask=None, deterministic=True): 132 | return self.model.predict(self._check_obs(observation), state, mask, deterministic) 133 | 134 | def action_probability(self, observation, state=None, mask=None, actions=None, logp=False): 135 | return self.model.action_probability(self._check_obs(observation), state, mask, actions, logp) 136 | 137 | def _save_to_file(self, save_path, data=None, params=None, cloudpickle=False): 138 | # HACK to save the replay wrapper 139 | # or better to save only the replay strategy and its params? 
140 | # it will not work with VecEnv 141 | data['n_sampled_goal'] = self.n_sampled_goal 142 | data['goal_selection_strategy'] = self.goal_selection_strategy 143 | data['model_class'] = self.model_class 144 | data['her_obs_space'] = self.observation_space 145 | data['her_action_space'] = self.action_space 146 | super()._save_to_file(save_path, data, params, cloudpickle=cloudpickle) 147 | 148 | def save(self, save_path, cloudpickle=False): 149 | self.model.save(save_path, cloudpickle=cloudpickle) 150 | 151 | @classmethod 152 | def load(cls, load_path, env=None, custom_objects=None, **kwargs): 153 | data, _ = cls._load_from_file(load_path, custom_objects=custom_objects) 154 | 155 | if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data['policy_kwargs']: 156 | raise ValueError("The specified policy kwargs do not equal the stored policy kwargs. " 157 | "Stored kwargs: {}, specified kwargs: {}".format(data['policy_kwargs'], 158 | kwargs['policy_kwargs'])) 159 | 160 | model = cls(policy=data["policy"], env=env, model_class=data['model_class'], 161 | n_sampled_goal=data['n_sampled_goal'], 162 | goal_selection_strategy=data['goal_selection_strategy'], 163 | _init_setup_model=False) 164 | model.__dict__['observation_space'] = data['her_obs_space'] 165 | model.__dict__['action_space'] = data['her_action_space'] 166 | model.model = data['model_class'].load(load_path, model.get_env(), **kwargs) 167 | model.model._save_to_file = model._save_to_file 168 | return model -------------------------------------------------------------------------------- /run_her_augment.py: -------------------------------------------------------------------------------- 1 | from baselines import HER2, SAC_SIR 2 | from stable_baselines.sac.policies import FeedForwardPolicy as SACPolicy 3 | from stable_baselines.common.policies import register_policy 4 | from utils.parallel_subproc_vec_env import ParallelSubprocVecEnv 5 | from gym.wrappers import FlattenDictWrapper 6 | from stable_baselines.common import set_global_seeds 7 | from stable_baselines import logger 8 | from utils.make_env_utils import make_env, get_env_kwargs, get_train_kwargs, get_policy_kwargs 9 | import os, time 10 | import argparse 11 | import numpy as np 12 | from utils.log_utils import eval_model, log_eval, stack_eval_model, egonav_eval_model 13 | 14 | try: 15 | from mpi4py import MPI 16 | except ImportError: 17 | MPI = None 18 | 19 | 20 | def arg_parse(): 21 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 22 | parser.add_argument('--env', default='FetchPushWallObstacle-v4') 23 | parser.add_argument('--seed', type=int, default=42) 24 | parser.add_argument('--policy', type=str, default='CustomSACPolicy') 25 | parser.add_argument('--num_workers', type=int, default=32) 26 | parser.add_argument('--learning_rate', type=float, default=3e-4) 27 | parser.add_argument('--action_noise', type=str, default='none') 28 | parser.add_argument('--num_timesteps', type=float, default=3e6) 29 | parser.add_argument('--log_path', default=None, type=str) 30 | parser.add_argument('--load_path', default=None, type=str) 31 | parser.add_argument('--play', action="store_true", default=False) 32 | parser.add_argument('--batch_size', type=int, default=64) 33 | parser.add_argument('--random_ratio', type=float, default=1.0) 34 | parser.add_argument('--gamma', type=float, default=0.95) 35 | parser.add_argument('--reward_type', type=str, default='sparse') 36 | parser.add_argument('--n_object', type=int, default=2) 37 | 
parser.add_argument('--start_augment', type=float, default=0) 38 | parser.add_argument('--priority', action="store_true", default=False) 39 | parser.add_argument('--curriculum', action="store_true", default=False) 40 | parser.add_argument('--imitation_coef', type=float, default=5) 41 | parser.add_argument('--sequential', action="store_true", default=False) 42 | parser.add_argument('--export_gif', action="store_true", default=False) 43 | args = parser.parse_args() 44 | return args 45 | 46 | 47 | def configure_logger(log_path, **kwargs): 48 | if log_path is not None: 49 | logger.configure(log_path) 50 | else: 51 | logger.configure(**kwargs) 52 | 53 | 54 | def main(args): 55 | log_dir = args.log_path if (args.log_path is not None) else "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S') 56 | if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: 57 | rank = 0 58 | configure_logger(log_dir) 59 | else: 60 | rank = MPI.COMM_WORLD.Get_rank() 61 | configure_logger(log_dir, format_strs=[]) 62 | 63 | set_global_seeds(args.seed) 64 | 65 | model_class = SAC_SIR # works also with SAC, DDPG and TD3 66 | 67 | env_kwargs = get_env_kwargs(args.env, random_ratio=args.random_ratio, sequential=args.sequential, 68 | reward_type=args.reward_type, n_object=args.n_object) 69 | 70 | def make_thunk(rank): 71 | return lambda: make_env(env_id=args.env, rank=rank, log_dir=log_dir, kwargs=env_kwargs) 72 | 73 | env = ParallelSubprocVecEnv([make_thunk(i) for i in range(args.num_workers)], reset_when_done=True) 74 | 75 | def make_thunk_aug(rank): 76 | return lambda: FlattenDictWrapper(make_env(env_id=aug_env_name, rank=rank, kwargs=aug_env_kwargs), 77 | ['observation', 'achieved_goal', 'desired_goal']) 78 | 79 | aug_env_kwargs = env_kwargs.copy() 80 | del aug_env_kwargs['max_episode_steps'] 81 | aug_env_name = args.env.split('-')[0] + 'Unlimit-' + args.env.split('-')[1] 82 | aug_env = ParallelSubprocVecEnv([make_thunk_aug(i) for i in range(args.num_workers)], reset_when_done=False) 83 | 84 | if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')): 85 | os.remove(os.path.join(logger.get_dir(), 'eval.csv')) 86 | print('Remove existing eval.csv') 87 | eval_env_kwargs = env_kwargs.copy() 88 | eval_env_kwargs['random_ratio'] = 0.0 89 | eval_env = make_env(env_id=args.env, rank=0, kwargs=eval_env_kwargs) 90 | eval_env = FlattenDictWrapper(eval_env, ['observation', 'achieved_goal', 'desired_goal']) 91 | 92 | if not args.play: 93 | os.makedirs(log_dir, exist_ok=True) 94 | 95 | # Available strategies (cf paper): future, final, episode, random 96 | goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE 97 | 98 | if not args.play: 99 | from stable_baselines.ddpg.noise import NormalActionNoise 100 | noise_type = args.action_noise.split('_')[0] 101 | if noise_type == 'none': 102 | parsed_action_noise = None 103 | elif noise_type == 'normal': 104 | sigma = float(args.action_noise.split('_')[1]) 105 | parsed_action_noise = NormalActionNoise(mean=np.zeros(env.action_space.shape), 106 | sigma=sigma * np.ones(env.action_space.shape)) 107 | else: 108 | raise NotImplementedError 109 | 110 | train_kwargs = get_train_kwargs("sac_sir", args, parsed_action_noise, eval_env, aug_env) 111 | 112 | def callback(_locals, _globals): 113 | if _locals['step'] % int(1e3) == 0: 114 | if 'FetchStack' in args.env: 115 | mean_eval_reward = stack_eval_model(eval_env, _locals["self"], 116 | init_on_table=(args.env=='FetchStack-v2')) 117 | elif 'MasspointPushDoubleObstacle-v2' in args.env: 118 | mean_eval_reward = 
egonav_eval_model(eval_env, _locals["self"], env_kwargs["random_ratio"], fixed_goal=np.array([4., 4., 0.15, 0., 0., 0., 1.])) 119 | mean_eval_reward2 = egonav_eval_model(eval_env, _locals["self"], env_kwargs["random_ratio"], 120 | goal_idx=0, fixed_goal=np.array([4., 4., 0.15, 1., 0., 0., 0.])) 121 | log_eval(_locals['self'].num_timesteps, mean_eval_reward2, file_name="eval_box.csv") 122 | else: 123 | mean_eval_reward = eval_model(eval_env, _locals["self"]) 124 | log_eval(_locals['self'].num_timesteps, mean_eval_reward) 125 | if _locals['step'] % int(2e4) == 0: 126 | model_path = os.path.join(log_dir, 'model_' + str(_locals['step'] // int(2e4))) 127 | model.save(model_path) 128 | print('model saved to', model_path) 129 | return True 130 | 131 | class CustomSACPolicy(SACPolicy): 132 | def __init__(self, *model_args, **model_kwargs): 133 | super(CustomSACPolicy, self).__init__(*model_args, **model_kwargs, 134 | layers=[256, 256] if 'MasspointPushDoubleObstacle' in args.env else [256, 256, 256, 256], 135 | feature_extraction="mlp") 136 | register_policy('CustomSACPolicy', CustomSACPolicy) 137 | from utils.sac_attention_policy import AttentionPolicy 138 | register_policy('AttentionPolicy', AttentionPolicy) 139 | policy_kwargs = get_policy_kwargs("sac_sir", args) 140 | 141 | if rank == 0: 142 | print('train_kwargs', train_kwargs) 143 | print('policy_kwargs', policy_kwargs) 144 | # Wrap the model 145 | model = HER2(args.policy, env, model_class, n_sampled_goal=4, 146 | start_augment_time=args.start_augment, 147 | goal_selection_strategy=goal_selection_strategy, 148 | num_workers=args.num_workers, 149 | policy_kwargs=policy_kwargs, 150 | verbose=1, 151 | **train_kwargs) 152 | print(model.get_parameter_list()) 153 | 154 | # Train the model 155 | model.learn(int(args.num_timesteps), seed=args.seed, callback=callback, log_interval=100 if not ('MasspointMaze-v3' in args.env) else 10) 156 | 157 | if rank == 0: 158 | model.save(os.path.join(log_dir, 'final')) 159 | 160 | 161 | if __name__ == '__main__': 162 | args = arg_parse() 163 | main(args) 164 | --------------------------------------------------------------------------------
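
Note: the snippet below is not part of the repository; it is a minimal, self-contained sketch of the log-processing pattern that plot/plot_experiment.py, plot/plot_sac_experiment.py and plot/plot_sac_experiment_maze.py all repeat — read one metric column from progress.csv with a small get_item helper, resample it onto a fixed timestep grid with scipy's interp1d (fill_value="extrapolate") so runs with different logging frequencies line up across seeds, then apply the same moving-average smooth. The log path, column names, grid length and smoothing window are illustrative assumptions taken from those scripts' defaults, and error_bad_lines=True is omitted here because it is deprecated in recent pandas releases.

# usage_sketch.py -- hedged illustration only, not repository code.
import numpy as np
import pandas
from scipy import interpolate


def get_item(log_file, label):
    # Same idea as the helper in the plotting scripts: pull one column from a CSV log.
    data = pandas.read_csv(log_file, index_col=None, comment='#')
    return data[label].values


def smooth(array, window):
    # Same moving-average smoothing as in the plotting scripts.
    out = np.zeros(array.shape[0] - window)
    for i in range(out.shape[0]):
        out[i] = np.mean(array[i:i + window])
    return out


if __name__ == '__main__':
    # Hypothetical run directory; the plotting scripts look for
    # <experiment_folder>/<algo>/<seed>/progress.csv (and eval.csv), e.g. logs/push/sir/0/.
    progress_file = 'logs/push/sir/0/progress.csv'
    raw_sr = get_item(progress_file, 'ep_reward_mean')
    raw_steps = get_item(progress_file, 'total_timesteps')
    # Resample onto a common grid so different runs can be averaged pointwise.
    sr_fn = interpolate.interp1d(raw_steps, raw_sr, fill_value="extrapolate")
    grid = np.arange(0, 3e7, 3e7 // 500)   # 500 points, as in plot_experiment.py
    print(smooth(sr_fn(grid), 20))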
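Usage note (hedged, inferred from the sources above rather than from documentation): visualize_sac_value prints its own usage string, `python -m plot.visualize_sac_value [load_path]`; the other plotting scripts take the experiment folder and an environment tag from sys.argv (plot_experiment.py accepts push/particle/maze/stacking, plot_sac_experiment.py accepts push/stack2/stack3/particle, plot_sac_experiment_maze.py accepts umaze/maze_ego/maze_box) and expect the <folder>/<algo>/<seed>/progress.csv layout shown in the sketch. run_ppo.py and run_her_augment.py are configured entirely through argparse flags such as --env, --log_path, --num_timesteps, --load_path and --play; exact flag combinations beyond the defaults visible in arg_parse() are not reproduced here.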