├── __init__.py
├── assets
│   ├── stls
│   │   ├── .get
│   │   └── fetch
│   │       ├── estop_link.stl
│   │       ├── laser_link.stl
│   │       ├── gripper_link.stl
│   │       ├── torso_fixed_link.stl
│   │       ├── base_link_collision.stl
│   │       ├── bellows_link_collision.stl
│   │       ├── head_pan_link_collision.stl
│   │       ├── l_wheel_link_collision.stl
│   │       ├── r_wheel_link_collision.stl
│   │       ├── elbow_flex_link_collision.stl
│   │       ├── head_tilt_link_collision.stl
│   │       ├── torso_lift_link_collision.stl
│   │       ├── wrist_flex_link_collision.stl
│   │       ├── wrist_roll_link_collision.stl
│   │       ├── forearm_roll_link_collision.stl
│   │       ├── shoulder_lift_link_collision.stl
│   │       ├── shoulder_pan_link_collision.stl
│   │       └── upperarm_roll_link_collision.stl
│   ├── textures
│   │   ├── block.png
│   │   └── block_hidden.png
│   ├── fetch
│   │   ├── reach.xml
│   │   ├── push.xml
│   │   ├── slide.xml
│   │   ├── pick_and_place.xml
│   │   ├── push_obstacle.xml
│   │   ├── push_wall.xml
│   │   ├── push_wall_obstacle.xml
│   │   ├── push_wall_heavy_obstacle_v2.xml
│   │   ├── push_wall_heavy_obstacle.xml
│   │   ├── push_wall_heavy_obstacle_v5.xml
│   │   ├── generate_xml.py
│   │   ├── pick_and_place_stack3.xml
│   │   ├── pick_and_place_stack.xml
│   │   ├── push_wall_heavy_double_obstacle.xml
│   │   ├── pick_and_place_box.xml
│   │   ├── open_close_box.xml
│   │   └── shared.xml
│   └── masspoint
│       ├── maze.xml
│       ├── smaze.xml
│       ├── single_obstacle.xml
│       ├── single_obstacle2.xml
│       ├── double_obstacle.xml
│       ├── emaze_easy.xml
│       └── generate_xml.py
├── plot
│   ├── __init__.py
│   ├── plot_compare_hiro.py
│   ├── debug_plot_value.py
│   ├── plot_experiment_visual.py
│   ├── plot_reuse.py
│   ├── plot_compare.py
│   ├── plot_compare_sac.py
│   ├── plot_walltime.py
│   ├── plot_success_traj.py
│   ├── plot_experiment_success_len.py
│   ├── plot_cl_experiment.py
│   ├── plot_experiment_len.py
│   ├── plot_experiment.py
│   ├── plot_sac_experiment_maze.py
│   ├── visualize_sac_value.py
│   └── plot_sac_experiment.py
├── utils
│   ├── __init__.py
│   ├── eval_stack.py
│   ├── wrapper.py
│   ├── log_utils.py
│   └── replay_buffer.py
├── baselines
│   ├── her
│   │   ├── __init__.py
│   │   ├── utils.py
│   │   └── her.py
│   ├── ppo
│   │   └── __init__.py
│   ├── sac_sir
│   │   └── __init__.py
│   ├── ppo_sir
│   │   └── __init__.py
│   ├── sac_parallel
│   │   └── __init__.py
│   └── __init__.py
├── .gitignore
├── exp_masspoint.sh
├── README.md
├── environment.yml
├── analyse_reduction_trace.py
├── exp_masspoint_3room.sh
├── exp_fetchstack.sh
├── success_len_calculation.py
├── exp_push.sh
├── run_ppo_augment.py
├── run_ppo.py
└── run_her_augment.py
/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/assets/stls/.get:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/plot/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/her/__init__.py:
--------------------------------------------------------------------------------
1 | from baselines.her.her import HER2
--------------------------------------------------------------------------------
/baselines/ppo/__init__.py:
--------------------------------------------------------------------------------
1 | from baselines.ppo.ppo import PPO2
--------------------------------------------------------------------------------
/baselines/sac_sir/__init__.py:
--------------------------------------------------------------------------------
1 | from baselines.sac_sir.sac_sir import SAC_SIR
--------------------------------------------------------------------------------
/baselines/ppo_sir/__init__.py:
--------------------------------------------------------------------------------
1 | from baselines.ppo_sir.ppo_sir import PPO2_SIR
2 |
--------------------------------------------------------------------------------
/baselines/sac_parallel/__init__.py:
--------------------------------------------------------------------------------
1 | from baselines.sac_parallel.sac_parallel import SAC_parallel
2 |
--------------------------------------------------------------------------------
/assets/textures/block.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/textures/block.png
--------------------------------------------------------------------------------
/assets/stls/fetch/estop_link.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/estop_link.stl
--------------------------------------------------------------------------------
/assets/stls/fetch/laser_link.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/laser_link.stl
--------------------------------------------------------------------------------
/assets/textures/block_hidden.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/textures/block_hidden.png
--------------------------------------------------------------------------------
/assets/stls/fetch/gripper_link.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/gripper_link.stl
--------------------------------------------------------------------------------
/assets/stls/fetch/torso_fixed_link.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/torso_fixed_link.stl
--------------------------------------------------------------------------------
/assets/stls/fetch/base_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/base_link_collision.stl
--------------------------------------------------------------------------------
/assets/stls/fetch/bellows_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/bellows_link_collision.stl
--------------------------------------------------------------------------------
/assets/stls/fetch/head_pan_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/head_pan_link_collision.stl
--------------------------------------------------------------------------------
/assets/stls/fetch/l_wheel_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/l_wheel_link_collision.stl
--------------------------------------------------------------------------------
/assets/stls/fetch/r_wheel_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/r_wheel_link_collision.stl
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .vscode/*
3 | .idea/*
4 | __pycache__/*
5 | logs/*
6 | model/*
7 | *.png
8 | *.gif
9 | MUJOCO_LOG.TXT
10 | *.npy
11 | *.zip
12 |
13 |
--------------------------------------------------------------------------------
/assets/stls/fetch/elbow_flex_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/elbow_flex_link_collision.stl
--------------------------------------------------------------------------------
/assets/stls/fetch/head_tilt_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/head_tilt_link_collision.stl
--------------------------------------------------------------------------------
/assets/stls/fetch/torso_lift_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/torso_lift_link_collision.stl
--------------------------------------------------------------------------------
/assets/stls/fetch/wrist_flex_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/wrist_flex_link_collision.stl
--------------------------------------------------------------------------------
/assets/stls/fetch/wrist_roll_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/wrist_roll_link_collision.stl
--------------------------------------------------------------------------------
/assets/stls/fetch/forearm_roll_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/forearm_roll_link_collision.stl
--------------------------------------------------------------------------------
/assets/stls/fetch/shoulder_lift_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/shoulder_lift_link_collision.stl
--------------------------------------------------------------------------------
/assets/stls/fetch/shoulder_pan_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/shoulder_pan_link_collision.stl
--------------------------------------------------------------------------------
/assets/stls/fetch/upperarm_roll_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IrisLi17/self-imitation-via-reduction/HEAD/assets/stls/fetch/upperarm_roll_link_collision.stl
--------------------------------------------------------------------------------
/baselines/__init__.py:
--------------------------------------------------------------------------------
1 | from baselines.ppo_sir import PPO2_SIR
2 | from baselines.ppo import PPO2
3 | from baselines.her import HER2
4 | from baselines.sac_sir import SAC_SIR
5 | from baselines.sac_parallel import SAC_parallel
6 |
--------------------------------------------------------------------------------
/exp_masspoint.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env MasspointPushMultiObstacle-v1 --policy AttentionPolicy --n_object 4 --num_timesteps 3e8 --random_ratio 0.25 --log_path logs/MasspointPushMultiObstacle-v1_random0.25/ppo_attention/0
2 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env MasspointPushMultiObstacle-v1 --policy AttentionPolicy --n_object 4 --num_timesteps 3e8 --random_ratio 0.25 --n_subgoal 2 --parallel --aug_clip 0.0 --reuse_times 4 --start_augment 5e7 --log_path logs/MasspointPushMultiObstacle-v1_random0.25/ppo_attention_sir/0
3 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env MasspointPushMultiObstacle-v1 --policy AttentionPolicy --n_object 4 --num_timesteps 3e8 --random_ratio 0.25 --parallel --aug_clip 0.0 --reuse_times 1 --self_imitate --sil_clip 0.15 --log_path logs/MasspointPushMultiObstacle-v1_random0.25/ppo_attention_sil/0
4 |
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Self-Imitation via Reduction
2 | Paper: [Solving Compositional Reinforcement Learning Problems via Task Reduction](https://openreview.net/forum?id=9SS69KwomAM)
3 |
4 | Project website: https://sites.google.com/view/sir-compositional/.
5 |
6 | ### Get Started
7 | Prerequisites:
8 |
9 | * Ubuntu 16.04
10 | * CUDA 10.0
11 | * [MuJoCo](http://www.mujoco.org/) version 2.0. You can obtain a license and download the binaries from its website.
12 | * [Conda](https://docs.conda.io/en/latest/miniconda.html)
13 |
14 | Install:
15 |
16 | Run ``conda env create -f environment.yml``. You may refer to [Troubleshooting](https://github.com/openai/mujoco-py/blob/master/README.md#troubleshooting) if you have problems installing ``mujoco-py``.
17 |
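A minimal sketch of the full install (the environment name ``py36`` is taken from ``environment.yml``):

```bash
conda env create -f environment.yml
conda activate py36
```
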
18 | ### How to Run
19 | The scripts ``exp_push.sh``, ``exp_fetchstack.sh``, and ``exp_masspoint.sh`` contain the commands for running the different algorithms in the *Push*, *Stack*, and *Maze* scenarios, respectively. ``exp_masspoint_3room.sh`` covers the three-room *Maze* variant.
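
For example, the following command from ``exp_push.sh`` trains SIR (SAC + HER based) on the *Push* scenario with 30% hard cases:

```bash
CUDA_VISIBLE_DEVICES=0 python run_her_augment.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --num_timesteps 1e6 --num_workers 32 --start_augment 0 --imitation_coef 0.1 --priority --log_path logs/FetchPushWallObstacle-v4_random0.7/her_sac_sir_32workers/0
```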
20 |
21 |
--------------------------------------------------------------------------------
/assets/fetch/reach.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/assets/fetch/push.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/assets/fetch/slide.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: py36
2 | channels:
3 | - defaults
4 | dependencies:
5 | - _libgcc_mutex=0.1=main
6 | - ca-certificates=2019.8.28=0
7 | - certifi=2019.9.11=py36_0
8 | - libedit=3.1.20181209=hc058e9b_0
9 | - libffi=3.2.1=hd88cf55_4
10 | - libgcc-ng=9.1.0=hdf63c60_0
11 | - libstdcxx-ng=9.1.0=hdf63c60_0
12 | - ncurses=6.1=he6710b0_1
13 | - openssl=1.1.1d=h7b6447c_2
14 | - pip=19.2.3=py36_0
15 | - python=3.6.9=h265db76_0
16 | - readline=7.0=h7b6447c_5
17 | - sqlite=3.30.0=h7b6447c_0
18 | - tk=8.6.8=hbc83047_0
19 | - wheel=0.33.6=py36_0
20 | - xz=5.2.4=h14c3975_4
21 | - zlib=1.2.11=h7b6447c_3
22 | - pip:
23 | - absl-py==0.8.1
24 | - astor==0.8.0
25 | - atari-py==0.2.6
26 | - cffi==1.13.0
27 | - cloudpickle==1.2.2
28 | - cycler==0.10.0
29 | - cython==0.29.13
30 | - fasteners==0.15
31 | - future==0.18.1
32 | - gast==0.3.2
33 | - glfw==1.8.3
34 | - grpcio==1.24.1
35 | - gym==0.14.0
36 | - imageio==2.6.1
37 | - joblib==0.14.0
38 | - kiwisolver==1.1.0
39 | - markdown==3.1.1
40 | - matplotlib==3.1.1
41 | - monotonic==1.5
42 | - mpi4py==3.0.2
43 | - mujoco-py==2.0.2.7
44 | - numpy==1.16.1
45 | - opencv-python==4.1.1.26
46 | - pandas==0.25.2
47 | - pillow==6.2.0
48 | - protobuf==3.10.0
49 | - pycparser==2.19
50 | - pyglet==1.3.2
51 | - pyparsing==2.4.2
52 | - python-dateutil==2.8.0
53 | - pytz==2019.3
54 | - scipy==1.3.1
55 | - setuptools==39.1.0
56 | - six==1.12.0
57 | - stable-baselines==2.8.0
58 | - tensorboard==1.15.0
59 | - tensorflow-gpu==1.15.0
60 | - termcolor==1.1.0
61 | - werkzeug==0.16.0
62 | prefix: /home/lyf/miniconda3/envs/py36
63 |
64 |
--------------------------------------------------------------------------------
/assets/fetch/pick_and_place.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/assets/fetch/push_obstacle.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/analyse_reduction_trace.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import sys
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 |
6 | if __name__ == '__main__':
7 | trace_file = sys.argv[1]
8 | initial_states = []
9 | with open(trace_file, 'rb') as f:
10 | try:
11 | while True:
12 | initial_states.append(pickle.load(f))
13 | except EOFError:
14 | pass
15 | print('total number of states', len(initial_states))
16 |
17 | def n_doors_blocked(obs):
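# A door counts as "blocked" when an obstacle sits at the door row (y ~= 2.5) within the range of rooms (x / 1.7) spanned by the agent, box and goal positions.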
18 | agent_pos = obs[:3]
19 | box_pos = obs[3: 6]
20 | goal_pos = obs[-7: -4]
21 | obstacles_pos = [obs[6 + 3 * i: 9 + 3 * i] for i in range(3)]
22 | # return sum([abs(pos[1] - 2.5) < 0.1 for pos in obstacles_pos])
23 |
24 | max_x, min_x = max(agent_pos[0], box_pos[0], goal_pos[0]), min(agent_pos[0], box_pos[0], goal_pos[0])
25 | max_n = int(max_x / 1.7)
26 | min_n = int(min_x / 1.7)
27 | count = 0
28 | for pos_obstacle in obstacles_pos:
29 | if abs(pos_obstacle[1] - 2.5) < 1e-3 and min_n < round(pos_obstacle[0] / 1.7) < max_n + 1:
30 | count += 1
31 | return count
32 |
33 | def smooth(arr, window=100):
34 | smoothed = np.zeros_like(arr)
35 | for i in range(arr.shape[0]):
36 | smoothed[i] = np.mean(arr[max(i - window + 1, 0): i + 1])
37 | return smoothed
38 |
39 | blocked_doors = list(map(n_doors_blocked, initial_states))
40 | blocked_doors = np.asarray(blocked_doors)
41 | print(len(blocked_doors))
42 | reduction_masks = [(blocked_doors == i).astype(np.float32) for i in range(4)]
43 | reduction_percents = [smooth(mask, 1000) for mask in reduction_masks]
44 | for i in range(4):
45 | plt.plot(reduction_percents[i], label='%d blocked' % i)
46 | plt.legend()
47 | plt.show()
--------------------------------------------------------------------------------
/utils/eval_stack.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | # Deprecated
5 | def pp_eval_model(eval_env, model):
6 | env = eval_env
7 | env.unwrapped.random_ratio = 1.0
8 | temp = env.unwrapped.task_array.copy()
9 | env.unwrapped.task_array = [(env.n_object, i) for i in range(env.n_object)]
10 | n_episode = 0
11 | ep_rewards = []
12 | ep_successes = []
13 | while n_episode < 50:
14 | ep_reward = 0.0
15 | ep_success = 0.0
16 | obs = env.reset()
17 | while env.current_nobject != env.n_object or env.task_mode != 0:
18 | obs = env.reset()
19 | done = False
20 | while not done:
21 | action, _ = model.predict(obs)
22 | obs, reward, done, info = env.step(action)
23 | ep_reward += reward
24 | ep_success += info['is_success']
25 | ep_rewards.append(ep_reward)
26 | ep_successes.append(ep_success)
27 | n_episode += 1
28 | # return np.mean(ep_rewards)
29 | env.unwrapped.task_array = temp
30 | return np.mean(ep_successes)
31 |
32 |
33 | def eval_model(env, model, max_nobject, random_ratio, init_on_table=False):
34 | # random_ratio 0: stack only, 1: pick and place only
35 | temp = env.unwrapped.task_array.copy()
36 | if init_on_table:
37 | env.unwrapped.task_array = [(max_nobject, i) for i in range(min(2, max_nobject))]
38 | else:
39 | env.unwrapped.task_array = [(max_nobject, i) for i in range(max_nobject)]
40 | env.unwrapped.random_ratio = random_ratio
41 | n_episode = 0
42 | ep_successes = []
43 | while n_episode < 50:
44 | ep_reward = 0.0
45 | ep_success = 0.0
46 | obs = env.reset()
47 | done = False
48 | while not done:
49 | action, _ = model.predict(obs)
50 | obs, reward, done, info = env.step(action)
51 | ep_reward += reward
52 | ep_success += info['is_success']
53 | ep_successes.append(ep_success)
54 | n_episode += 1
55 | env.unwrapped.task_array = temp
56 | return np.mean(ep_successes)
--------------------------------------------------------------------------------
/assets/fetch/push_wall.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
--------------------------------------------------------------------------------
/assets/fetch/push_wall_obstacle.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/exp_masspoint_3room.sh:
--------------------------------------------------------------------------------
1 | # Hard case 30%.
2 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env MasspointPushDoubleObstacle-v1 --policy AttentionPolicy --num_timesteps 3e8 --random_ratio 0.7 --log_path logs/MasspointPushDoubleObstacle-v1_random0.7/ppo_attention/0
3 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env MasspointPushDoubleObstacle-v1 --policy AttentionPolicy --num_timesteps 3e8 --random_ratio 0.7 --reward_type dense --log_path logs/MasspointPushDoubleObstacle-v1_random0.7/ppo_attention_ds/0
4 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env MasspointPushDoubleObstacle-v1 --policy AttentionPolicy --num_timesteps 3e8 --random_ratio 0.7 --parallel --aug_clip 0.0 --reuse_times 1 --self_imitate --sil_clip 0.15 --log_path logs/MasspointPushDoubleObstacle-v1_random0.7/ppo_attention_sil/0
5 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env MasspointPushDoubleObstacle-v1 --policy AttentionPolicy --num_timesteps 3e8 --random_ratio 0.7 --n_subgoal 2 --parallel --aug_clip 0.0 --reuse_times 4 --start_augment 3e7 --log_path logs/MasspointPushDoubleObstacle-v1_random0.7/ppo_attention_sir/0
6 |
7 | # Uniform.
8 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env MasspointPushDoubleObstacle-v1 --policy AttentionPolicy --num_timesteps 3e8 --random_ratio 1.0 --log_path logs/MasspointPushDoubleObstacle-v1_random1.0/ppo_attention/0
9 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env MasspointPushDoubleObstacle-v1 --policy AttentionPolicy --num_timesteps 3e8 --random_ratio 1.0 --reward_type dense --log_path logs/MasspointPushDoubleObstacle-v1_random1.0/ppo_attention_ds/0
10 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env MasspointPushDoubleObstacle-v1 --policy AttentionPolicy --num_timesteps 3e8 --random_ratio 1.0 --parallel --aug_clip 0.0 --reuse_times 1 --self_imitate --sil_clip 0.15 --log_path logs/MasspointPushDoubleObstacle-v1_random1.0/ppo_attention_sil/0
11 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env MasspointPushDoubleObstacle-v1 --policy AttentionPolicy --num_timesteps 3e8 --random_ratio 1.0 --n_subgoal 2 --parallel --aug_clip 0.0 --reuse_times 4 --start_augment 3e7 --log_path logs/MasspointPushDoubleObstacle-v1_random1.0/ppo_attention_sir/0
12 |
--------------------------------------------------------------------------------
/exp_fetchstack.sh:
--------------------------------------------------------------------------------
1 | # 2 boxes
2 | CUDA_VISIBLE_DEVICES=1 python run_her.py --env FetchStack-v1 --num_timesteps 1e6 --curriculum --num_workers 32 --policy AttentionPolicy --reward_type sparse --n_object 2 --priority --log_path logs/FetchStack-v1_adapt/her_sac_32workers/2obj/0
3 | CUDA_VISIBLE_DEVICES=0 python run_her_augment.py --env FetchStack-v1 --num_timesteps 1e6 --curriculum --num_workers 32 --policy AttentionPolicy --reward_type sparse --n_object 2 --imitation_coef 0.1 --priority --log_path logs/FetchStack-v1_adapt/her_sac_sir_32workers/2obj/0
4 | CUDA_VISIBLE_DEVICES=1 python run_her.py --env FetchStack-v1 --num_timesteps 1e6 --curriculum --num_workers 32 --policy AttentionPolicy --reward_type sparse --n_object 2 --sil --sil_coef 0.1 --priority --log_path logs/FetchStack-v1_adapt/her_sac_sil_32workers/2obj/0
5 | CUDA_VISIBLE_DEVICES=1 python run_her.py --env FetchStack-v1 --num_timesteps 1e6 --curriculum --num_workers 32 --policy AttentionPolicy --reward_type dense --n_object 2 --priority --log_path logs/FetchStack-v1_adapt/her_sac_ds_32workers/2obj/0
6 | # 3 boxes
7 | CUDA_VISIBLE_DEVICES=1 python run_her.py --env FetchStack-v1 --num_timesteps 3.5e6 --curriculum --num_workers 32 --policy AttentionPolicy --reward_type sparse --n_object 3 --priority --log_path logs/FetchStack-v1_adapt/her_sac_32workers/3obj/0
8 | CUDA_VISIBLE_DEVICES=0 python run_her_augment.py --env FetchStack-v1 --num_timesteps 3.5e6 --curriculum --num_workers 32 --policy AttentionPolicy --reward_type sparse --n_object 3 --imitation_coef 0.1 --priority --log_path logs/FetchStack-v1_adapt/her_sac_sir_32workers/3obj/0
9 | CUDA_VISIBLE_DEVICES=1 python run_her.py --env FetchStack-v1 --num_timesteps 3.5e6 --curriculum --num_workers 32 --policy AttentionPolicy --reward_type sparse --n_object 3 --sil --sil_coef 0.1 --priority --log_path logs/FetchStack-v1_adapt/her_sac_sil_32workers/3obj/0
10 | CUDA_VISIBLE_DEVICES=1 python run_her.py --env FetchStack-v1 --num_timesteps 3.5e6 --curriculum --num_workers 32 --policy AttentionPolicy --reward_type dense --n_object 3 --priority --log_path logs/FetchStack-v1_adapt/her_sac_ds_32workers/3obj/0
11 |
12 |
13 | # experimental
14 | CUDA_VISIBLE_DEVICES=0 python run_her.py --env FetchStack-v1 --num_timesteps 3.5e6 --curriculum --num_workers 32 --policy RelationalPolicy --reward_type sparse --n_object 3 --priority --log_path logs/FetchStack-v1_adapt/her_sac_32workers/relational/3obj/0
--------------------------------------------------------------------------------
/plot/plot_compare_hiro.py:
--------------------------------------------------------------------------------
1 | import sys, os
2 | import numpy as np
3 | import pandas
4 | import matplotlib.pyplot as plt
5 | from scipy import interpolate
6 |
7 |
8 | if __name__ == '__main__':
9 | option = sys.argv[1]
10 | log_paths = sys.argv[2:]
11 | assert option in ['eval']
12 | window = 20
13 | def get_item(log_file, label):
14 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True)
15 | return data[label].values
16 | def smooth(array, window):
17 | out = np.zeros(array.shape[0] - window)
18 | for i in range(out.shape[0]):
19 | out[i] = np.mean(array[i:i + window])
20 | return out
21 | fig, ax = plt.subplots(1, 1, figsize=(5, 5))
22 | for log_path in log_paths:
23 | progress_file = os.path.join(log_path, 'progress.csv')
24 | eval_file = os.path.join(log_path, 'eval.csv')
25 | if 'hiro' in log_path:
26 | eval_reward = get_item(eval_file, 'Value')
27 | eval_step = get_item(eval_file, 'Step')
28 | elif 'dsc' in log_path:
29 | raw_reward = get_item(eval_file, 'Value')
30 | eval_step = get_item(eval_file, 'Step')
31 | # Mean last 100
32 | eval_reward = np.zeros_like(raw_reward)
33 | for i in range(eval_reward.shape[0]):
34 | eval_reward[i] = np.mean(raw_reward[max(i - 100 + 1, 0): i + 1])
35 | else:
36 | eval_reward = get_item(eval_file, 'mean_eval_reward')
37 | total_timesteps = get_item(progress_file, 'total timesteps')
38 | try:
39 | original_timesteps = get_item(progress_file, 'original_timesteps')
40 | except KeyError:
41 | original_timesteps = total_timesteps
42 | step_expand_fn = interpolate.interp1d(original_timesteps, total_timesteps, fill_value="extrapolate")
43 | eval_step = get_item(eval_file, 'n_updates')
44 | # else:
45 | # eval_step = get_item(progress_file, 'total_timesteps')
46 | # eval_reward = get_item(progress_file, 'ep_reward_mean')
47 | if option == 'eval':
48 | ax.plot(smooth(eval_step, window), smooth(eval_reward, window), label=log_path)
49 | if option == 'eval':
50 | ax.set_title('success rate')
51 | ax.grid()
52 | plt.legend()
53 | plt.show()
54 |
55 |
--------------------------------------------------------------------------------
/assets/fetch/push_wall_heavy_obstacle_v2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/assets/fetch/push_wall_heavy_obstacle.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/assets/fetch/push_wall_heavy_obstacle_v5.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/utils/wrapper.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 |
4 | class DoneOnSuccessWrapper(gym.Wrapper):
5 | """
6 | Ends the episode (done=True) on success and offsets the reward.
7 | Useful for GoalEnv.
8 | """
9 | def __init__(self, env, reward_offset=1.0):
10 | super(DoneOnSuccessWrapper, self).__init__(env)
11 | self.reward_offset = reward_offset
12 |
13 | def step(self, action):
14 | obs, reward, done, info = self.env.step(action)
15 | done = done or info.get('is_success', False)
16 | reward += self.reward_offset
17 | return obs, reward, done, info
18 |
19 | def compute_reward(self, achieved_goal, desired_goal, info):
20 | reward = self.env.compute_reward(achieved_goal, desired_goal, info)
21 | return reward + self.reward_offset
22 |
23 |
24 | class ScaleRewardWrapper(gym.Wrapper):
25 | def __init__(self, env, reward_scale=1.0):
26 | super(ScaleRewardWrapper, self).__init__(env)
27 | self.reward_scale = reward_scale
28 |
29 | def step(self, action):
30 | obs, reward, done, info = self.env.step(action)
31 | reward /= self.reward_scale
32 | return obs, reward, done, info
33 |
34 | def compute_reward(self, achieved_goal, desired_goal, info):
35 | reward = self.env.compute_reward(achieved_goal, desired_goal, info)
36 | return reward / self.reward_scale
37 |
38 |
39 | class FlexibleTimeLimitWrapper(gym.Wrapper):
40 | '''
41 | ONLY applicable to the Stacking environment!
42 | Register the env with max_episode_steps=None (so gym's TimeLimit wrapper is not applied),
43 | then use this class instead to avoid a potential conflict.
44 | '''
45 | def __init__(self, env, time_limit=None):
46 | super(FlexibleTimeLimitWrapper, self).__init__(env)
47 | self.time_limit = time_limit
48 | assert 'FetchStack' in env.spec.id
49 | assert env.spec.max_episode_steps is None
50 | self._elapsed_steps = None
51 |
52 | def step(self, action):
53 | assert self._elapsed_steps is not None, "Cannot call env.step() before calling reset()"
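# Allow 50 steps per object when there are more than two objects in the scene; otherwise cap the episode at 100 steps.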
54 | self.time_limit = self.env.unwrapped.current_nobject * 50 if self.env.unwrapped.current_nobject > 2 else 100
55 | observation, reward, done, info = self.env.step(action)
56 | self._elapsed_steps += 1
57 | if self._elapsed_steps >= self.time_limit:
58 | info['TimeLimit.truncated'] = not done
59 | done = True
60 | return observation, reward, done, info
61 |
62 | def reset(self, **kwargs):
63 | self._elapsed_steps = 0
64 | return self.env.reset(**kwargs)
65 |
--------------------------------------------------------------------------------
/assets/masspoint/maze.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/assets/masspoint/smaze.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/assets/fetch/generate_xml.py:
--------------------------------------------------------------------------------
1 | BASIC_COLORS = ["0.1 0.1 0.5", "0.1 0.8 0.3", "1.0 0.9 0.0", "0.8 0.2 0.8", "1.0 0.0 0.0", "0 0 0"]
2 |
3 | base = '''
4 |
5 |
6 |
9 |
10 |
11 | {assets}
12 |
13 |
14 |
15 |
16 | {target_sites}
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 | {object_bodies}
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 | '''
34 |
35 |
36 | def generate_xml(num_blocks):
37 | colors = BASIC_COLORS[:num_blocks]
38 | site_base = ''
39 | block_base = '''
40 |
41 |
42 |
43 | '''
44 | asset_base = ''
45 |
46 | sites = []
47 | block_bodies = []
48 | assets = []
49 | sites.append(site_base.format(**dict(id=0, color=colors[0])))
50 | for i in range(num_blocks):
51 | block_bodies.append(block_base.format(**dict(id=i, color=colors[i])))
52 | assets.append(asset_base.format(**dict(id=i, color=colors[i])))
53 |
54 | return base.format(
55 | **dict(assets="\n".join(assets), target_sites="\n".join(sites), object_bodies="\n".join(block_bodies)))
--------------------------------------------------------------------------------
/assets/fetch/pick_and_place_stack3.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
/assets/fetch/pick_and_place_stack.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
37 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
/plot/debug_plot_value.py:
--------------------------------------------------------------------------------
1 | import sys, os
2 | import numpy as np
3 | import pandas
4 | import matplotlib.pyplot as plt
5 |
6 |
7 | if __name__ == '__main__':
8 | log_path = sys.argv[1]
9 | window = 10
10 | def get_item(log_file, label):
11 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True)
12 | return data[label].values
13 | def smooth(array, window):
14 | out = np.zeros(array.shape[0] - window)
15 | for i in range(out.shape[0]):
16 | out[i] = np.mean(array[i:i + window])
17 | return out
18 | # print(get_item(log_path, 'reference_value').shape)
19 | # original_value = get_item(log_path, 'reference_value')[:]
20 | value1 = get_item(log_path, 'value1')[0:]
21 | value2 = get_item(log_path, 'value2')[0:]
22 | min_value = np.min(np.concatenate([np.expand_dims(value1, axis=0), np.expand_dims(value2, axis=0)], axis=0), axis=0)
23 | is_success = get_item(log_path, 'is_success')[0:]
24 | num_timesteps = get_item(log_path, 'num_timesteps')[0:]
25 | print(num_timesteps[20000], num_timesteps[40000], num_timesteps[-1])
26 | success_idx = np.where(is_success > 0.5)[0]
27 | fail_idx = np.where(is_success < 0.5)[0]
28 | print(value1.shape)
29 |
30 | fig, ax = plt.subplots(1, 1, figsize=(8, 5))
31 | plt.rcParams.update({'font.size': 22, 'legend.fontsize': 22, 'xtick.labelsize': 18, 'ytick.labelsize': 18, 'axes.labelsize': 18})
32 | # ax.plot(smooth(original_value, 100), alpha=0.5, label='reference')
33 | # ax.scatter(fail_idx, value1[fail_idx]-original_value[fail_idx], c='tab:orange', s=0.1, label='fail value1')
34 | # ax.scatter(fail_idx, value2[fail_idx]-original_value[fail_idx], c='tab:green', s=0.1, label='fail value2')
35 | # ax.scatter(success_idx, value1[success_idx]-original_value[success_idx], c='tab:red', s=0.1, label='success value1')
36 | # ax.scatter(success_idx, value2[success_idx]-original_value[success_idx], c='tab:purple', s=0.1, label='success value2')
37 | # Mean value
38 | ax.scatter(fail_idx, (value1[fail_idx] + value2[fail_idx]) / 2, c='tab:orange', s=1.0, label='fail mean value')
39 | ax.scatter(success_idx, (value1[success_idx] + value2[success_idx]) / 2, c='tab:green', s=4.0, label='success mean value')
40 | # ax.axhline(0.5, linestyle='--', c='tab:blue')
41 | # ax.axhline(1.0, linestyle='--', c='tab:blue')
42 | # ax.plot(smooth(np.arange(len(value1)), 500), smooth((value1 + value2) / 2, 500), c='tab:red', label='smoothed mean value')
43 | # Min value
44 | # ax.scatter(fail_idx, min_value[fail_idx], c='tab:orange', s=1.0, label='fail min value')
45 | # ax.scatter(success_idx, min_value[success_idx], c='tab:green', s=4.0, label='success min value')
46 | # ax.plot(smooth(np.arange(len(min_value)), 500), smooth(min_value, 500), c='tab:red', label='smoothed min value')
47 | # ax.scatter(fail_idx, original_value[fail_idx], c='tab:orange', s=0.1, label='fail original value')
48 | # ax.scatter(success_idx, original_value[success_idx], c='tab:green', s=4.0, label='success original value')
49 | # ax.set_yscale('log')
50 | plt.legend(loc="upper right", bbox_to_anchor=(1.0, 1.0))
51 | plt.tight_layout(pad=0.05)
52 | plt.savefig('value_sigma_sac.png')
53 | plt.show()
54 |
--------------------------------------------------------------------------------
/success_len_calculation.py:
--------------------------------------------------------------------------------
1 | import sys, os
2 | import numpy as np
3 | # from run_her import make_env, get_env_kwargs
4 | from baselines import HER2, PPO2
5 | from gym.wrappers import FlattenDictWrapper
6 |
7 |
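# Roll out each trained checkpoint from a shared set of 500 initial states and report the mean length of episodes that terminate (succeed) within the step limit.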
8 | if __name__ == '__main__':
9 | env_id = sys.argv[1]
10 | algo = sys.argv[2]
11 | assert algo in ['sac', 'ppo']
12 | model_paths = sys.argv[3:]
13 | from utils.make_env_utils import make_env, get_env_kwargs
14 | env_kwargs = get_env_kwargs(env_id, random_ratio=0.0, n_object=3)
15 |
16 | aug_env_id = env_id.split('-')[0] + 'Unlimit-' + env_id.split('-')[1]
17 | aug_env_kwargs = env_kwargs.copy()
18 | aug_env_kwargs['max_episode_steps'] = None
19 |
20 | aug_env = make_env(aug_env_id, rank=0, flatten_dict=True, kwargs=aug_env_kwargs)
21 | # if algo == 'sac':
22 | # aug_env = FlattenDictWrapper(aug_env, ['observation', 'achieved_goal', 'desired_goal'])
23 |
24 | if env_id == 'FetchStack-v1':
25 | # aug_env.set_task_array([(env_kwargs['n_object'], i) for i in range(env_kwargs['n_object'])])
26 | aug_env.set_task_array([(3, 0), (3, 1), (3, 2)])
27 |
28 | goal_dim = aug_env.goal.shape[0]
29 | obs_dim = aug_env.observation_space.shape[0] - 2 * goal_dim
30 | noise_mag = aug_env.size_obstacle[1]
31 | n_object = aug_env.n_object
32 | # model.model.env_id = env_id
33 | # model.model.goal_dim = goal_dim
34 | # model.model.obs_dim = obs_dim
35 | # model.model.noise_mag = noise_mag
36 | # model.model.n_object = n_object
37 |
38 | test_states, test_goals = [], []
39 | test_selected_objects, test_current_nobject = [], []
40 | for i in range(500):
41 | obs = aug_env.reset()
42 | goal = obs[-goal_dim:]
43 | initial_state = aug_env.get_state()
44 | test_states.append(initial_state)
45 | test_goals.append(goal)
46 | if env_id == 'FetchStack-v1':
47 | test_selected_objects.append(aug_env.selected_objects)
48 | test_current_nobject.append(aug_env.current_nobject)
49 | for model_path in model_paths:
50 | if algo == 'sac':
51 | model = HER2.load(model_path)
52 | elif algo == 'ppo':
53 | model = PPO2.load(model_path)
54 | if 'ds' in model_path:
55 | aug_env.unwrapped.reward_type = 'dense'
56 | else:
57 | aug_env.unwrapped.reward_type = 'sparse'
58 | success_len = []
59 | for i in range(len(test_states)):
60 | aug_env.set_state(test_states[i])
61 | aug_env.set_goal(test_goals[i])
62 | if env_id == 'FetchStack-v1':
63 | aug_env.unwrapped.selected_objects = test_selected_objects[i]
64 | aug_env.unwrapped.current_nobject = test_current_nobject[i]
65 | obs = aug_env.get_obs()
66 | obs = np.concatenate([obs[key] for key in ['observation', 'achieved_goal', 'desired_goal']])
67 | done = False
68 | step_so_far = 0
69 | while not done:
70 | action, _ = model.predict(obs, deterministic=True)
71 | obs, reward, done, info = aug_env.step(action)
72 | step_so_far += 1
73 | if step_so_far >= env_kwargs['max_episode_steps']:
74 | break
75 | if done:
76 | success_len.append(step_so_far)
77 | print(model_path, 'mean success len:', np.mean(success_len), 'over %d trajs' % len(success_len))
78 |
--------------------------------------------------------------------------------
/assets/masspoint/single_obstacle.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
60 |
61 |
62 |
--------------------------------------------------------------------------------
/exp_push.sh:
--------------------------------------------------------------------------------
1 | # SAC-based
2 | # Hard case 30%.
3 | # SAC
4 | CUDA_VISIBLE_DEVICES=0 python run_her.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --num_timesteps 1e6 --num_workers 32 --priority --log_path logs/FetchPushWallObstacle-v4_random0.7/her_sac_32workers/0
5 | # SIR
6 | CUDA_VISIBLE_DEVICES=0 python run_her_augment.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --num_timesteps 1e6 --num_workers 32 --start_augment 0 --imitation_coef 0.1 --priority --log_path logs/FetchPushWallObstacle-v4_random0.7/her_sac_sir_32workers/0
7 | # SIL
8 | CUDA_VISIBLE_DEVICES=0 python run_her.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --sil --num_timesteps 1e6 --num_workers 32 --sil_coef 0.1 --priority --log_path logs/FetchPushWallObstacle-v4_random0.7/her_sac_sil_32workers/0
9 | # DS
10 | CUDA_VISIBLE_DEVICES=0 python run_her.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --reward_type dense --num_timesteps 1e6 --num_workers 32 --priority --log_path logs/FetchPushWallObstacle-v4_random0.7/her_sac_ds_32workers/0
11 |
12 | # Uniform.
13 | CUDA_VISIBLE_DEVICES=0 python run_her.py --env FetchPushWallObstacle-v4 --random_ratio 1.0 --num_timesteps 1e6 --num_workers 32 --priority --log_path logs/FetchPushWallObstacle-v4_random1.0/her_sac_32workers/0
14 | CUDA_VISIBLE_DEVICES=0 python run_her_augment.py --env FetchPushWallObstacle-v4 --random_ratio 1.0 --num_timesteps 1e6 --num_workers 32 --start_augment 0 --imitation_coef 0.1 --priority --log_path logs/FetchPushWallObstacle-v4_random1.0/her_sac_sir_32workers/0
15 | CUDA_VISIBLE_DEVICES=0 python run_her.py --env FetchPushWallObstacle-v4 --random_ratio 1.0 --sil --num_timesteps 1e6 --num_workers 32 --sil_coef 0.1 --priority --log_path logs/FetchPushWallObstacle-v4_random1.0/her_sac_sil_32workers/0
16 | CUDA_VISIBLE_DEVICES=0 python run_her.py --env FetchPushWallObstacle-v4 --random_ratio 1.0 --reward_type dense --num_timesteps 1e6 --num_workers 32 --priority --log_path logs/FetchPushWallObstacle-v4_random1.0/her_sac_ds_32workers/0
17 |
18 | # PPO-based
19 | # Hard 30%.
20 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --num_timesteps 5e7 --log_path logs/FetchPushWallObstacle-v4_random0.7/ppo/0
21 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --num_timesteps 5e7 --n_subgoal 2 --parallel --aug_clip 0.0 --reuse_times 8 --log_path logs/FetchPushWallObstacle-v4_random0.7/ppo_sir/0
22 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --num_timesteps 5e7 --n_subgoal 2 --parallel --self_imitate --aug_clip 0.0 --reuse_times 1 --log_path logs/FetchPushWallObstacle-v4_random0.7/ppo_sil/0
23 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env FetchPushWallObstacle-v4 --random_ratio 0.7 --num_timesteps 5e7 --reward_type dense --log_path logs/FetchPushWallObstacle-v4_random0.7/ppo_ds/0
24 |
25 | # Uniform.
26 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env FetchPushWallObstacle-v4 --random_ratio 1.0 --num_timesteps 5e7 --log_path logs/FetchPushWallObstacle-v4_random1.0/ppo/0
27 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env FetchPushWallObstacle-v4 --random_ratio 1.0 --num_timesteps 5e7 --n_subgoal 2 --parallel --aug_clip 0.0 --reuse_times 8 --log_path logs/FetchPushWallObstacle-v4_random1.0/ppo_sir/0
28 | CUDA_VISIBLE_DEVICES=0 python run_ppo_augment.py --env FetchPushWallObstacle-v4 --random_ratio 1.0 --num_timesteps 5e7 --n_subgoal 2 --parallel --self_imitate --aug_clip 0.0 --reuse_times 1 --log_path logs/FetchPushWallObstacle-v4_random1.0/ppo_sil/0
29 | CUDA_VISIBLE_DEVICES=0 python run_ppo.py --env FetchPushWallObstacle-v4 --random_ratio 1.0 --num_timesteps 5e7 --reward_type dense --log_path logs/FetchPushWallObstacle-v4_random1.0/ppo_ds/0
30 |
--------------------------------------------------------------------------------
/plot/plot_experiment_visual.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas
3 | import numpy as np
4 | import sys, os
5 | from scipy import interpolate
6 | import matplotlib.pyplot as plt
7 |
8 |
9 | def get_item(log_file, label):
10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True)
11 | return data[label].values
12 |
13 |
14 | def smooth(array, window):
15 | out = np.zeros(array.shape[0] - window)
16 | for i in range(out.shape[0]):
17 | out[i] = np.mean(array[i:i + window])
18 | return out
19 |
20 |
21 | if __name__ == '__main__':
22 | folder_name = sys.argv[1]
23 | env_name = sys.argv[2]
24 | assert env_name in ['uwall']
25 | # assert mode in ['train', 'hard', 'iteration']
26 | max_timesteps = {'uwall': 2.1e6}
27 | df_timesteps, df_sr, df_legend= [], [], []
28 | subfolders = ['ppo', 'sir', 'sil']
29 | # subfolders = ['ppo_attention_new', 'ppo_attention_sir_new', 'ppo_attention_sil_new']
30 |
31 | for subfolder in subfolders:
32 | last_sr = []
33 | for i in range(3):
34 | if not os.path.exists(os.path.join(folder_name, subfolder, str(i), 'progress.csv')):
35 | continue
36 | progress_file = os.path.join(folder_name, subfolder, str(i), 'progress.csv')
37 | raw_success_rate = get_item(progress_file, 'ep_success_rate')
38 | raw_total_timesteps = get_item(progress_file, 'total_timesteps')
39 | print(raw_total_timesteps.shape)
40 | sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, fill_value="extrapolate")
41 | timesteps = np.arange(0, max_timesteps[env_name], max_timesteps[env_name] // 70)
42 | print(timesteps[0], timesteps[-1], raw_total_timesteps[0], raw_total_timesteps[-1])
43 | success_rate = sr_f(timesteps)
44 | timesteps = smooth(timesteps, 5)
45 | success_rate = smooth(success_rate, 5)
46 | df_timesteps.append(timesteps)
47 | df_sr.append(success_rate)
48 | last_sr.append(success_rate[-1])
49 | df_legend.append(np.array([subfolder.upper()] * len(timesteps)))
50 |
51 | print(subfolder, 'sr', np.mean(last_sr))
52 | df_timesteps = np.concatenate(df_timesteps, axis=0).tolist()
53 | df_sr = np.concatenate(df_sr, axis=0).tolist()
54 | df_legend = np.concatenate(df_legend, axis=0).tolist()
55 | data = {'samples': df_timesteps, 'success_rate': df_sr, 'algo': df_legend}
56 | sr_timesteps = pandas.DataFrame(data)
57 |
58 | wspace = .3
59 | bottom = .3
60 | margin = .1
61 | # left = .08
62 | left = .1
63 | width = 1.25 / ((1. - left) / (2 + wspace + margin / 2))
64 | height = 1.5 / ((1. - bottom) / (1 + margin / 2))
65 |
66 | plt.style.use("ggplot")
67 | # plt.rcParams.update({'legend.fontsize': 14})
68 | p = sns.color_palette()
69 | sns.set_palette([p[i] for i in range(len(subfolders))])
70 | f, axes = plt.subplots(1, 1, figsize=(width, height))
71 | sns.lineplot(x='samples', y='success_rate', hue='algo', ax=axes, data=sr_timesteps)
72 | axes.set_xlabel('samples')
73 | axes.set_ylabel('avg. succ. rate')
74 | axes.get_legend().remove()
75 |
76 | handles, labels = axes.get_legend_handles_labels()
77 | f.legend(handles[:], ['PPO', 'SIR', 'SIL'], loc="lower right", ncol=1, bbox_to_anchor=(0.99, 0.18), title='')
78 | f.subplots_adjust(top=1. - margin / height, bottom=0.21, wspace=wspace, left=left, right=1. - margin / width)
79 | plt.savefig(os.path.join(folder_name, '../', os.path.basename(folder_name) + '.pdf'))
80 | print(os.path.join(folder_name, '../', os.path.basename(folder_name) + '.pdf'))
81 | plt.show()
82 |
--------------------------------------------------------------------------------
/plot/plot_reuse.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas
3 | import numpy as np
4 | import sys, os
5 | from scipy import interpolate
6 | import matplotlib.pyplot as plt
7 |
8 |
9 | def get_item(log_file, label):
10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True)
11 | return data[label].values
12 |
13 |
14 | def smooth(array, window):
15 | out = np.zeros(array.shape[0] - window)
16 | for i in range(out.shape[0]):
17 | out[i] = np.mean(array[i:i + window])
18 | return out
19 |
20 | if __name__ == '__main__':
21 | folder_name = sys.argv[1]
22 | # mode = sys.argv[2]
23 | # assert mode in ['success_rate', 'augment', 'eval']
24 | plt.style.use("ggplot")
25 | # plt.rcParams.update({'font.size': 20, 'legend.fontsize': 20,
26 | # 'axes.formatter.limits': [-5, 3]})
27 | wspace = .3
28 | bottom = .3
29 | margin = .1
30 | left = .08
31 | width = 3.5 / ((1. - left) / (2 + wspace + margin / 2))
32 | height = 1.5 / ((1. - bottom) / (1 + margin / 2))
33 |
34 | fig, axes = plt.subplots(1, 3, figsize=(width, height))
35 | for subfolder in ['sir_re1', 'sir_re4', 'sir_re8', 'sir_re16']:
36 | progress_file = os.path.join(folder_name, subfolder, '0', 'progress.csv')
37 | eval_file = os.path.join(folder_name, subfolder, '0', 'eval.csv')
38 | success_rate = get_item(progress_file, 'ep_reward_mean')
39 | total_timesteps = get_item(progress_file, 'total_timesteps')
40 | original_steps_per_iter = get_item(progress_file, 'original_timesteps')[0]
41 | augment_steps = get_item(progress_file, 'augment_steps')
42 | augment_ratio = augment_steps / (augment_steps + original_steps_per_iter)
43 | eval_reward = get_item(eval_file, 'mean_eval_reward')
44 | L = np.sum(total_timesteps < 3e7)
45 | total_timesteps = smooth(total_timesteps[:L], 20)
46 | success_rate = smooth(success_rate[:L], 20)
47 | augment_ratio = smooth(augment_ratio[:L], 20)
48 | augment_number = smooth(augment_steps[:L], 20)
49 | eval_reward = smooth(eval_reward[:L], 20)
50 | # if mode == 'success_rate':
51 | # ax.plot(total_timesteps, success_rate, label=subfolder.upper())
52 | # ax.set_ylabel('success rate')
53 | # elif mode == 'augment':
54 | # ax.plot(total_timesteps, augment_number, label=subfolder.upper())
55 | # ax.set_ylabel('number of augmented data')
56 | # elif mode == 'eval':
57 | # ax.plot(total_timesteps, eval_reward, label=subfolder.upper())
58 | # ax.set_ylabel('success rate')
59 | # ax.set_xlabel('samples')
60 | axes[0].plot(total_timesteps, success_rate, label=subfolder.upper())
61 | axes[0].set_xlabel('samples')
62 | axes[0].set_ylabel('success rate')
63 | axes[1].set_xlabel('samples')
64 | axes[1].plot(total_timesteps, eval_reward, label=subfolder.upper())
65 | axes[1].set_ylabel('success rate')
66 | axes[2].plot(total_timesteps, augment_ratio, label=subfolder.upper())
67 | axes[2].set_xlabel('samples')
68 | axes[2].set_ylabel('ratio of aug. data')
69 | # axes[0].get_legend().remove()
70 | # axes[1].get_legend().remove()
71 | # axes[2].get_legend().remove()
72 |
73 | # if mode == 'augment':
74 | # plt.legend(loc="lower right", bbox_to_anchor=(1.0, 0.0), ncol=1)
75 | fig.legend(labels=['RE1', 'RE4', 'RE8', 'RE16'], loc="lower center", bbox_to_anchor=(0.5, -0.03), ncol=4)
76 | fig.subplots_adjust(top=1. - margin / height, bottom=0.31, wspace=wspace, left=left, right=1. - margin / width)
77 | plt.savefig('reuse_ablation' + '.pdf')
78 |
79 |
80 |
--------------------------------------------------------------------------------
/plot/plot_compare.py:
--------------------------------------------------------------------------------
1 | import sys, os
2 | import numpy as np
3 | import pandas
4 | import matplotlib.pyplot as plt
5 |
6 |
7 | if __name__ == '__main__':
8 | option = sys.argv[1]
9 | log_paths = sys.argv[2:]
10 | assert option in ['success_rate', 'eval', 'entropy', 'aug_ratio', 'self_aug_ratio']
11 | window = 20
12 | def get_item(log_file, label):
13 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True)
14 | return data[label].values
15 | def smooth(array, window):
16 | out = np.zeros(array.shape[0] - window)
17 | for i in range(out.shape[0]):
18 | out[i] = np.mean(array[i:i + window])
19 | return out
20 | fig, ax = plt.subplots(1, 2, figsize=(10, 5))
21 | for log_path in log_paths:
22 | progress_file = os.path.join(log_path, 'progress.csv')
23 | eval_file = os.path.join(log_path, 'eval.csv')
24 | if 'ds' in log_path:
25 | success_rate = get_item(progress_file, 'ep_success_rate')
26 | else:
27 | success_rate = get_item(progress_file, 'ep_reward_mean')
28 | total_timesteps = get_item(progress_file, 'total_timesteps')
29 | entropy = get_item(progress_file, 'policy_entropy')
30 | try:
31 | eval_reward = get_item(eval_file, 'mean_eval_reward')
32 | n_updates = get_item(eval_file, 'n_updates')
33 | except:
34 | pass
35 | # success_rate = smooth(success_rate, window)
36 | # total_timesteps = smooth(total_timesteps, window)
37 | if option == 'success_rate':
38 | ax[0].plot(smooth(total_timesteps, window), smooth(success_rate, window), label=log_path)
39 | elif option == 'eval':
40 | # ax[0].plot(n_updates*65536, eval_reward, label=log_path)
41 |
42 | ax[0].plot(smooth(total_timesteps[n_updates-1], window), smooth(eval_reward, window), label=log_path)
43 | elif option == 'entropy':
44 | ax[0].plot(smooth(total_timesteps, window), smooth(entropy, window), label=log_path)
45 | elif option == 'aug_ratio':
46 | original_success = get_item(progress_file, 'original_success')
47 | total_success = get_item(progress_file, 'total_success')
48 | aug_ratio = (total_success - original_success) / (total_success + 1e-8)
49 | print(total_timesteps.shape, aug_ratio.shape)
50 | ax[0].plot(smooth(total_timesteps, 2), smooth(aug_ratio, 2), label=log_path)
51 | elif option == 'self_aug_ratio':
52 | self_aug_ratio = get_item(progress_file, 'self_aug_ratio')
53 | ax[0].plot(smooth(total_timesteps, window), smooth(self_aug_ratio, window), label=log_path)
54 | try:
55 | original_steps = get_item(progress_file, 'original_timesteps')[0]
56 | augment_steps = get_item(progress_file, 'augment_steps') / original_steps
57 | # augment_steps = smooth(augment_steps, window)
58 | except:
59 | augment_steps = np.zeros(total_timesteps.shape)
60 | ax[1].plot(smooth(total_timesteps, window), smooth(augment_steps, window), label=log_path)
61 | if option == 'success_rate':
62 | ax[0].set_title('ep reward mean')
63 | elif option == 'eval':
64 | ax[0].set_title('eval success rate')
65 | elif option == 'entropy':
66 | ax[0].set_title('entropy')
67 | elif option == 'aug_ratio':
68 | ax[0].set_title('aug success episode / total success episode')
69 | elif option == 'self_aug_ratio':
70 | ax[0].set_title('self_aug_ratio')
71 | ax[1].set_title('augment steps / original rollout steps')
72 | ax[0].grid()
73 | ax[1].grid()
74 | plt.legend()
75 | plt.show()
76 |
77 |
--------------------------------------------------------------------------------
/assets/fetch/push_wall_heavy_double_obstacle.xml:
--------------------------------------------------------------------------------
1 | <!-- MuJoCo model XML; the markup was not preserved in this text export -->
--------------------------------------------------------------------------------
/plot/plot_compare_sac.py:
--------------------------------------------------------------------------------
1 | import sys, os
2 | import numpy as np
3 | import pandas
4 | import matplotlib.pyplot as plt
5 | from scipy import interpolate
6 |
7 |
8 | if __name__ == '__main__':
9 | option = sys.argv[1]
10 | log_paths = sys.argv[2:]
11 | assert option in ['success_rate', 'eval', 'entropy', 'aug_ratio', 'self_aug_ratio']
12 | window = 20 if option == 'eval' else 100
13 | def get_item(log_file, label):
14 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True)
15 | return data[label].values
16 | def smooth(array, window):
17 | out = np.zeros(array.shape[0] - window)
18 | for i in range(out.shape[0]):
19 | out[i] = np.mean(array[i:i + window])
20 | return out
21 | fig, ax = plt.subplots(1, 1, figsize=(5, 5))
22 | for log_path in log_paths:
23 | progress_file = os.path.join(log_path, 'progress.csv')
24 | eval_file = os.path.join(log_path, 'eval.csv')
25 | # success_rate = get_item(progress_file, 'ep_rewmean')
26 | success_rate = get_item(progress_file, 'success rate')
27 | total_timesteps = get_item(progress_file, 'total timesteps')
28 | entropy = get_item(progress_file, 'entropy')
29 | try:
30 | eval_reward = get_item(eval_file, 'mean_eval_reward')
31 | n_updates = get_item(eval_file, 'n_updates')
32 | except (IOError, KeyError):  # eval.csv may be absent for some runs
33 | eval_reward, n_updates = None, None
34 | # success_rate = smooth(success_rate, window)
35 | # total_timesteps = smooth(total_timesteps, window)
36 | if option == 'success_rate':
37 | ax.plot(smooth(total_timesteps, window), smooth(success_rate, window), label=log_path)
38 | elif option == 'eval':
39 | # ax[0].plot(n_updates*65536, eval_reward, label=log_path)
40 | try:
41 | original_steps = get_item(progress_file, 'original_timesteps')
42 | step_expand_fn = interpolate.interp1d(original_steps, total_timesteps, fill_value="extrapolate")
43 | n_updates = step_expand_fn(n_updates)
44 | except (IOError, KeyError):  # no original_timesteps column for non-augmented runs
45 | pass
46 | ax.plot(smooth(n_updates, window), smooth(eval_reward, window), label=log_path)
47 | # ax[0].plot(smooth(total_timesteps[n_updates-1], window), smooth(eval_reward, window), label=log_path)
48 | elif option == 'entropy':
49 | ax.plot(smooth(total_timesteps, window), smooth(entropy, window), label=log_path)
50 | elif option == 'aug_ratio':
51 | original_success = get_item(progress_file, 'original_success')
52 | total_success = get_item(progress_file, 'total_success')
53 | aug_ratio = (total_success - original_success) / (total_success + 1e-8)
54 | print(total_timesteps.shape, aug_ratio.shape)
55 | ax.plot(smooth(total_timesteps, 2), smooth(aug_ratio, 2), label=log_path)
56 | elif option == 'self_aug_ratio':
57 | self_aug_ratio = get_item(progress_file, 'self_aug_ratio')
58 | ax.plot(smooth(total_timesteps, window), smooth(self_aug_ratio, window), label=log_path)
59 | '''
60 | try:
61 | original_steps = get_item(progress_file, 'original_timesteps')[0]
62 | augment_steps = get_item(progress_file, 'augment_steps') / original_steps
63 | # augment_steps = smooth(augment_steps, window)
64 | except:
65 | augment_steps = np.zeros(total_timesteps.shape)
66 | ax[1].plot(smooth(total_timesteps, window), smooth(augment_steps, window), label=log_path)
67 | '''
68 | if option == 'success_rate':
69 | ax.set_title('ep reward mean')
70 | elif option == 'eval':
71 | ax.set_title('eval success rate')
72 | elif option == 'entropy':
73 | ax.set_title('entropy')
74 | elif option == 'aug_ratio':
75 | ax.set_title('aug success episode / total success episode')
76 | elif option == 'self_aug_ratio':
77 | ax.set_title('self_aug_ratio')
78 | # ax[1].set_title('augment steps / original rollout steps')
79 | ax.grid()
80 | # ax[1].grid()
81 | plt.legend()
82 | plt.show()
83 |
84 |
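The 'eval' branch above uses interp1d to stretch evaluation checkpoints, which are logged against original environment steps, onto the total-timesteps axis that also counts augmented rollouts. A sketch with made-up numbers:

    import numpy as np
    from scipy import interpolate

    original_steps = np.array([0.0, 1e4, 2e4, 3e4])    # hypothetical logged columns
    total_timesteps = np.array([0.0, 3e4, 6e4, 9e4])   # grows faster: augmented steps included
    step_expand_fn = interpolate.interp1d(original_steps, total_timesteps, fill_value="extrapolate")

    n_updates = np.array([5e3, 1.5e4, 4e4])   # eval checkpoints, in original steps
    print(step_expand_fn(n_updates))          # [ 15000.  45000. 120000.]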
--------------------------------------------------------------------------------
/assets/fetch/pick_and_place_box.xml:
--------------------------------------------------------------------------------
1 | <!-- MuJoCo model XML; the markup was not preserved in this text export -->
--------------------------------------------------------------------------------
/utils/log_utils.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 |
4 | import numpy as np
5 | from stable_baselines import logger
6 |
7 |
8 | def eval_model(eval_env, model):
9 | env = eval_env
10 | if hasattr(env.unwrapped, 'random_ratio'):
11 | assert abs(env.unwrapped.random_ratio) < 1e-4
12 | n_episode = 0
13 | ep_rewards = []
14 | ep_successes = []
15 | while n_episode < 20:
16 | ep_reward = 0.0
17 | ep_success = 0.0
18 | obs = env.reset()
19 | goal_dim = env.goal.shape[0]
20 | if goal_dim > 3:
21 | while (np.argmax(obs[-goal_dim + 3:]) != 0):
22 | obs = env.reset()
23 | done = False
24 | while not done:
25 | action, _ = model.predict(obs)
26 | obs, reward, done, info = env.step(action)
27 | ep_reward += reward
28 | ep_success += info['is_success']
29 | ep_rewards.append(ep_reward)
30 | ep_successes.append(ep_success)
31 | n_episode += 1
32 | return np.mean(ep_successes)
33 |
34 |
35 | def log_eval(num_update, mean_eval_reward, file_name='eval.csv'):
36 | if not os.path.exists(os.path.join(logger.get_dir(), file_name)):
37 | with open(os.path.join(logger.get_dir(), file_name), 'a', newline='') as csvfile:
38 | csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
39 | title = ['n_updates', 'mean_eval_reward']
40 | csvwriter.writerow(title)
41 | with open(os.path.join(logger.get_dir(), file_name), 'a', newline='') as csvfile:
42 | csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
43 | data = [num_update, mean_eval_reward]
44 | csvwriter.writerow(data)
45 |
46 |
47 | def stack_eval_model(eval_env, model, init_on_table=False):
48 | env = eval_env
49 | env.unwrapped.random_ratio = 0.0
50 | if init_on_table:
51 | env.unwrapped.task_array = [(env.n_object, i) for i in range(min(2, env.n_object))]
52 | else:
53 | env.unwrapped.task_array = [(env.n_object, i) for i in range(env.n_object)]
54 | assert abs(env.unwrapped.random_ratio) < 1e-4
55 | n_episode = 0
56 | ep_rewards = []
57 | ep_successes = []
58 | while n_episode < 20:
59 | ep_reward = 0.0
60 | ep_success = 0.0
61 | obs = env.reset()
62 | while env.current_nobject != env.n_object or (hasattr(env, 'task_mode') and env.task_mode != 1):
63 | obs = env.reset()
64 | done = False
65 | while not done:
66 | action, _ = model.predict(obs)
67 | obs, reward, done, info = env.step(action)
68 | ep_reward += reward
69 | ep_success += info['is_success']
70 | ep_rewards.append(ep_reward)
71 | ep_successes.append(ep_success)
72 | n_episode += 1
73 | return np.mean(ep_successes)
74 |
75 |
76 | def egonav_eval_model(eval_env, model, random_ratio=0.0, goal_idx=3, fixed_goal=None):
77 | env = eval_env
78 | if hasattr(env.unwrapped, 'random_ratio'):
79 | env.unwrapped.random_ratio = random_ratio
80 | n_episode = 0
81 | ep_rewards = []
82 | ep_successes = []
83 | while n_episode < 20:
84 | ep_reward = 0.0
85 | ep_success = 0.0
86 | obs = env.reset()
87 | goal_dim = env.goal.shape[0]
88 | if fixed_goal is not None:
89 | env.unwrapped.goal = fixed_goal.copy()
90 | obs = env.get_obs()
91 | obs = np.concatenate([obs[key] for key in ['observation', 'achieved_goal', 'desired_goal']])
92 | else:
93 | if goal_dim > 3:
94 | while np.argmax(obs[-goal_dim + 3:]) != goal_idx:
95 | obs = env.reset()
96 | done = False
97 | while not done:
98 | action, _ = model.predict(obs)
99 | obs, reward, done, info = env.step(action)
100 | ep_reward += reward
101 | ep_success += info['is_success']
102 | ep_rewards.append(ep_reward)
103 | ep_successes.append(ep_success)
104 | n_episode += 1
105 | return np.mean(ep_successes)
106 |
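A minimal usage sketch of log_eval; the run scripts call it the same way from their learn() callbacks. The log directory below is hypothetical:

    from stable_baselines import logger
    from utils.log_utils import log_eval

    logger.configure('/tmp/example_logdir')    # hypothetical log directory
    for num_update in range(3):
        mean_eval_reward = 0.1 * num_update    # placeholder value; normally eval_model(eval_env, model)
        log_eval(num_update, mean_eval_reward)
    # /tmp/example_logdir/eval.csv now holds the header 'n_updates,mean_eval_reward' plus one row per call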
--------------------------------------------------------------------------------
/assets/masspoint/single_obstacle2.xml:
--------------------------------------------------------------------------------
1 | <!-- MuJoCo model XML; the markup was not preserved in this text export -->
--------------------------------------------------------------------------------
/assets/fetch/open_close_box.xml:
--------------------------------------------------------------------------------
1 | <!-- MuJoCo model XML; the markup was not preserved in this text export -->
--------------------------------------------------------------------------------
/plot/plot_walltime.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas
3 | import numpy as np
4 | import sys, os
5 | from scipy import interpolate
6 | import matplotlib.pyplot as plt
7 |
8 |
9 | def get_item(log_file, label):
10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True)
11 | return data[label].values
12 |
13 |
14 | def smooth(array, window):
15 | out = np.zeros(array.shape[0] - window)
16 | for i in range(out.shape[0]):
17 | out[i] = np.nanmean(array[i:i + window])
18 | return out
19 |
20 |
21 | if __name__ == '__main__':
22 | folder_name = sys.argv[1]
23 | env_name = sys.argv[2]
24 | assert env_name in ['particle']
25 | max_timesteps = {'umaze': 1e5, 'maze_ego': 2.5e7, 'maze_box': 4.9e7}
26 | df_walltime, df_sr, df_eval, df_legend = [], [], [], []
27 | # df_timesteps, df_sr, df_eval, df_legend, df_iteration, df_success_rate_iteration, df_legend_iteration = [], [], [], [], [], [], []
28 | subfolders = ['sac', 'ppo']
29 | if env_name == "particle":
30 | for subfolder in subfolders:
31 | last_sr = []
32 | for i in range(3):
33 | if not os.path.exists(os.path.join(folder_name, subfolder, str(i), 'progress.csv')):
34 | continue
35 | progress_file = os.path.join(folder_name, subfolder, str(i), 'progress.csv')
36 | eval_file = os.path.join(folder_name, subfolder, str(i), 'eval.csv')
37 | raw_walltime = get_item(progress_file, 'time_elapsed')
38 | raw_success_rate = get_item(progress_file, 'ep_success_rate') if subfolder == "ppo" else get_item(progress_file, 'success rate')
39 | if subfolder == "ppo":
40 | raw_walltime = raw_walltime[:560]
41 | raw_success_rate = raw_success_rate[:560]
42 | sr_f = interpolate.interp1d(raw_walltime, raw_success_rate, bounds_error=False)
43 | wall_time = np.arange(0, 3.3e5, 3.3e5 // 250)
44 | success_rate = sr_f(wall_time)
45 | print(wall_time[0], wall_time[-1], raw_walltime[0], raw_walltime[-1])
46 |
47 | if subfolder == "ppo":
48 | print(len(wall_time))
49 | print(success_rate[-10:])
50 | wall_time = smooth(wall_time, 10)
51 | success_rate = smooth(success_rate, 10)
52 | # eval_reward = smooth(eval_reward, 20)
53 | df_walltime.append(wall_time)
54 | df_sr.append(success_rate)
55 | last_sr.append(success_rate[-1])
56 | # df_eval.append(eval_reward)
57 | df_legend.append(np.array([subfolder.upper()] * len(wall_time)))
58 |
59 | print(subfolder, np.mean(last_sr))
60 |
61 | # df_timesteps = np.concatenate(df_timesteps, axis=0).tolist()
62 | df_walltime = np.concatenate(df_walltime, axis=0).tolist()
63 | df_sr = np.concatenate(df_sr, axis=0).tolist()
64 | # df_eval = np.concatenate(df_eval, axis=0).tolist()
65 | df_legend = np.concatenate(df_legend, axis=0).tolist()
66 | data = {'wall time': df_walltime, 'success_rate': df_sr, 'algo': df_legend}
67 | sr_walltime = pandas.DataFrame(data)
68 |
69 | wspace = .3
70 | bottom = .3
71 | margin = .1
72 | left = .15
73 | width = 1.5 / ((1. - left) / (2 + wspace + margin / 2))
74 | height = 1.5 / ((1. - bottom) / (1 + margin / 2))
75 |
76 | plt.style.use("ggplot")
77 | # plt.rcParams.update({'legend.fontsize': 14})
78 | p = sns.color_palette()
79 | sns.set_palette([p[i] for i in range(len(subfolders))])
80 | f, axes = plt.subplots(1, 1, figsize=(width, height))
81 | sns.lineplot(x='wall time', y='success_rate', hue='algo', ax=axes, data=sr_walltime)
82 | axes.set_xlabel('wall time')
83 | axes.set_ylabel('success_rate')
84 | axes.xaxis.get_major_formatter().set_powerlimits((0, 1))
85 | axes.get_legend().remove()
86 |
87 | handles, labels = axes.get_legend_handles_labels()
88 |
89 | f.legend(handles[:], ['SAC', 'PPO'], loc="lower right", ncol=1, bbox_to_anchor=(0.99, 0.18), title='')
90 | f.subplots_adjust(top=1. - margin / height, bottom=0.21, wspace=wspace, left=left, right=1. - margin / width)
91 | plt.savefig(os.path.join(folder_name, '../', os.path.basename(folder_name) + env_name + 'walltime.pdf'))
92 | plt.show()
93 |
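The script assumes three seeds per algorithm under sac/ and ppo/ inside the experiment folder passed as the first argument. A quick, illustrative layout check (the folder name is made up):

    import os

    folder_name = 'logs/particle_walltime'   # hypothetical experiment folder
    for subfolder in ['sac', 'ppo']:
        for i in range(3):
            path = os.path.join(folder_name, subfolder, str(i), 'progress.csv')
            print(path, 'found' if os.path.exists(path) else 'missing')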
--------------------------------------------------------------------------------
/plot/plot_success_traj.py:
--------------------------------------------------------------------------------
1 | import sys, pandas, os, imageio
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from mpl_toolkits.mplot3d import Axes3D
5 |
6 |
7 | def get_item(log_file, label):
8 | data = pandas.read_csv(log_file, index_col=None, comment='#')
9 | return data[label].values
10 |
11 | if __name__ == '__main__':
12 | if len(sys.argv) < 2:
13 | print('Usage: python plot_success_traj.py [csv_name]')
14 | exit()
15 | fname = sys.argv[1]
16 | dones = get_item(fname, 'done')
17 | gripper_xs = get_item(fname, 'gripper_x')
18 | gripper_ys = get_item(fname, 'gripper_y')
19 | gripper_zs = get_item(fname, 'gripper_z')
20 | box_xs = get_item(fname, 'box_x')
21 | box_ys = get_item(fname, 'box_y')
22 | box_zs = get_item(fname, 'box_z')
23 | obstacle_xs = get_item(fname, 'obstacle_x')
24 | obstacle_ys = get_item(fname, 'obstacle_y')
25 | obstacle_zs = get_item(fname, 'obstacle_z')
26 | obstacle1_xs = get_item(fname, 'obstacle1_x')
27 | obstacle1_ys = get_item(fname, 'obstacle1_y')
28 | obstacle1_zs = get_item(fname, 'obstacle1_z')
29 | goals = []
30 | for i in range(6):
31 | goals.append(get_item(fname, 'goal_' + str(i)))
32 | goals = np.asarray(goals)
33 | goals = np.swapaxes(goals, 0, 1)
34 | end_points = np.where(dones > 0.5)[0]
35 | print('#episodes', len(end_points))
36 | for i in end_points:
37 | assert np.argmax(goals[i][3:]) == 0
38 | # print(goals[i])
39 | '''
40 | _print_end_points = np.random.choice(end_points[:len(end_points) // 100], size=20)
41 | _print_end_points2 = np.random.choice(end_points[len(end_points) // 100 * 99:], size=20)
42 | _print_end_points3 = np.random.choice(end_points[len(end_points) // 100 * 50: len(end_points) // 10 * 51], size=20)
43 | print('first percentile')
44 | for i in _print_end_points:
45 | print(i, goals[i])
46 | print('50 percentile')
47 | for i in _print_end_points2:
48 | print(i, goals[i])
49 | print('last percentile')
50 | for i in _print_end_points3:
51 | print(i, goals[i])
52 | '''
53 | # print(goals[:, 3:])
54 |
55 | ep_idx = 0
56 | step = 0
57 | has_switch = False
58 | fig = plt.figure()
59 | ax = fig.add_subplot(111)
60 | for i in range(end_points[5]):
61 | ax.cla()
62 | # ax.set_xlim(1.0, 1.6)
63 | # ax.set_ylim(0.4, 1.1)
64 | ax.set_xlim(0.0, 5.0)
65 | ax.set_ylim(0.0, 5.0)
66 | # ax.set_zlim(0, 1.2)
67 | ax.set_xlabel('x')
68 | ax.set_ylabel('y')
69 | ax.scatter(gripper_xs[i], gripper_ys[i], c='tab:gray')
70 | ax.scatter(box_xs[i], box_ys[i], c='tab:blue')
71 | ax.scatter(obstacle_xs[i], obstacle_ys[i], c='tab:brown')
72 | ax.scatter(obstacle1_xs[i], obstacle1_ys[i], c='#ff00ff')
73 | ax.plot([1.5, 1.5, 1.8, 1.8], [0.0, 2.0, 2.0, 0.0], 'tab:gray')
74 | ax.plot([1.5, 1.5, 1.8, 1.8], [5.0, 3.0, 3.0, 5.0], 'tab:gray')
75 | ax.plot([3.5, 3.5, 3.2, 3.2], [0.0, 2.0, 2.0, 0.0], 'tab:gray')
76 | ax.plot([3.5, 3.5, 3.2, 3.2], [5.0, 3.0, 3.0, 5.0], 'tab:gray')
77 | if not has_switch and np.argmax(goals[i][3:]) == 1:
78 | print('episode %d switch step %d' % (ep_idx, step))
79 | print('restart box', box_xs[i], box_ys[i], 'subgoal', goals[i])
80 | has_switch = True
81 | if np.argmax(goals[i][3:]) == 0:
82 | marker = '*'
83 | else:
84 | marker = '^'
85 | ax.scatter(goals[i][0], goals[i][1], c='tab:red', marker=marker)
86 | ax.set_title('episode %d step %d' % (ep_idx, step))
87 | step += 1
88 | if dones[i] > 0.5:
89 | assert np.argmax(goals[i][3:]) == 0
90 | print('ultimate goal', goals[i])
91 | ep_idx += 1
92 | step = 0
93 | has_switch = False
94 | plt.savefig('tempimg' + str(i) + '.png')
95 | plt.pause(0.1)
96 | os.system('ffmpeg -r 2 -start_number 0 -i tempimg%d.png -c:v libx264 -pix_fmt yuv420p ' +
97 | os.path.join(os.path.dirname(fname), 'augment_data.mp4'))
98 | # images = []
99 | for i in range(end_points[5]):
100 | # images.append(plt.imread('tempimg' + str(i) + '.png'))
101 | os.remove('tempimg' + str(i) + '.png')
102 | # imageio.mimsave('augment_data.gif', images)
103 |
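The trace CSV is expected to hold per-step columns for the gripper, the box, both obstacles, a six-dimensional goal and a done flag. An illustrative sanity check (the file name is hypothetical):

    import pandas

    expected = (['done'] +
                ['%s_%s' % (body, axis)
                 for body in ['gripper', 'box', 'obstacle', 'obstacle1']
                 for axis in ['x', 'y', 'z']] +
                ['goal_%d' % i for i in range(6)])
    df = pandas.read_csv('success_traj.csv', comment='#')   # hypothetical trace file
    print('missing columns:', [c for c in expected if c not in df.columns])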
--------------------------------------------------------------------------------
/assets/fetch/shared.xml:
--------------------------------------------------------------------------------
1 | <!-- MuJoCo model XML; the markup was not preserved in this text export -->
--------------------------------------------------------------------------------
/plot/plot_experiment_success_len.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas
3 | import numpy as np
4 | import sys, os
5 | from scipy import interpolate
6 | import matplotlib.pyplot as plt
7 | from stable_baselines.bench.monitor import load_results
8 |
9 | def get_item(log_file, label):
10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True)
11 | return data[label].values
12 |
13 |
14 | def smooth(array, window):
15 | out = np.zeros(array.shape[0] - window)
16 | for i in range(out.shape[0]):
17 | out[i] = np.mean(array[i:i + window])
18 | return out
19 |
20 |
21 | if __name__ == '__main__':
22 | folder_name = sys.argv[1]
23 | env_name = sys.argv[2]
24 | assert env_name in ['push', 'particle', 'maze', 'stacking']
25 | alg = sys.argv[3]
26 | assert alg in ['ppo', 'sac']
27 | # assert mode in ['train', 'hard', 'iteration']
28 | if alg == 'ppo':
29 | max_timesteps = {'push': 4.99e7,
30 | 'particle': 2.5e8,
31 | 'maze': 1.5e6,
32 | 'stacking': 2e8,}
33 | elif alg == 'sac':
34 | max_timesteps = {'push': 1.36e7,}
35 | # max_iterationss = {'push': 750,
36 | # 'particle': 510,
37 | # 'maze': 245,}
38 | # df_timesteps, df_sr, df_eval, df_legend, df_iteration, df_eval_iteration, df_legend_iteration = [], [], [], [], [], [], []
39 | df_iteration, df_len_mean, df_legend_iteration = [], [], []
40 | subfolders = [alg, 'sir', 'sil']
41 | if 'particle_random0.7' in folder_name:
42 | subfolders = ['ppo', 'sir', 'sil']
43 | elif 'particle_random1.0' in folder_name:
44 | subfolders = ['ppo', 'sir', 'sil']
45 | elif 'maze' in folder_name:
46 | subfolders = ['ppo', 'sir_re2']
47 | for subfolder in subfolders:
48 | last_success_len = []
49 | for i in range(3):
50 | if not os.path.exists(os.path.join(folder_name, subfolder, str(i), '0.monitor.csv')):
51 | continue
52 | monitor_df = load_results(os.path.join(folder_name, subfolder, str(i)))
53 | raw_len = monitor_df.l
54 | raw_success = monitor_df.is_success
55 | cum_len = raw_len.cumsum()
56 | masked_len = smooth(raw_len[raw_success > 0.5].values, 100)
57 | masked_cum_len = smooth(cum_len[raw_success > 0.5].values, 100)
58 | success_len_f = interpolate.interp1d(masked_cum_len, masked_len, fill_value="extrapolate")
59 | print(masked_cum_len[-1], max_timesteps[env_name])
60 | timesteps = np.arange(0, max_timesteps[env_name], max_timesteps[env_name] // 500)
61 | success_len = success_len_f(timesteps)
62 | # iterations = timesteps / timesteps[-1] * max_iterationss[env_name]
63 | # iterations = smooth(iterations, 20)
64 | timesteps = smooth(timesteps, 20)
65 | success_len = smooth(success_len, 20)
66 |
67 | last_success_len.append(success_len[-1])
68 |
69 | df_iteration.append(timesteps)
70 | df_len_mean.append(success_len)
71 | df_legend_iteration.append(np.array([subfolder.upper()] * len(timesteps)))
72 | assert len(timesteps) == len(success_len)
73 | print(subfolder, np.mean(last_success_len))
74 | df_iteration = np.concatenate(df_iteration, axis=0).tolist()
75 | df_len_mean = np.concatenate(df_len_mean, axis=0).tolist()
76 | df_legend_iteration = np.concatenate(df_legend_iteration, axis=0).tolist()
77 | data = {'timesteps': df_iteration, 'len_mean': df_len_mean, 'algo': df_legend_iteration}
78 | len_mean_iteration = pandas.DataFrame(data)
79 |
80 | wspace = .3
81 | bottom = .3
82 | margin = .1
83 | left = .18
84 | width = 1.2 / ((1. - left) / (2. + wspace + margin / 2))
85 | height = 1.5 / ((1. - bottom) / (1 + margin / 2))
86 |
87 | plt.style.use("ggplot")
88 | # plt.rcParams.update({'legend.fontsize': 14})
89 | p = sns.color_palette()
90 | sns.set_palette([p[i] for i in range(len(subfolders))])
91 | f, axes = plt.subplots(1, 1, figsize=(width, height))
92 | sns.lineplot(x='timesteps', y='len_mean', hue='algo', ax=axes, data=len_mean_iteration)
93 | axes.set_xlabel('timesteps')
94 | axes.set_ylabel('episode length')
95 | axes.get_legend().remove()
96 | handles, labels = axes.get_legend_handles_labels()
97 | f.legend(handles[1:], [alg.upper(), 'SIR', 'SIL'], loc="upper right", ncol=1, title='')
98 | f.subplots_adjust(top=1. - margin / height, bottom=0.2, wspace=wspace, left=left, right=1. - margin / width)
99 | plt.savefig(os.path.join(folder_name, '../', os.path.basename(folder_name) + '_successlen.pdf'))
100 | # plt.show()
101 |
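The key step above is to index the length of each successful episode by the cumulative number of environment steps consumed when it finished, then resample onto a uniform timestep grid. A toy illustration with made-up numbers:

    import numpy as np
    from scipy import interpolate

    ep_len = np.array([50., 40., 100., 30., 20.])   # hypothetical per-episode lengths
    is_success = np.array([1, 1, 0, 1, 1])
    cum_len = np.cumsum(ep_len)                      # env steps consumed so far
    masked_len = ep_len[is_success > 0.5]            # lengths of successful episodes only
    masked_cum_len = cum_len[is_success > 0.5]       # step count when each of them ended
    f = interpolate.interp1d(masked_cum_len, masked_len, fill_value="extrapolate")
    print(f(np.linspace(50., 240., 5)))              # success length on a uniform step grid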
--------------------------------------------------------------------------------
/assets/masspoint/double_obstacle.xml:
--------------------------------------------------------------------------------
1 | <!-- MuJoCo model XML; the markup was not preserved in this text export -->
--------------------------------------------------------------------------------
/baselines/her/utils.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 | import numpy as np
4 | from gym import spaces
5 | from stable_baselines.common.vec_env import VecEnv
6 |
7 | # Important: gym mixes up ordered and unordered keys
8 | # and the Dict space may return a different order of keys than the actual one
9 | KEY_ORDER = ['observation', 'achieved_goal', 'desired_goal']
10 |
11 |
12 | class HERGoalEnvWrapper(object):
13 | """
14 | A wrapper that allows using a dict observation space (coming from GoalEnv) with
15 | the RL algorithms.
16 | It assumes that all the spaces of the dict space are of the same type.
17 |
18 | :param env: (gym.GoalEnv)
19 | """
20 |
21 | def __init__(self, env):
22 | super(HERGoalEnvWrapper, self).__init__()
23 | self.env = env
24 | self.metadata = self.env.metadata
25 | self.action_space = env.action_space
26 | self.spaces = list(env.observation_space.spaces.values())
27 | # Check that all spaces are of the same type
28 | # (current limitation of the wrapper)
29 | space_types = [type(env.observation_space.spaces[key]) for key in KEY_ORDER]
30 | assert len(set(space_types)) == 1, "The spaces for goal and observation"\
31 | " must be of the same type"
32 |
33 | if isinstance(self.spaces[0], spaces.Discrete):
34 | self.obs_dim = 1
35 | self.goal_dim = 1
36 | else:
37 | goal_space_shape = env.observation_space.spaces['achieved_goal'].shape
38 | self.obs_dim = env.observation_space.spaces['observation'].shape[0]
39 | self.goal_dim = goal_space_shape[0]
40 |
41 | if len(goal_space_shape) == 2:
42 | assert goal_space_shape[1] == 1, "Only 1D observation spaces are supported for now"
43 | else:
44 | assert len(goal_space_shape) == 1, "Only 1D observation spaces are supported for now"
45 |
46 |
47 | if isinstance(self.spaces[0], spaces.MultiBinary):
48 | total_dim = self.obs_dim + 2 * self.goal_dim
49 | self.observation_space = spaces.MultiBinary(total_dim)
50 |
51 | elif isinstance(self.spaces[0], spaces.Box):
52 | lows = np.concatenate([space.low for space in self.spaces])
53 | highs = np.concatenate([space.high for space in self.spaces])
54 | self.observation_space = spaces.Box(lows, highs, dtype=np.float32)
55 |
56 | elif isinstance(self.spaces[0], spaces.Discrete):
57 | dimensions = [env.observation_space.spaces[key].n for key in KEY_ORDER]
58 | self.observation_space = spaces.MultiDiscrete(dimensions)
59 |
60 | else:
61 | raise NotImplementedError("{} space is not supported".format(type(self.spaces[0])))
62 |
63 | if isinstance(self.env, VecEnv):
64 | self.reward_type = self.env.get_attr('reward_type')[0]
65 | else:
66 | self.reward_type = self.env.reward_type
67 |
68 |
69 | def convert_dict_to_obs(self, obs_dict):
70 | """
71 | :param obs_dict: (dict)
72 | :return: (np.ndarray)
73 | """
74 | # Note: achieved goal is not removed from the observation
75 | # this is helpful to have a revertible transformation
76 | if isinstance(self.observation_space, spaces.MultiDiscrete):
77 | # Special case for multidiscrete
78 | return np.concatenate([[int(obs_dict[key])] for key in KEY_ORDER])
79 | return np.concatenate([obs_dict[key] for key in KEY_ORDER], axis=-1)
80 |
81 | def convert_obs_to_dict(self, observations):
82 | """
83 | Inverse operation of convert_dict_to_obs
84 |
85 | :param observations: (np.ndarray)
86 | :return: (OrderedDict)
87 | """
88 | return OrderedDict([
89 | ('observation', observations[...,:self.obs_dim]),
90 | ('achieved_goal', observations[...,self.obs_dim:self.obs_dim + self.goal_dim]),
91 | ('desired_goal', observations[...,self.obs_dim + self.goal_dim:]),
92 | ])
93 |
94 | def step(self, action):
95 | obs, reward, done, info = self.env.step(action)
96 | return self.convert_dict_to_obs(obs), reward, done, info
97 |
98 | def seed(self, seed=None):
99 | return self.env.seed(seed)
100 |
101 | def reset(self):
102 | return self.convert_dict_to_obs(self.env.reset())
103 |
104 | def compute_reward(self, achieved_goal, desired_goal, info, indices=None):
105 | if isinstance(self.env, VecEnv):
106 | return self.env.env_method('compute_reward', achieved_goal, desired_goal, info, indices=indices)
107 | return self.env.compute_reward(achieved_goal, desired_goal, info)
108 |
109 | def compute_reward_and_success(self, achieved_goal, desired_goal, info, indices=None):
110 | if isinstance(self.env, VecEnv):
111 | return self.env.env_method('compute_reward_and_success', achieved_goal, desired_goal, info, indices=indices)
112 | return self.env.compute_reward_and_success(achieved_goal, desired_goal, info)
113 |
114 | def render(self, mode='human'):
115 | return self.env.render(mode)
116 |
117 | def close(self):
118 | return self.env.close()
119 |
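The wrapper flattens the dict observation by concatenating the three entries in KEY_ORDER and slices them back apart on demand. A self-contained sketch of that round trip, assuming Box sub-spaces and made-up sizes:

    import numpy as np
    from collections import OrderedDict

    obs_dim, goal_dim = 10, 3
    obs_dict = OrderedDict([
        ('observation', np.zeros(obs_dim)),
        ('achieved_goal', np.ones(goal_dim)),
        ('desired_goal', 2 * np.ones(goal_dim)),
    ])
    flat = np.concatenate([obs_dict[k] for k in ['observation', 'achieved_goal', 'desired_goal']], axis=-1)
    recovered = OrderedDict([
        ('observation', flat[..., :obs_dim]),
        ('achieved_goal', flat[..., obs_dim:obs_dim + goal_dim]),
        ('desired_goal', flat[..., obs_dim + goal_dim:]),
    ])
    assert all(np.array_equal(obs_dict[k], recovered[k]) for k in obs_dict)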
--------------------------------------------------------------------------------
/assets/masspoint/emaze_easy.xml:
--------------------------------------------------------------------------------
1 | <!-- MuJoCo model XML; the markup was not preserved in this text export -->
--------------------------------------------------------------------------------
/assets/masspoint/generate_xml.py:
--------------------------------------------------------------------------------
1 | template = '''
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 | {scene}
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 | {obstacles}
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
47 |
48 |
49 | '''
50 |
51 | scene_template = '''
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 | {walls}
62 | '''
63 |
64 |
65 | def generate_xml(num_obstacles):
66 | wall_template = ''
67 | walls = []
68 | for i in range(num_obstacles):
69 | walls.append(wall_template.format(**dict(id=2 * i, pos=str(1.7 * (i + 1)) + ' 1.0 0.25')))
70 | walls.append(wall_template.format(**dict(id=2 * i + 1, pos=str(1.7 * (i + 1)) + ' 4.0 0.25')))
71 | scene = scene_template.format(**dict(bound0_pos=str(1.7 * (num_obstacles + 1) / 2) + ' -0.2 0.25',
72 | bound1_pos=str(1.7 * (num_obstacles + 1) / 2) + ' 5.2 0.25',
73 | bound2_pos='-0.2 2.5 0.25',
74 | bound3_pos=str(1.7 * (num_obstacles + 1) + 0.2) + ' 2.5 0.25',
75 | bound_v_size=str(1.7 * (num_obstacles + 1) / 2) + ' 0.2 0.25',
76 | bound_h_size='0.2 2.9 0.25',
77 | walls="\n".join(walls)))
78 | obstacle_template = '''
79 |
80 |
81 |
82 |
83 |
84 |
85 | '''
86 | obstacles = [obstacle_template.format(**dict(id=i + 1)) for i in range(num_obstacles)]
87 | xml = template.format(**dict(scene=scene, obstacles="\n".join(obstacles)))
88 | return xml
89 |
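A usage sketch for generate_xml; in the repository the template, scene_template and wall_template strings contain MuJoCo XML fragments, but that markup was not preserved in this text export. The output path below is hypothetical:

    from assets.masspoint.generate_xml import generate_xml

    xml = generate_xml(num_obstacles=3)
    with open('masspoint_3obstacle.xml', 'w') as f:   # hypothetical output path
        f.write(xml)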
--------------------------------------------------------------------------------
/run_ppo_augment.py:
--------------------------------------------------------------------------------
1 | from baselines import PPO2_SIR
2 | from stable_baselines import logger
3 | from stable_baselines.common import set_global_seeds
4 | from stable_baselines.common.vec_env import SubprocVecEnv
5 |
6 | from utils.log_utils import eval_model, log_eval, stack_eval_model
7 | from utils.parallel_subproc_vec_env import ParallelSubprocVecEnv
8 | from stable_baselines.common.policies import register_policy
9 |
10 | from utils.make_env_utils import configure_logger, make_env, get_num_workers, get_env_kwargs, get_train_kwargs, \
11 | get_policy_kwargs
12 |
13 | import os, time, argparse
14 |
15 |
16 | def arg_parse():
17 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
18 | parser.add_argument('--env', default='FetchPushWallObstacle-v4')
19 | parser.add_argument('--policy', type=str, default='MlpPolicy')
20 | parser.add_argument('--seed', type=int, default=42)
21 | parser.add_argument('--num_timesteps', type=float, default=1e8)
22 | parser.add_argument('--log_path', default=None, type=str)
23 | parser.add_argument('--load_path', default=None, type=str)
24 | parser.add_argument('--random_ratio', default=1.0, type=float)
25 | parser.add_argument('--aug_clip', default=0.1, type=float)
26 | parser.add_argument('--aug_adv_weight', default=1.0, type=float)
27 | parser.add_argument('--n_subgoal', default=4, type=int)
28 | parser.add_argument('--parallel', action="store_true", default=False)
29 | parser.add_argument('--self_imitate', action="store_true", default=False)
30 | parser.add_argument('--sil_clip', default=0.2, type=float)
31 | parser.add_argument('--start_augment', type=float, default=0)
32 | parser.add_argument('--reuse_times', default=1, type=int)
33 | parser.add_argument('--reward_type', default="sparse", type=str)
34 | parser.add_argument('--n_object', default=2, type=int)
35 | parser.add_argument('--curriculum', action="store_true", default=False)
36 | parser.add_argument('--sequential', action="store_true", default=False)
37 | parser.add_argument('--play', action="store_true", default=False)
38 | parser.add_argument('--export_gif', action="store_true", default=False)
39 | parser.add_argument('--log_trace', action="store_true", default=False)
40 | args = parser.parse_args()
41 | return args
42 |
43 |
44 | def main(args):
45 | log_dir = args.log_path if (args.log_path is not None) else \
46 | "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
47 | configure_logger(log_dir)
48 |
49 | set_global_seeds(args.seed)
50 |
51 | n_cpu = get_num_workers(args.env) if not args.play else 1
52 |
53 | env_kwargs = get_env_kwargs(args.env, args.random_ratio, args.sequential, args.reward_type,
54 | args.n_object, args.curriculum)
55 |
56 | def make_thunk(rank):
57 | return lambda: make_env(env_id=args.env, rank=rank, log_dir=log_dir, flatten_dict=True, kwargs=env_kwargs)
58 |
59 | env = SubprocVecEnv([make_thunk(i) for i in range(n_cpu)])
60 |
61 | aug_env_name = args.env.split('-')[0] + 'Unlimit-' + args.env.split('-')[1]
62 | aug_env_kwargs = env_kwargs.copy()
63 | aug_env_kwargs['max_episode_steps'] = None
64 |
65 | def make_thunk_aug(rank):
66 | return lambda: make_env(env_id=aug_env_name, rank=rank, flatten_dict=True, kwargs=aug_env_kwargs)
67 |
68 | if not args.parallel:
69 | aug_env = make_env(env_id=aug_env_name, rank=0, flatten_dict=True, kwargs=aug_env_kwargs)
70 | else:
71 | aug_env = ParallelSubprocVecEnv([make_thunk_aug(i) for i in range(min(32, n_cpu))], reset_when_done=False)
72 | print(aug_env)
73 |
74 | if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')):
75 | os.remove(os.path.join(logger.get_dir(), 'eval.csv'))
76 | print('Remove existing eval.csv')
77 | eval_env_kwargs = env_kwargs.copy()
78 | eval_env_kwargs['random_ratio'] = 0.0
79 | if "use_cu" in eval_env_kwargs:
80 | eval_env_kwargs['use_cu'] = False
81 | eval_env = make_env(env_id=args.env, rank=0, flatten_dict=True, kwargs=eval_env_kwargs)
82 | print(eval_env)
83 |
84 | if not args.play:
85 | os.makedirs(log_dir, exist_ok=True)
86 |
87 | from utils.attention_policy import AttentionPolicy
88 | register_policy('AttentionPolicy', AttentionPolicy)
89 |
90 | policy_kwargs = get_policy_kwargs("ppo_sir", args)
91 |
92 | train_kwargs = get_train_kwargs("ppo_sir", args, parsed_action_noise=None, eval_env=eval_env, aug_env=aug_env)
93 |
94 | model = PPO2_SIR(args.policy, env, verbose=1, nminibatches=32, lam=0.95, gamma=0.99, noptepochs=10,
95 | ent_coef=0.01, learning_rate=3e-4, cliprange=0.2, policy_kwargs=policy_kwargs,
96 | horizon=env_kwargs['max_episode_steps'], **train_kwargs)
97 |
98 | def callback(_locals, _globals):
99 | num_update = _locals["update"]
100 | if 'FetchStack' in args.env:
101 | mean_eval_reward = stack_eval_model(eval_env, _locals["self"])
102 | else:
103 | mean_eval_reward = eval_model(eval_env, _locals["self"])
104 | log_eval(num_update, mean_eval_reward)
105 | if num_update % 10 == 0:
106 | model_path = os.path.join(log_dir, 'model_' + str(num_update // 10))
107 | model.save(model_path)
108 | print('model saved to', model_path)
109 | return True
110 |
111 | model.learn(total_timesteps=int(args.num_timesteps), callback=callback, seed=args.seed, log_interval=1)
112 | model.save(os.path.join(log_dir, 'final'))
113 |
114 |
115 | if __name__ == '__main__':
116 | args = arg_parse()
117 | print('arg parsed')
118 | main(args)
119 |
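An illustrative invocation, using flags defined in arg_parse above; the log path and step budget are made up:

    python run_ppo_augment.py --env FetchPushWallObstacle-v4 --num_timesteps 1e7 \
        --log_path logs/push_sir/0 --random_ratio 0.7 --n_subgoal 4 --reward_type sparse --parallel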
--------------------------------------------------------------------------------
/utils/replay_buffer.py:
--------------------------------------------------------------------------------
1 | from stable_baselines.deepq import ReplayBuffer, PrioritizedReplayBuffer
2 | import numpy as np
3 | import random
4 |
5 |
6 | class DoublePrioritizedReplayWrapper(object):
7 | def __init__(self, buffer1, buffer2):
8 | assert isinstance(buffer1, PrioritizedReplayBuffer)
9 | assert isinstance(buffer2, PrioritizedReplayBuffer)
10 | self.buffer1 = buffer1
11 | self.buffer2 = buffer2
12 | self.min_tree_operation = buffer1._it_min._operation
13 | self.sum_tree_operation = buffer1._it_sum._operation
14 |
15 | def _sample_proportional(self, batch_size):
16 | res1, res2 = [], []
17 | _sum1 = self.buffer1._it_sum.sum()
18 | _sum2 = self.buffer2._it_sum.sum()
19 | for i in range(batch_size):
20 | mass = random.random() * (_sum1 + _sum2)
21 | if mass < _sum1:
22 | idx = self.buffer1._it_sum.find_prefixsum_idx(mass)
23 | res1.append(idx)
24 | else:
25 | idx = self.buffer2._it_sum.find_prefixsum_idx(mass - _sum1)
26 | res2.append(idx)
27 | return res1, res2
28 |
29 | def sample(self, batch_size, beta=0):
30 | assert beta > 0
31 |
32 | idxes1, idxes2 = self._sample_proportional(batch_size)
33 |
34 | weights1, weights2 = [], []
35 | p_min = self.min_tree_operation(self.buffer1._it_min.min(), self.buffer2._it_min.min()) / self.sum_tree_operation(self.buffer1._it_sum.sum(), self.buffer2._it_sum.sum())
36 | max_weight = (p_min * (len(self.buffer1._storage) + len(self.buffer2._storage))) ** (-beta)
37 |
38 | for idx in idxes1:
39 | p_sample = self.buffer1._it_sum[idx] / (self.buffer1._it_sum.sum() + self.buffer2._it_sum.sum())
40 | weight = (p_sample * (len(self.buffer1._storage) + len(self.buffer2._storage))) ** (-beta)
41 | weights1.append(weight / max_weight)
42 | for idx in idxes2:
43 | p_sample = self.buffer2._it_sum[idx] / (self.buffer1._it_sum.sum() + self.buffer2._it_sum.sum())
44 | weight = (p_sample * (len(self.buffer1._storage) + len(self.buffer2._storage))) ** (-beta)
45 | weights2.append(weight / max_weight)
46 |
47 | weights1 = np.array(weights1)
48 | weights2 = np.array(weights2)
49 | encoded_sample1 = self.buffer1._encode_sample(idxes1)
50 | encoded_sample2 = self.buffer2._encode_sample(idxes2)
51 | return tuple(list(encoded_sample1) + [weights1, idxes1]), tuple(list(encoded_sample2) + [weights2, idxes2])
52 |
53 |
54 | class MultiWorkerReplayBuffer(ReplayBuffer):
55 | def __init__(self, size, num_workers=1, gamma=0.99):
56 | super(MultiWorkerReplayBuffer, self).__init__(size)
57 | self.num_workers = num_workers
58 | self.gamma = gamma
59 | self.local_transitions = [[] for _ in range(self.num_workers)]
60 |
61 | def add(self, obs_t, action, reward, obs_tp1, done):
62 | assert obs_t.shape[0] == self.num_workers
63 | for i in range(self.num_workers):
64 | self.local_transitions[i].append([obs_t[i], action[i], reward[i], obs_tp1[i], done[i]])
65 | if done[i]:
66 | for j in range(len(self.local_transitions[i])):
67 | super().add(*(self.local_transitions[i][j]))
68 | self.local_transitions[i] = []
69 |
70 |
71 | class PrioritizedMultiWorkerReplayBuffer(PrioritizedReplayBuffer):
72 | def __init__(self, size, alpha, num_workers=1, gamma=0.99):
73 | super(PrioritizedMultiWorkerReplayBuffer, self).__init__(size, alpha)
74 | self.num_workers = num_workers
75 | self.gamma = gamma
76 | self.local_transitions = [[] for _ in range(self.num_workers)]
77 | self.model = None
78 |
79 | def set_model(self, model):
80 | self.model = model
81 |
82 | def add(self, obs_t, action, reward, obs_tp1, done):
83 | assert obs_t.shape[0] == self.num_workers
84 | for i in range(self.num_workers):
85 | self.local_transitions[i].append([obs_t[i], action[i], reward[i], obs_tp1[i], done[i]])
86 | # assert len(self.local_priorities[i]) == len(self.local_transitions[i])
87 | if done[i]:
88 | batch_obs, batch_act, batch_reward, batch_next_obs, batch_done = zip(*(self.local_transitions[i]))
89 | batch_obs, batch_act, batch_reward, batch_next_obs, batch_done = \
90 | map(lambda v: np.asarray(v),[batch_obs, batch_act, batch_reward, batch_next_obs, batch_done])
91 | priorities = compute_priority(self.model, batch_obs, batch_act,
92 | batch_next_obs, batch_reward, batch_done)
93 | for j in range(len(self.local_transitions[i])):
94 | p_idx = self._next_idx # The add call will change self._next_idx
95 | super().add(*(self.local_transitions[i][j]))
96 | self.update_priorities([p_idx], [priorities[j]])
97 | self.local_transitions[i] = []
98 |
99 |
100 | def discounted_sum(arr, gamma):
101 | arr = np.asarray(arr)
102 | return np.sum(arr * np.power(gamma, np.arange(arr.shape[0])))
103 |
104 |
105 | def compute_priority(sac_model, batch_obs, batch_act, batch_next_obs, batch_reward, batch_done):
106 | q1, value = sac_model.sess.run([sac_model.step_ops[4], sac_model.value_target], feed_dict={
107 | sac_model.observations_ph: batch_obs,
108 | sac_model.actions_ph: batch_act,
109 | sac_model.next_observations_ph: batch_next_obs,
110 | })
111 | priorities = np.reshape(batch_reward, q1.shape) + (
112 | 1 - np.reshape(batch_done, q1.shape)) * sac_model.gamma * value - q1
113 | priorities = np.squeeze(np.abs(priorities) + 1e-4, axis=-1).tolist()
114 | return priorities
115 |
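MultiWorkerReplayBuffer keeps each worker's transitions in a local list and only commits an episode to the underlying buffer once that worker's done flag fires. A minimal sketch with made-up shapes:

    import numpy as np
    from utils.replay_buffer import MultiWorkerReplayBuffer

    buf = MultiWorkerReplayBuffer(size=1000, num_workers=2)
    for t in range(3):
        obs = np.random.randn(2, 4)            # one row per worker
        action = np.random.randn(2, 1)
        reward = np.zeros(2)
        next_obs = np.random.randn(2, 4)
        done = np.array([t == 2, False])       # worker 0 terminates at t == 2, worker 1 never does
        buf.add(obs, action, reward, next_obs, done)
    print(len(buf))   # 3: only worker 0's completed episode has been flushed to storage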
--------------------------------------------------------------------------------
/plot/plot_cl_experiment.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas
3 | import numpy as np
4 | import sys, os
5 | from scipy import interpolate
6 | import matplotlib.pyplot as plt
7 |
8 |
9 | def get_item(log_file, label):
10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True)
11 | return data[label].values
12 |
13 |
14 | def smooth(array, window):
15 | out = np.zeros(array.shape[0] - window)
16 | for i in range(out.shape[0]):
17 | out[i] = np.mean(array[i:i + window])
18 | return out
19 |
20 |
21 | if __name__ == '__main__':
22 | folder_name = sys.argv[1]
23 | env_name = sys.argv[2]
24 | assert env_name in ['maze']
25 | # assert mode in ['train', 'hard', 'iteration']
26 | max_timesteps = {'maze': 4e6,
27 | }
28 | max_iterationss = {'maze': 245,}
29 | df_timesteps, df_sr, df_eval, df_legend, df_iteration, df_success_rate_iteration, df_legend_iteration = [], [], [], [], [], [], []
30 | # subfolders = ['ppo_sir', 'ppo_cu', 'goal_gan_b1000', ] # 'goal_gan_b10000'
31 | subfolders = ['goal_gan_b1000', 'goal_gan_b10000']
32 | for subfolder in subfolders:
33 | last_sr = []  # reset per subfolder so the summary print below reflects only this algorithm
34 | if subfolder == "ppo_cu":
35 | for i in range(3):
36 | eval_file = os.path.join(folder_name, subfolder, str(i), 'eval.csv')
37 | raw_success_rate = get_item(eval_file, 'mean_eval_reward')
38 | raw_total_timesteps = get_item(eval_file, 'n_updates') * 1000
39 | sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, bounds_error=False)
40 | timesteps = np.arange(0, max_timesteps[env_name], max_timesteps[env_name] // 245)
41 | print(timesteps[0], timesteps[-1], raw_total_timesteps[0], raw_total_timesteps[-1])
42 | success_rate = sr_f(timesteps)
43 | # eval_reward = eval_f(timesteps)
44 | timesteps = smooth(timesteps, 10)
45 | success_rate = smooth(success_rate, 10)
46 | # eval_reward = smooth(eval_reward, 20)
47 | df_timesteps.append(timesteps)
48 | df_sr.append(success_rate)
49 | last_sr.append(success_rate[-1])
50 | # df_eval.append(eval_reward)
51 | df_legend.append(np.array([subfolder.upper()] * len(timesteps)))
52 | else:
53 | for i in range(3):
54 | if not os.path.exists(os.path.join(folder_name, subfolder, str(i), 'progress.csv')):
55 | continue
56 | progress_file = os.path.join(folder_name, subfolder, str(i), 'progress.csv')
57 | raw_success_rate = get_item(progress_file, 'ep_reward_mean' if subfolder == "ppo_sir" or subfolder == "ppo" else 'Outer_MeanRewards')
58 | raw_total_timesteps = get_item(progress_file, 'total_timesteps' if subfolder == "ppo_sir" or subfolder == "ppo" else 'Outer_timesteps')
59 | sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, bounds_error=False)
60 | timesteps = np.arange(0, max_timesteps[env_name], max_timesteps[env_name] // 245)
61 | print(timesteps[0], timesteps[-1], raw_total_timesteps[0], raw_total_timesteps[-1])
62 | success_rate = sr_f(timesteps)
63 | # eval_reward = eval_f(timesteps)
64 | timesteps = smooth(timesteps, 5)
65 | success_rate = smooth(success_rate, 5)
66 | # eval_reward = smooth(eval_reward, 20)
67 | df_timesteps.append(timesteps)
68 | df_sr.append(success_rate)
69 | last_sr.append(success_rate[-1])
70 | # df_eval.append(eval_reward)
71 | df_legend.append(np.array([subfolder.upper()] * len(timesteps)))
72 |
73 | print(subfolder, np.mean(last_sr))
74 | df_timesteps = np.concatenate(df_timesteps, axis=0).tolist()
75 | df_sr = np.concatenate(df_sr, axis=0).tolist()
76 | # df_eval = np.concatenate(df_eval, axis=0).tolist()
77 | df_legend = np.concatenate(df_legend, axis=0).tolist()
78 |
79 | data = {'samples': df_timesteps, 'success_rate': df_sr, 'algo': df_legend}
80 | sr_timesteps = pandas.DataFrame(data)
81 | # data = {'samples': df_timesteps, 'eval': df_eval, 'algo': df_legend}
82 | # eval_timesteps = pandas.DataFrame(data)
83 |
84 | wspace = .3
85 | bottom = .3
86 | margin = .1
87 | left = .15
88 | width = 1.7 / ((1. - left) / (2 + wspace + margin / 2))
89 | height = 1.5 / ((1. - bottom) / (1 + margin / 2))
90 |
91 | plt.style.use("ggplot")
92 | # plt.rcParams.update({'legend.fontsize': 14})
93 | p = sns.color_palette()
94 | sns.set_palette([p[0], p[1], p[2], p[3]])
95 | f, axes = plt.subplots(1, 1, figsize=(width, height))
96 | sns.lineplot(x='samples', y='success_rate', hue='algo', ax=axes, data=sr_timesteps)
97 | axes.set_xlabel('samples')
98 | axes.set_ylabel('success_rate')
99 | axes.get_legend().remove()
100 | # sns.lineplot(x='samples', y='eval', hue='algo', ax=axes[1], data=eval_timesteps)
101 | # axes[1].set_xlabel('samples')
102 | # axes[1].set_ylabel('')
103 | # axes[1].get_legend().remove()
104 | handles, labels = axes.get_legend_handles_labels()
105 | # if mode == 'train':
106 | # sns.lineplot(x='samples', y='success_rate', hue='algo', data=sr_timesteps)
107 | # axes.set_xlabel('samples')
108 | # elif mode == 'hard':
109 | # sns.lineplot(x='samples', y='eval', hue='algo', data=eval_timesteps)
110 | # axes.set_xlabel('samples')
111 | # elif mode == 'iteration':
112 | # sns.lineplot(x='iterations', y='eval', hue='algo', ax=axes, data=eval_iteration)
113 | # axes.set_xlabel('iterations')
114 | # axes.set_ylabel('success rate')
115 | # axes.get_legend().remove()
116 | # handles, labels = axes.get_legend_handles_labels()
117 | # f.legend(handles[:], ['SIR', 'Manual CL', 'GoalGAN', 'GoalGAN_10k'], loc="lower right", ncol=1, bbox_to_anchor=(0.99, 0.18), title='')
118 | f.legend(handles[:], ['GoalGAN_1k', 'GoalGAN_10k'], loc="lower right", ncol=1,
119 | bbox_to_anchor=(0.99, 0.18), title='')
120 | f.subplots_adjust(top=1. - margin / height, bottom=0.21, wspace=wspace, left=left, right=1. - margin / width)
121 | plt.savefig(os.path.join(folder_name, '../', os.path.basename(folder_name) + '.pdf'))
122 | # plt.show()
123 |
--------------------------------------------------------------------------------
/plot/plot_experiment_len.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas
3 | import numpy as np
4 | import sys, os
5 | from scipy import interpolate
6 | import matplotlib.pyplot as plt
7 |
8 |
9 | def get_item(log_file, label):
10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True)
11 | return data[label].values
12 |
13 |
14 | def smooth(array, window):
15 | out = np.zeros(array.shape[0] - window)
16 | for i in range(out.shape[0]):
17 | out[i] = np.mean(array[i:i + window])
18 | return out
19 |
20 |
21 | if __name__ == '__main__':
22 | folder_name = sys.argv[1]
23 | env_name = sys.argv[2]
24 | assert env_name in ['push', 'particle', 'maze', 'stacking']
25 | # assert mode in ['train', 'hard', 'iteration']
26 | max_timesteps = {'push': 4.99e7,
27 | 'particle': 2.8e8,
28 | 'maze': 1.5e6,
29 | 'stacking': 2e8,}
30 | max_iterationss = {'push': 750,
31 | 'particle': 510,
32 | 'maze': 245,}
33 | # df_timesteps, df_sr, df_eval, df_legend, df_iteration, df_eval_iteration, df_legend_iteration = [], [], [], [], [], [], []
34 | df_iteration, df_len_mean, df_legend_iteration = [], [], []
35 | subfolders = ['ppo', 'sir_re8']
36 | if 'particle_random0.7' in folder_name:
37 | subfolders = ['ppo', 'sir_re1-8']
38 | elif 'particle_random1.0' in folder_name:
39 | subfolders = ['ppo', 'sir_re1-8']
40 | elif 'maze' in folder_name:
41 | subfolders = ['ppo', 'sir_re2']
42 | for subfolder in subfolders:
43 | for i in range(3):
44 | if not os.path.exists(os.path.join(folder_name, subfolder, str(i), 'progress.csv')):
45 | continue
46 | progress_file = os.path.join(folder_name, subfolder, str(i), 'progress.csv')
47 | # eval_file = os.path.join(folder_name, subfolder, str(i), 'eval.csv')
48 | # raw_success_rate = get_item(progress_file, 'ep_reward_mean')
49 | # raw_total_timesteps = get_item(progress_file, 'total_timesteps')
50 | raw_len_mean = get_item(progress_file, 'ep_len_mean')
51 | raw_iterations = get_item(progress_file, 'n_updates')
52 | # raw_eval_reward = get_item(eval_file, 'mean_eval_reward')
53 | # sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, fill_value="extrapolate")
54 | # eval_f = interpolate.interp1d(raw_total_timesteps, raw_eval_reward, fill_value="extrapolate")
55 | # timesteps = np.arange(0, max_timesteps[env_name], max_timesteps[env_name] // 500)
56 | # print(timesteps[0], timesteps[-1], raw_total_timesteps[0], raw_total_timesteps[-1])
57 | # success_rate = sr_f(timesteps)
58 | # eval_reward = eval_f(timesteps)
59 | L = max_iterationss[env_name]
60 | iterations = smooth(raw_iterations[:L], 20)
61 | len_mean = smooth(raw_len_mean[:L], 20)
62 | # timesteps = smooth(timesteps, 20)
63 | # success_rate = smooth(success_rate, 20)
64 | # eval_reward = smooth(eval_reward, 20)
65 | df_iteration.append(iterations)
66 | df_len_mean.append(len_mean)
67 | # df_timesteps.append(timesteps)
68 | # df_sr.append(success_rate)
69 | # df_eval.append(eval_reward)
70 | # df_legend.append(np.array([subfolder.upper()] * len(timesteps)))
71 |
72 |
73 | # eval_iteration = smooth(raw_eval_reward[:L], 20)
74 | # df_iteration.append(iterations)
75 | # df_eval_iteration.append(eval_iteration)
76 | df_legend_iteration.append(np.array([subfolder.upper()] * len(iterations)))
77 | # df_timesteps = np.concatenate(df_timesteps, axis=0).tolist()
78 | # df_sr = np.concatenate(df_sr, axis=0).tolist()
79 | # df_eval = np.concatenate(df_eval, axis=0).tolist()
80 | # df_legend = np.concatenate(df_legend, axis=0).tolist()
81 | df_iteration = np.concatenate(df_iteration, axis=0).tolist()
82 | df_len_mean = np.concatenate(df_len_mean, axis=0).tolist()
83 | # df_eval_iteration = np.concatenate(df_eval_iteration, axis=0).tolist()
84 | df_legend_iteration = np.concatenate(df_legend_iteration, axis=0).tolist()
85 | # data = {'samples': df_timesteps, 'success_rate': df_sr, 'algo': df_legend}
86 | # sr_timesteps = pandas.DataFrame(data)
87 | # data = {'samples': df_timesteps, 'eval': df_eval, 'algo': df_legend}
88 | # eval_timesteps = pandas.DataFrame(data)
89 | data = {'iterations': df_iteration, 'len_mean': df_len_mean, 'algo': df_legend_iteration}
90 | len_mean_iteration = pandas.DataFrame(data)
91 |
92 | wspace = .3
93 | bottom = .3
94 | margin = .1
95 | left = .18
96 | width = 1.2 / ((1. - left) / (2. + wspace + margin / 2))
97 | height = 1.5 / ((1. - bottom) / (1 + margin / 2))
98 |
99 | plt.style.use("ggplot")
100 | # plt.rcParams.update({'legend.fontsize': 14})
101 | p = sns.color_palette()
102 | sns.set_palette([p[0], p[1]])
103 | f, axes = plt.subplots(1, 1, figsize=(width, height))
104 | # sns.lineplot(x='samples', y='success_rate', hue='algo', ax=axes[0], data=sr_timesteps)
105 | # axes[0].set_xlabel('samples')
106 | # axes[0].set_ylabel('success_rate')
107 | # axes[0].get_legend().remove()
108 | # sns.lineplot(x='samples', y='eval', hue='algo', ax=axes[1], data=eval_timesteps)
109 | # axes[1].set_xlabel('samples')
110 | # axes[1].set_ylabel('')
111 | # axes[1].get_legend().remove()
112 | sns.lineplot(x='iterations', y='len_mean', hue='algo', ax=axes, data=len_mean_iteration)
113 | axes.set_xlabel('iterations')
114 | axes.set_ylabel('episode length')
115 | axes.get_legend().remove()
116 | handles, labels = axes.get_legend_handles_labels()
117 | # if mode == 'train':
118 | # sns.lineplot(x='samples', y='success_rate', hue='algo', data=sr_timesteps)
119 | # axes.set_xlabel('samples')
120 | # elif mode == 'hard':
121 | # sns.lineplot(x='samples', y='eval', hue='algo', data=eval_timesteps)
122 | # axes.set_xlabel('samples')
123 | # elif mode == 'iteration':
124 | # sns.lineplot(x='iterations', y='eval', hue='algo', ax=axes, data=eval_iteration)
125 | # axes.set_xlabel('iterations')
126 | # axes.set_ylabel('success rate')
127 | # axes.get_legend().remove()
128 | # handles, labels = axes.get_legend_handles_labels()
129 | f.legend(handles[1:], ['PPO', 'SIR'], loc="upper right", ncol=1, title='')
130 | f.subplots_adjust(top=1. - margin / height, bottom=0.2, wspace=wspace, left=left, right=1. - margin / width)
131 | plt.savefig(os.path.join(folder_name, '../', os.path.basename(folder_name) + '_len.pdf'))
132 | # plt.show()
133 |
--------------------------------------------------------------------------------
/plot/plot_experiment.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas
3 | import numpy as np
4 | import sys, os
5 | from scipy import interpolate
6 | import matplotlib.pyplot as plt
7 |
8 |
9 | def get_item(log_file, label):
10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True)
11 | return data[label].values
12 |
13 |
14 | def smooth(array, window):
15 | out = np.zeros(array.shape[0] - window)
16 | for i in range(out.shape[0]):
17 | out[i] = np.mean(array[i:i + window])
18 | return out
19 |
20 |
21 | if __name__ == '__main__':
22 | folder_name = sys.argv[1]
23 | env_name = sys.argv[2]
24 | assert env_name in ['push', 'particle', 'maze', 'stacking']
25 | # assert mode in ['train', 'hard', 'iteration']
26 | max_timesteps = {'push': 3e7,
27 | 'particle': 3.0e8,
28 | 'maze': 1.5e6,
29 | 'stacking': 2e8,}
30 | max_iterationss = {'push': 750,
31 | 'particle': 550,
32 | 'maze': 245,}
33 | df_timesteps, df_sr, df_eval, df_legend, df_iteration, df_eval_iteration, df_legend_iteration = [], [], [], [], [], [], []
34 | subfolders = ['ppo', 'sir', 'sil', 'ds']
35 | for subfolder in subfolders:
36 | last_sr = []
37 | last_eval = []
38 | for i in range(3):
39 | if not os.path.exists(os.path.join(folder_name, subfolder, str(i), 'progress.csv')):
40 | continue
41 | progress_file = os.path.join(folder_name, subfolder, str(i), 'progress.csv')
42 | eval_file = os.path.join(folder_name, subfolder, str(i), 'eval.csv')
43 | if 'ds' in subfolder:
44 | raw_success_rate = get_item(progress_file, 'ep_success_rate')
45 | else:
46 | raw_success_rate = get_item(progress_file, 'ep_reward_mean')
47 | raw_total_timesteps = get_item(progress_file, 'total_timesteps')
48 | raw_eval_reward = get_item(eval_file, 'mean_eval_reward')
49 | print(raw_total_timesteps.shape, raw_eval_reward.shape)
50 | sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, fill_value="extrapolate")
51 | eval_f = interpolate.interp1d(raw_total_timesteps, raw_eval_reward, fill_value="extrapolate")
52 | timesteps = np.arange(0, max_timesteps[env_name], max_timesteps[env_name] // 500)
53 | print(timesteps[0], timesteps[-1], raw_total_timesteps[0], raw_total_timesteps[-1])
54 | success_rate = sr_f(timesteps)
55 | eval_reward = eval_f(timesteps)
56 | timesteps = smooth(timesteps, 20)
57 | success_rate = smooth(success_rate, 20)
58 | eval_reward = smooth(eval_reward, 20)
59 | df_timesteps.append(timesteps)
60 | df_sr.append(success_rate)
61 | df_eval.append(eval_reward)
62 | last_sr.append(success_rate[-1])
63 | last_eval.append(eval_reward[-1])
64 | df_legend.append(np.array([subfolder.upper()] * len(timesteps)))
65 |
66 | raw_iterations = get_item(progress_file, 'n_updates')
67 | L = max_iterationss[env_name]
68 | iterations = smooth(raw_iterations[:L], 20)
69 | eval_iteration = smooth(raw_eval_reward[:L], 20)
70 | df_iteration.append(iterations)
71 | df_eval_iteration.append(eval_iteration)
72 | df_legend_iteration.append(np.array([subfolder.upper()] * len(iterations)))
73 | print(subfolder, 'sr', np.mean(last_sr), 'eval', np.mean(last_eval))
74 | df_timesteps = np.concatenate(df_timesteps, axis=0).tolist()
75 | df_sr = np.concatenate(df_sr, axis=0).tolist()
76 | df_eval = np.concatenate(df_eval, axis=0).tolist()
77 | df_legend = np.concatenate(df_legend, axis=0).tolist()
78 | df_iteration = np.concatenate(df_iteration, axis=0).tolist()
79 | df_eval_iteration = np.concatenate(df_eval_iteration, axis=0).tolist()
80 | df_legend_iteration = np.concatenate(df_legend_iteration, axis=0).tolist()
81 | data = {'samples': df_timesteps, 'success_rate': df_sr, 'algo': df_legend}
82 | sr_timesteps = pandas.DataFrame(data)
83 | data = {'samples': df_timesteps, 'eval': df_eval, 'algo': df_legend}
84 | eval_timesteps = pandas.DataFrame(data)
85 | data = {'iterations': df_iteration, 'eval': df_eval_iteration, 'algo': df_legend_iteration}
86 | eval_iteration = pandas.DataFrame(data)
87 |
88 | wspace = .3
89 | bottom = .3
90 | margin = .1
91 | # left = .08
92 | left = .1
93 | width = 2.15 / ((1. - left) / (2 + wspace + margin / 2))
94 | height = 1.5 / ((1. - bottom) / (1 + margin / 2))
95 |
96 | plt.style.use("ggplot")
97 | # plt.rcParams.update({'legend.fontsize': 14})
98 | p = sns.color_palette()
99 | sns.set_palette([p[i] for i in range(len(subfolders))])
100 | f, axes = plt.subplots(1, 2, figsize=(width, height))
101 | sns.lineplot(x='samples', y='success_rate', hue='algo', ax=axes[0], data=sr_timesteps)
102 | axes[0].set_xlabel('samples')
103 | axes[0].set_ylabel('avg. succ. rate')
104 | axes[0].get_legend().remove()
105 | sns.lineplot(x='samples', y='eval', hue='algo', ax=axes[1], data=eval_timesteps)
106 | axes[1].set_xlabel('samples')
107 | axes[1].set_ylabel('hard succ. rate')
108 | axes[1].get_legend().remove()
109 | # sns.lineplot(x='iterations', y='eval', hue='algo', ax=axes[2], data=eval_iteration)
110 | # axes[2].set_xlabel('iterations')
111 | # axes[2].set_ylabel('')
112 | # axes[2].get_legend().remove()
113 | handles, labels = axes[1].get_legend_handles_labels()
114 | # if mode == 'train':
115 | # sns.lineplot(x='samples', y='success_rate', hue='algo', data=sr_timesteps)
116 | # axes.set_xlabel('samples')
117 | # elif mode == 'hard':
118 | # sns.lineplot(x='samples', y='eval', hue='algo', data=eval_timesteps)
119 | # axes.set_xlabel('samples')
120 | # elif mode == 'iteration':
121 | # sns.lineplot(x='iterations', y='eval', hue='algo', ax=axes, data=eval_iteration)
122 | # axes.set_xlabel('iterations')
123 | # axes.set_ylabel('success rate')
124 | # axes.get_legend().remove()
125 | # handles, labels = axes.get_legend_handles_labels()
126 | # f.legend(handles[:], ['PPO', 'SIR', 'SIL', 'DS'], loc="lower right", ncol=1, bbox_to_anchor=(0.49, 0.18), title='')
127 | f.legend(handles[:], ['PPO', 'SIR', 'SIL'], loc="lower right", ncol=1, bbox_to_anchor=(0.49, 0.18), title='')
128 | f.subplots_adjust(top=1. - margin / height, bottom=0.21, wspace=wspace, left=left, right=1. - margin / width)
129 | plt.savefig(os.path.join(folder_name, '../', os.path.basename(folder_name) + '.pdf'))
130 | print(os.path.join(folder_name, '../', os.path.basename(folder_name) + '.pdf'))
131 | plt.show()
132 |
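133 | # Usage sketch (assumed log layout, mirroring the reads above: <folder>/<algo>/<run_id>/progress.csv and eval.csv):
134 | #   python -m plot.<this_script> /path/to/logs/<experiment> push
135 | # where the second argument is one of 'push', 'particle', 'maze', 'stacking'.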
--------------------------------------------------------------------------------
/plot/plot_sac_experiment_maze.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas
3 | import numpy as np
4 | import sys, os
5 | from scipy import interpolate
6 | import matplotlib.pyplot as plt
7 |
8 |
9 | def get_item(log_file, label):
10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True)
11 | return data[label].values
12 |
13 |
14 | def smooth(array, window):
15 | out = np.zeros(array.shape[0] - window)
16 | for i in range(out.shape[0]):
17 | out[i] = np.mean(array[i:i + window])
18 | return out
19 |
20 |
21 | if __name__ == '__main__':
22 | folder_name = sys.argv[1]
23 | env_name = sys.argv[2]
24 | assert env_name in ['umaze', 'maze_ego', 'maze_box']
25 | max_timesteps = {'umaze': 1e5, 'maze_ego': 2.5e7, 'maze_box': 4.9e7}
26 | df_timesteps, df_sr, df_eval, df_legend, df_iteration, df_success_rate_iteration, df_legend_iteration = [], [], [], [], [], [], []
27 | subfolders = ['sir', 'hiro', 'dsc']
28 | if env_name == 'umaze':
29 | for subfolder in subfolders:
30 | last_sr = []
31 | for i in range(3):
32 | if not os.path.exists(os.path.join(folder_name, subfolder, str(i), 'progress.csv')):
33 | continue
34 | progress_file = os.path.join(folder_name, subfolder, str(i), 'progress.csv')
35 | if subfolder == 'hiro':
36 | raw_success_rate = get_item(progress_file, 'Value')
37 | raw_total_timesteps = get_item(progress_file, 'Step')
38 | elif subfolder == 'dsc':
39 | presmooth_success_rate = get_item(progress_file, 'Value')
40 | raw_success_rate = np.zeros_like(presmooth_success_rate)
41 | for j in range(presmooth_success_rate.shape[0]):
42 | raw_success_rate[j] = np.mean(presmooth_success_rate[max(j - 100 + 1, 0): j + 1])
43 | raw_total_timesteps = get_item(progress_file, 'Step')
44 | else:
45 | raw_success_rate = get_item(progress_file, 'ep_rewmean')
46 | raw_total_timesteps = get_item(progress_file, 'total timesteps')
47 | sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, fill_value="extrapolate")
48 | timesteps = np.arange(600, max_timesteps[env_name], max_timesteps[env_name] // 50)
49 | print(timesteps[0], timesteps[-1], raw_total_timesteps[0], raw_total_timesteps[-1])
50 | success_rate = sr_f(timesteps)
51 | # timesteps = smooth(timesteps, 20)
52 | # success_rate = smooth(success_rate, 20)
53 | # eval_reward = smooth(eval_reward, 20)
54 | df_timesteps.append(timesteps)
55 | df_sr.append(success_rate)
56 | last_sr.append(success_rate[-1])
57 | # df_eval.append(eval_reward)
58 | df_legend.append(np.array([subfolder.upper()] * len(timesteps)))
59 |
60 | print(subfolder, np.mean(last_sr))
61 | else:
62 | for subfolder in subfolders:
63 | last_sr = []
64 | for i in range(3):
65 |                 eval_file = os.path.join(folder_name, subfolder, str(i), 'eval.csv' if env_name == 'maze_ego' else 'eval_box.csv')
66 |                 if not os.path.exists(eval_file):
67 |                     continue
68 | if subfolder == 'hiro':
69 | raw_success_rate = get_item(eval_file, 'Value')
70 | raw_total_timesteps = get_item(eval_file, 'Step')
71 | sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, fill_value="extrapolate")
72 | elif subfolder == 'dsc':
73 | raw_success_rate = get_item(eval_file, 'success_rate')
74 | raw_total_timesteps = get_item(eval_file, 'timesteps')
75 | sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, bounds_error=False)
76 | else:
77 | progress_file = os.path.join(folder_name, subfolder, str(i), 'progress.csv')
78 | raw_total_timesteps = get_item(progress_file, 'total timesteps')
79 | if subfolder == 'sir':
80 | original_timesteps = get_item(progress_file, 'original_timesteps')
81 | else:
82 | original_timesteps = raw_total_timesteps
83 | expand_fn = interpolate.interp1d(original_timesteps, raw_total_timesteps, fill_value="extrapolate")
84 | success_rate = get_item(eval_file, 'mean_eval_reward')
85 | eval_steps = get_item(eval_file, 'n_updates')
86 | eval_steps = expand_fn(eval_steps)
87 | sr_f = interpolate.interp1d(eval_steps, success_rate, fill_value="extrapolate")
88 | timesteps = np.arange(0, max_timesteps[env_name], max_timesteps[env_name] // 250)
89 | print(timesteps[0], timesteps[-1], raw_total_timesteps[0], raw_total_timesteps[-1])
90 | success_rate = sr_f(timesteps)
91 | timesteps = smooth(timesteps, 20)
92 | success_rate = smooth(success_rate, 20)
93 | # eval_reward = smooth(eval_reward, 20)
94 | df_timesteps.append(timesteps)
95 | df_sr.append(success_rate)
96 | last_sr.append(success_rate[-1])
97 | # df_eval.append(eval_reward)
98 | df_legend.append(np.array([subfolder.upper()] * len(timesteps)))
99 |
100 | print(subfolder, np.mean(last_sr))
101 |
102 | df_timesteps = np.concatenate(df_timesteps, axis=0).tolist()
103 | df_sr = np.concatenate(df_sr, axis=0).tolist()
104 | # df_eval = np.concatenate(df_eval, axis=0).tolist()
105 | df_legend = np.concatenate(df_legend, axis=0).tolist()
106 | data = {'samples': df_timesteps, 'success_rate': df_sr, 'algo': df_legend}
107 | sr_timesteps = pandas.DataFrame(data)
108 |
109 | wspace = .3
110 | bottom = .3
111 | margin = .1
112 | left = .1
113 | width = 1.5 / ((1. - left) / (2 + wspace + margin / 2))
114 | height = 1.5 / ((1. - bottom) / (1 + margin / 2))
115 |
116 | plt.style.use("ggplot")
117 | # plt.rcParams.update({'legend.fontsize': 14})
118 | p = sns.color_palette()
119 | sns.set_palette([p[i] for i in range(len(subfolders))])
120 | f, axes = plt.subplots(1, 1, figsize=(width, height))
121 | sns.lineplot(x='samples', y='success_rate', hue='algo', ax=axes, data=sr_timesteps)
122 | axes.set_xlabel('samples')
123 | axes.set_ylabel('success_rate')
124 | axes.xaxis.get_major_formatter().set_powerlimits((0, 1))
125 | axes.get_legend().remove()
126 |
127 | handles, labels = axes.get_legend_handles_labels()
128 |
129 | f.legend(handles[:], ['SIR', 'HIRO', 'DSC'], loc="lower right", ncol=1, bbox_to_anchor=(0.99, 0.18), title='')
130 | f.subplots_adjust(top=1. - margin / height, bottom=0.21, wspace=wspace, left=left, right=1. - margin / width)
131 | plt.savefig(os.path.join(folder_name, '../', os.path.basename(folder_name) + env_name + '.pdf'))
132 | plt.show()
133 |
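134 | # Usage sketch (hypothetical paths; runs are expected under <folder>/<algo>/<seed>/ as read above):
135 | #   python -m plot.plot_sac_experiment_maze /path/to/logs/maze_box maze_box
136 | # where the second argument is one of 'umaze', 'maze_ego', 'maze_box'.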
--------------------------------------------------------------------------------
/plot/visualize_sac_value.py:
--------------------------------------------------------------------------------
1 | import sys, os
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from matplotlib import cm
5 | from utils.make_env_utils import make_env, get_env_kwargs
6 | from baselines import HER2
7 |
8 |
9 | def gen_value_with_obstacle(obs, model, env_hyperparam):
10 | obstacle_xpos, obstacle_ypos = np.meshgrid(np.linspace(env_hyperparam['xlim'][0], env_hyperparam['xlim'][1], 21),
11 | np.linspace(env_hyperparam['ylim'][0], env_hyperparam['ylim'][1], 21))
12 | grid_shape = obstacle_xpos.shape
13 | _obstacle_xpos = np.reshape(obstacle_xpos, (-1, 1))
14 | _obstacle_ypos = np.reshape(obstacle_ypos, (-1, 1))
15 | batch_obs = np.tile(obs, (_obstacle_xpos.shape[0], 1))
16 | batch_obs[:, 6] = _obstacle_xpos[:, 0]
17 | batch_obs[:, 7] = _obstacle_ypos[:, 0]
18 | batch_obs[:, 12] = batch_obs[:, 6] - batch_obs[:, 0]
19 | batch_obs[:, 13] = batch_obs[:, 7] - batch_obs[:, 1]
20 | # Compute value2
21 | batch_value = model.model.sess.run(model.model.step_ops[6],
22 | {model.model.observations_ph: batch_obs})
23 | grid_value = np.reshape(batch_value, grid_shape)
24 |
25 | # Compute value1
26 | subgoal_obs = np.tile(obs, (_obstacle_xpos.shape[0], 1))
27 | # Achieved goal (current obstacle pos)
28 | subgoal_obs[:, -10: -7] = subgoal_obs[:, 6: 9]
29 | subgoal_obs[:, -7: -5] = np.array([[0., 1.]])
30 | # Desired goal (sampled perturbed obstacle pos)
31 | obstacle_xy = np.concatenate([_obstacle_xpos, _obstacle_ypos, subgoal_obs[:, 8:9]], axis=-1)
32 | subgoal_obs[:, -5: -2] = obstacle_xy
33 | subgoal_obs[:, -2: ] = np.array([[0., 1.]])
34 |     # Value1 aims to answer whether the subgoal is easy to achieve
35 | value1 = model.model.sess.run(model.model.step_ops[6],
36 | {model.model.observations_ph: subgoal_obs})
37 | grid_value1 = np.reshape(value1, grid_shape)
38 |
39 | # min_value = np.min(np.concatenate([np.expand_dims(value1, 1), np.expand_dims(batch_value,1)], axis=1), axis=1)
40 | # grid_value_min = np.reshape(min_value, grid_shape)
41 | normalized_value1 = (value1 - np.min(value1)) / (np.max(value1) - np.min(value1))
42 | normalized_value2 = (batch_value - np.min(batch_value)) / (np.max(batch_value) - np.min(batch_value))
43 | value_prod = normalized_value1 * normalized_value2
44 | grid_value_prod = np.reshape(value_prod, grid_shape)
45 |
46 | return obstacle_xpos, obstacle_ypos, grid_value, grid_value1, grid_value_prod
47 |
48 |
49 | if __name__ == '__main__':
50 | if len(sys.argv) < 2:
51 |         sys.exit('Usage: python -m plot.visualize_sac_value [load_path]')
52 | load_path = sys.argv[1]
53 | env_name = 'FetchPushWallObstacle-v4'
54 | env_kwargs = get_env_kwargs(env_name, random_ratio=0.0, reward_type="sparse")
55 | env_hyperparam = dict(xlim=(1.05, 1.55), ylim=(0.4, 1.1))
56 | n_cpu = 1
57 |     env = make_env(env_id=env_name, rank=0, log_dir=None, flatten_dict=True, kwargs=env_kwargs)
58 |
59 | model = HER2.load(load_path)
60 | fig, ax = plt.subplots(1, 1, figsize=(8, 8))
61 | plt.rcParams.update({'font.size': 20, 'xtick.labelsize': 20, 'ytick.labelsize': 20,
62 | 'axes.labelsize': 20})
63 | obs = env.reset()
64 | while not (obs[4] > 0.70 and obs[4] < 0.80 and obs[3] < 1.5):
65 | obs = env.reset()
66 | env.set_goal(np.array([1.195, 0.75, 0.425, 1, 0]))
67 | obs = env.get_obs()
68 | obs = np.concatenate([obs[key] for key in ['observation', 'achieved_goal', 'desired_goal']])
69 | img = env.render(mode='rgb_array')
70 | xs, ys, zs, value1s, value_prods = gen_value_with_obstacle(obs, model, env_hyperparam)
71 | print('gripper', obs[:3], 'box', obs[3:6], 'obstacle', obs[6:9], )
72 | np.save('xs.npy', xs)
73 | np.save('ys.npy', ys)
74 | np.save('value1.npy', value1s)
75 | np.save('value2.npy', zs)
76 | np.save('value_prod.npy', value_prods)
77 | plt.imsave(os.path.join(os.path.dirname(load_path), 'obs.png'), img)
78 |
79 | ax.cla()
80 | surf = ax.contourf((xs - 1.05) / 0.5, (ys - 0.4) / 0.7, value_prods, 15, cmap=cm.coolwarm, vmin=-0.0, vmax=1)
81 | ax.set_xlabel('x', fontsize=24)
82 | ax.set_ylabel('y', fontsize=24)
83 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [0, (0.65 - 0.4) / 0.7], 'k', linestyle='--')
84 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [(0.85 - 0.4) / 0.7, (1.1 - 0.4) / 0.7], 'k', linestyle='--')
85 | ax.axis([0., 1., 0., 1.])
86 | cb = plt.colorbar(surf)
87 | plt.tight_layout()
88 | plt.savefig('value_prod.png')
89 | ax.set_title("Value prod")
90 | plt.show()
91 |
92 | fig, ax = plt.subplots(1, 1, figsize=(8, 8))
93 | surf = ax.contourf((xs - 1.05) / 0.5, (ys - 0.4) / 0.7, value1s, 15, cmap=cm.coolwarm, vmin=-0.0, vmax=1)
94 | ax.set_xlabel('x', fontsize=24)
95 | ax.set_ylabel('y', fontsize=24)
96 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [0, (0.65 - 0.4) / 0.7], 'k', linestyle='--')
97 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [(0.85 - 0.4) / 0.7, (1.1 - 0.4) / 0.7], 'k', linestyle='--')
98 | ax.axis([0., 1., 0., 1.])
99 | cb = plt.colorbar(surf)
100 | plt.tight_layout()
101 | plt.savefig('value1.png')
102 | ax.set_title("Value 1")
103 | plt.show()
104 |
105 | fig, ax = plt.subplots(1, 1, figsize=(8, 8))
106 | surf = ax.contourf((xs - 1.05) / 0.5, (ys - 0.4) / 0.7, zs, 15, cmap=cm.coolwarm, vmin=-0.0, vmax=1)
107 | ax.set_xlabel('x', fontsize=24)
108 | ax.set_ylabel('y', fontsize=24)
109 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [0, (0.65 - 0.4) / 0.7], 'k', linestyle='--')
110 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [(0.85 - 0.4) / 0.7, (1.1 - 0.4) / 0.7], 'k', linestyle='--')
111 | ax.axis([0., 1., 0., 1.])
112 | cb = plt.colorbar(surf)
113 | plt.tight_layout()
114 | plt.savefig('value2.png')
115 | ax.set_title("Value 2")
116 | plt.show()
117 |
118 | fig, ax = plt.subplots(1, 1, figsize=(8, 8))
119 | surf = ax.contourf((xs - 1.05) / 0.5, (ys - 0.4) / 0.7, (value1s + zs) / 2, 15, cmap=cm.coolwarm, vmin=-0.0, vmax=1)
120 | ax.set_xlabel('x', fontsize=24)
121 | ax.set_ylabel('y', fontsize=24)
122 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [0, (0.65 - 0.4) / 0.7], 'k', linestyle='--')
123 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [(0.85 - 0.4) / 0.7, (1.1 - 0.4) / 0.7], 'k', linestyle='--')
124 | ax.axis([0., 1., 0., 1.])
125 | cb = plt.colorbar(surf)
126 | plt.tight_layout()
127 | plt.savefig('value_ave.png')
128 | ax.set_title("Value average")
129 | plt.show()
130 |
131 | fig, ax = plt.subplots(1, 1, figsize=(8, 8))
132 | surf = ax.contourf((xs - 1.05) / 0.5, (ys - 0.4) / 0.7, np.min(np.array([value1s, zs]), axis=0), 15, cmap=cm.coolwarm, vmin=-0.0, vmax=1)
133 | ax.set_xlabel('x', fontsize=24)
134 | ax.set_ylabel('y', fontsize=24)
135 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [0, (0.65 - 0.4) / 0.7], 'k', linestyle='--')
136 | ax.plot([(1.25 - 1.05) / 0.5, (1.25 - 1.05) / 0.5], [(0.85 - 0.4) / 0.7, (1.1 - 0.4) / 0.7], 'k', linestyle='--')
137 | ax.axis([0., 1., 0., 1.])
138 | cb = plt.colorbar(surf)
139 | plt.tight_layout()
140 | plt.savefig('value_min.png')
141 | ax.set_title("Value min")
142 | plt.show()
143 |
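144 | # Usage sketch (hypothetical checkpoint path; mirrors the usage string printed above):
145 | #   python -m plot.visualize_sac_value /path/to/logs/her_sac_push/model_10.zip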
--------------------------------------------------------------------------------
/plot/plot_sac_experiment.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas
3 | import numpy as np
4 | import sys, os
5 | from scipy import interpolate
6 | import matplotlib.pyplot as plt
7 |
8 |
9 | def get_item(log_file, label):
10 | data = pandas.read_csv(log_file, index_col=None, comment='#', error_bad_lines=True)
11 | return data[label].values
12 |
13 |
14 | def smooth(array, window):
15 | out = np.zeros(array.shape[0] - window)
16 | for i in range(out.shape[0]):
17 | out[i] = np.mean(array[i:i + window])
18 | return out
19 |
20 |
21 | if __name__ == '__main__':
22 | folder_name = sys.argv[1]
23 | env_name = sys.argv[2]
24 | assert env_name in ['push', 'stack2', 'stack3', 'particle']
25 | # assert mode in ['train', 'hard', 'iteration']
26 | max_timesteps = {'push': 1.45e7,
27 | # 'stack2': 2.8e7,
28 | 'stack2': 2.3e7,
29 | 'stack3': 1e8,
30 | 'particle': 9.5e7,
31 | }
32 | max_iterationss = {'push': 440000,
33 | 'stack2': 8.9e5,
34 | 'stack3': 2.5e6,
35 | }
36 | df_timesteps, df_sr, df_eval, df_legend, df_iteration, df_eval_iteration, df_legend_iteration = [], [], [], [], [], [], []
37 | subfolders = ['sac', 'sir', 'sil', 'ds']
38 | if 'particle' in folder_name:
39 | subfolders = ['sac', 'sir', 'sil']
40 | elif 'push_random0.7' in folder_name:
41 | subfolders = ['sac', 'sir', 'sil', 'ds2']
42 | elif 'push_random1.0' in folder_name:
43 | subfolders = ['sac', 'sir', 'sil', 'ds']
44 | elif 'stack_2obj' in folder_name or 'stack_3obj' in folder_name:
45 | subfolders = ['sac', 'sir_noknowledge', 'sil', 'ds']
46 | for subfolder in subfolders:
47 | last_sr = []
48 | last_eval = []
49 | for i in range(4):
50 | if not os.path.exists(os.path.join(folder_name, subfolder, str(i), 'progress.csv')):
51 | continue
52 | progress_file = os.path.join(folder_name, subfolder, str(i), 'progress.csv')
53 | eval_file = os.path.join(folder_name, subfolder, str(i), 'eval.csv')
54 |             if subfolder in ('ds', 'ds2'):
55 | raw_success_rate = get_item(progress_file, 'success rate')
56 | else:
57 | raw_success_rate = get_item(progress_file, 'ep_rewmean')
58 | raw_total_timesteps = get_item(progress_file, 'total timesteps')
59 | try:
60 | raw_original_timesteps = get_item(progress_file, 'original_timesteps')
61 | except KeyError:
62 | raw_original_timesteps = raw_total_timesteps
63 | raw_eval_timesteps = get_item(eval_file, 'n_updates')
64 | raw_eval_reward = get_item(eval_file, 'mean_eval_reward')
65 | print(raw_total_timesteps.shape, raw_eval_reward.shape)
66 | sr_f = interpolate.interp1d(raw_total_timesteps, raw_success_rate, fill_value="extrapolate")
67 | eval_f = interpolate.interp1d(raw_eval_timesteps, raw_eval_reward, fill_value="extrapolate")
68 | step_shrink_fn = interpolate.interp1d(raw_total_timesteps, raw_original_timesteps, fill_value="extrapolate")
69 |
70 | timesteps = np.arange(0, max_timesteps[env_name], max_timesteps[env_name] // 500)
71 | print(timesteps[0], timesteps[-1], raw_total_timesteps[0], raw_total_timesteps[-1])
72 | success_rate = sr_f(timesteps)
73 | eval_reward = eval_f(step_shrink_fn(timesteps))
74 | timesteps = smooth(timesteps, 50)
75 | success_rate = smooth(success_rate, 50)
76 | eval_reward = smooth(eval_reward, 50)
77 | df_timesteps.append(timesteps)
78 | df_sr.append(success_rate)
79 | df_eval.append(eval_reward)
80 | last_sr.append(success_rate[-1])
81 | last_eval.append(eval_reward[-1])
82 | df_legend.append(np.array([subfolder.upper()] * len(timesteps)))
83 |
84 | # raw_iterations = get_item(progress_file, 'n_updates')
85 | # iter_step_convert_fn = interpolate.interp1d(raw_iterations, raw_original_timesteps, fill_value="extrapolate")
86 | # iterations = np.arange(0, max_iterationss[env_name], max_iterationss[env_name] // 500)
87 | # eval_iteration = eval_f(iter_step_convert_fn(iterations))
88 | # iterations = smooth(iterations, 50)
89 | # eval_iteration = smooth(eval_iteration, 50)
90 | # df_iteration.append(iterations)
91 | # df_eval_iteration.append(eval_iteration)
92 | # df_legend_iteration.append(np.array([subfolder.upper()] * len(iterations)))
93 | print(subfolder, 'sr', np.mean(last_sr), 'eval', np.mean(last_eval))
94 | df_timesteps = np.concatenate(df_timesteps, axis=0).tolist()
95 | df_sr = np.concatenate(df_sr, axis=0).tolist()
96 | df_eval = np.concatenate(df_eval, axis=0).tolist()
97 | df_legend = np.concatenate(df_legend, axis=0).tolist()
98 | # df_iteration = np.concatenate(df_iteration, axis=0).tolist()
99 | # df_eval_iteration = np.concatenate(df_eval_iteration, axis=0).tolist()
100 | # df_legend_iteration = np.concatenate(df_legend_iteration, axis=0).tolist()
101 | data = {'samples': df_timesteps, 'success_rate': df_sr, 'algo': df_legend}
102 | sr_timesteps = pandas.DataFrame(data)
103 | data = {'samples': df_timesteps, 'eval': df_eval, 'algo': df_legend}
104 | eval_timesteps = pandas.DataFrame(data)
105 | # data = {'iterations': df_iteration, 'eval': df_eval_iteration, 'algo': df_legend_iteration}
106 | # eval_iteration = pandas.DataFrame(data)
107 |
108 | wspace = .3
109 | bottom = .3
110 | margin = .1
111 | # left = .08
112 | left = .1
113 | # width = 3.5 / ((1. - left) / (2 + wspace + margin / 2))
114 | width = 2.15 / ((1. - left) / (2 + wspace + margin / 2))
115 | height = 1.5 / ((1. - bottom) / (1 + margin / 2))
116 |
117 | plt.style.use("ggplot")
118 | # plt.rcParams.update({'legend.fontsize': 14})
119 | p = sns.color_palette()
120 | sns.set_palette([p[i] for i in range(len(subfolders))])
121 | # f, axes = plt.subplots(1, 3, figsize=(width, height))
122 | f, axes = plt.subplots(1, 2, figsize=(width, height))
123 | sns.lineplot(x='samples', y='success_rate', hue='algo', ax=axes[0], data=sr_timesteps)
124 | axes[0].set_xlabel('samples')
125 | axes[0].set_ylabel('avg. succ. rate')
126 | axes[0].get_legend().remove()
127 | sns.lineplot(x='samples', y='eval', hue='algo', ax=axes[1], data=eval_timesteps)
128 | axes[1].set_xlabel('samples')
129 | axes[1].set_ylabel('hard succ. rate')
130 | axes[1].get_legend().remove()
131 | # sns.lineplot(x='iterations', y='eval', hue='algo', ax=axes[2], data=eval_iteration)
132 | # axes[2].xaxis.get_major_formatter().set_powerlimits((0, 1))
133 | # axes[2].set_xlabel('iterations')
134 | # axes[2].set_ylabel('')
135 | # axes[2].get_legend().remove()
136 | handles, labels = axes[1].get_legend_handles_labels()
137 | print(handles)
138 |
139 | f.legend(handles[:], ['SAC', 'SIR', 'SIL', 'DS'][:len(subfolders)], loc="lower right", ncol=1, bbox_to_anchor=(0.49, 0.18), title='')
140 | f.subplots_adjust(top=1. - margin / height, bottom=0.21, wspace=wspace, left=left, right=1. - margin / width)
141 | plt.savefig(os.path.join(folder_name, '../', os.path.basename(folder_name) + 'clean.pdf'))
142 | plt.show()
143 |
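144 | # Usage sketch (expects <folder>/<algo>/<seed>/progress.csv and eval.csv, as read above):
145 | #   python -m plot.plot_sac_experiment /path/to/logs/push_random1.0 push
146 | # where the second argument is one of 'push', 'stack2', 'stack3', 'particle'.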
--------------------------------------------------------------------------------
/run_ppo.py:
--------------------------------------------------------------------------------
1 | from baselines import PPO2
2 | from stable_baselines.common.policies import register_policy
3 | from stable_baselines.common import set_global_seeds
4 | from stable_baselines.common.vec_env import SubprocVecEnv
5 | from utils.log_utils import eval_model, log_eval, stack_eval_model
6 |
7 | from utils.make_env_utils import make_env, configure_logger, get_env_kwargs, get_policy_kwargs, get_train_kwargs, \
8 | get_num_workers
9 | import numpy as np
10 |
11 | import os, time, argparse
12 | import matplotlib.pyplot as plt
13 |
14 |
15 | def arg_parse():
16 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
17 | parser.add_argument('--env', default='FetchPushWallObstacle-v4')
18 | parser.add_argument('--policy', type=str, default='MlpPolicy')
19 | parser.add_argument('--seed', type=int, default=42)
20 | parser.add_argument('--num_timesteps', type=float, default=1e8)
21 | parser.add_argument('--reward_type', type=str, default='sparse')
22 | parser.add_argument('--n_object', type=int, default=2) # Only used for stacking
23 | parser.add_argument('--log_path', default=None, type=str)
24 | parser.add_argument('--load_path', default=None, type=str)
25 | parser.add_argument('--random_ratio', default=1.0, type=float)
26 | parser.add_argument('--curriculum', action="store_true", default=False)
27 | parser.add_argument('--sequential', action="store_true", default=False)
28 | parser.add_argument('--gamma', default=0.99, type=float)
29 | parser.add_argument('--play', action="store_true", default=False)
30 | parser.add_argument('--export_video', action="store_true", default=False)
31 | args = parser.parse_args()
32 | return args
33 |
34 |
35 | def main(args):
36 | log_dir = args.log_path if (args.log_path is not None) else \
37 | "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
38 | configure_logger(log_dir)
39 |
40 | set_global_seeds(args.seed)
41 |
42 | n_cpu = get_num_workers(args.env) if not args.play else 1
43 | env_kwargs = get_env_kwargs(args.env, args.random_ratio, args.sequential, args.reward_type,
44 | args.n_object, args.curriculum)
45 | def make_thunk(rank):
46 | return lambda: make_env(env_id=args.env, rank=rank, log_dir=log_dir, flatten_dict=True, kwargs=env_kwargs)
47 | env = SubprocVecEnv([make_thunk(i) for i in range(n_cpu)])
48 |
49 | eval_env_kwargs = env_kwargs.copy()
50 | eval_env_kwargs['random_ratio'] = 0.0
51 | if "use_cu" in eval_env_kwargs:
52 | eval_env_kwargs['use_cu'] = False
53 | eval_env = make_env(env_id=args.env, rank=0, flatten_dict=True, kwargs=eval_env_kwargs)
54 | print(eval_env)
55 | if not args.play:
56 | os.makedirs(log_dir, exist_ok=True)
57 | train_kwargs = get_train_kwargs("ppo", args, parsed_action_noise=None, eval_env=eval_env)
58 |
59 | # policy = 'MlpPolicy'
60 | from utils.attention_policy import AttentionPolicy
61 | register_policy('AttentionPolicy', AttentionPolicy)
62 | policy_kwargs = get_policy_kwargs("ppo", args)
63 | print(policy_kwargs)
64 |
65 | model = PPO2(args.policy, env, verbose=1, nminibatches=32, lam=0.95, noptepochs=10,
66 | ent_coef=0.01, learning_rate=3e-4, cliprange=0.2, policy_kwargs=policy_kwargs, **train_kwargs)
67 | print(model.get_parameter_list())
68 |
69 | def callback(_locals, _globals):
70 | num_update = _locals["update"]
71 | if 'FetchStack' in args.env:
72 | mean_eval_reward = stack_eval_model(eval_env, _locals["self"])
73 | else:
74 | mean_eval_reward = eval_model(eval_env, _locals["self"])
75 | log_eval(num_update, mean_eval_reward)
76 | if num_update % 10 == 0:
77 | model_path = os.path.join(log_dir, 'model_' + str(num_update // 10))
78 | model.save(model_path)
79 | print('model saved to', model_path)
80 | return True
81 |
82 | model.learn(total_timesteps=int(args.num_timesteps), callback=callback, seed=args.seed, log_interval=1)
83 | model.save(os.path.join(log_dir, 'final'))
84 |
85 | else:
86 | assert args.load_path is not None
87 | model = PPO2.load(args.load_path)
88 | fig, ax = plt.subplots(1, 1, figsize=(8, 8))
89 | obs = env.reset()
90 | goal_dim = env.get_attr('goal')[0].shape[0]
91 | if 'FetchStack' in args.env:
92 | while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
93 | env.get_attr('task_mode')[0] != 1:
94 | obs = env.reset()
95 | elif 'FetchPush' in args.env:
96 | while not (1.25 < obs[0][6] < 1.33 and obs[0][7] < 0.61 and 0.7 < obs[0][4] < 0.8):
97 | obs = env.reset()
98 | env.env_method('set_goal', np.array([1.2, 0.75, 0.425, 1, 0]))
99 | obs = env.env_method('get_obs')
100 | obs[0] = np.concatenate([obs[0][key] for key in ['observation', 'achieved_goal', 'desired_goal']])
101 | else:
102 | while np.argmax(obs[0][-goal_dim+3:]) != 0:
103 | obs = env.reset()
104 | print('achieved_goal', obs[0][-2*goal_dim: -goal_dim], 'goal', obs[0][-goal_dim:])
105 | episode_reward = 0.0
106 | num_episode = 0
107 | frame_idx = 0
108 | images = []
109 | if 'max_episode_steps' not in env_kwargs.keys():
110 | env_kwargs['max_episode_steps'] = 100
111 | for i in range(env_kwargs['max_episode_steps'] * 10):
112 | img = env.render(mode='rgb_array')
113 | ax.cla()
114 | ax.imshow(img)
115 | if env.get_attr('goal')[0].shape[0] <= 3:
116 | ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx))
117 | else:
118 | ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx) +
119 | ', goal idx ' + str(np.argmax(env.get_attr('goal')[0][3:])))
120 | if 'FetchStack' in args.env:
121 | tasks = ['pick and place', 'stack']
122 | ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx)
123 | + ', task: ' + tasks[np.argmax(obs[0][-2*goal_dim-2:-2*goal_dim])])
124 | images.append(img)
125 | action, _ = model.predict(obs)
126 | obs, reward, done, _ = env.step(action)
127 | episode_reward += reward
128 | frame_idx += 1
129 | if not args.export_video:
130 | plt.pause(0.1)
131 | else:
132 | plt.imsave(os.path.join(os.path.dirname(args.load_path), 'tempimg%d.png' % i), img)
133 | if done:
134 | print('episode_reward', episode_reward)
135 | if 'FetchStack' in args.env:
136 | while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
137 | env.get_attr('task_mode')[0] != 1:
138 | obs = env.reset()
139 | else:
140 | while np.argmax(obs[0][-goal_dim + 3:]) != 0:
141 | obs = env.reset()
142 | print('goal', obs[0][-goal_dim:])
143 | episode_reward = 0.0
144 | frame_idx = 0
145 | num_episode += 1
146 | if num_episode >= 10:
147 | break
148 | if args.export_video:
149 | os.system('ffmpeg -r 5 -start_number 0 -i ' + os.path.dirname(args.load_path) +
150 | '/tempimg%d.png -c:v libx264 -pix_fmt yuv420p ' +
151 | os.path.join(os.path.dirname(args.load_path), args.env + '.mp4'))
152 | for i in range(env_kwargs['max_episode_steps'] * 10):
153 | try:
154 | os.remove(os.path.join(os.path.dirname(args.load_path), 'tempimg' + str(i) + '.png'))
155 |             except OSError:
156 | pass
157 |
158 |
159 | if __name__ == '__main__':
160 | args = arg_parse()
161 | main(args)
162 |
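163 | # Example invocations (hypothetical paths; flags correspond to arg_parse above):
164 | #   Train:    python run_ppo.py --env FetchPushWallObstacle-v4 --num_timesteps 1e8 --log_path logs/ppo_push
165 | #   Playback: python run_ppo.py --env FetchPushWallObstacle-v4 --play --load_path logs/ppo_push/final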
--------------------------------------------------------------------------------
/baselines/her/her.py:
--------------------------------------------------------------------------------
1 | import functools
2 |
3 | from stable_baselines.common import BaseRLModel
4 | from stable_baselines.common import OffPolicyRLModel
5 | from stable_baselines.common.base_class import _UnvecWrapper
6 | from stable_baselines.common.vec_env import VecEnvWrapper
7 | from .replay_buffer import HindsightExperienceReplayWrapper, KEY_TO_GOAL_STRATEGY
8 | from .utils import HERGoalEnvWrapper
9 |
10 |
11 | class HER2(BaseRLModel):
12 | """
13 | Hindsight Experience Replay (HER) https://arxiv.org/abs/1707.01495
14 | :param policy: (BasePolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...)
15 | :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
16 | :param model_class: (OffPolicyRLModel) The off policy RL model to apply Hindsight Experience Replay
17 | currently supported: DQN, DDPG, SAC
18 |     :param n_sampled_goal: (int) number of virtual goals sampled per real transition for hindsight replay
19 |     :param goal_selection_strategy: (GoalSelectionStrategy or str) strategy for sampling relabeled goals ('future', 'final', 'episode' or 'random')
20 | """
21 |
22 | def __init__(self, policy, env, model_class, n_sampled_goal=4,
23 | goal_selection_strategy='future', num_workers=1, *args, **kwargs):
24 |
25 | assert not isinstance(env, VecEnvWrapper), "HER does not support VecEnvWrapper"
26 |
27 | super().__init__(policy=policy, env=env, verbose=kwargs.get('verbose', 0),
28 | # policy_base=None, requires_vec_env=(num_workers > 1))
29 | policy_base=None, requires_vec_env=kwargs.get('requires_vec_env', True))
30 |
31 | self.model_class = model_class
32 | self.replay_wrapper = None
33 | self.n_workers = num_workers
34 | # Save dict observation space (used for checks at loading time)
35 | if env is not None:
36 | self.observation_space = env.observation_space
37 | self.action_space = env.action_space
38 |
39 | # Convert string to GoalSelectionStrategy object
40 | if isinstance(goal_selection_strategy, str):
41 | assert goal_selection_strategy in KEY_TO_GOAL_STRATEGY.keys(), "Unknown goal selection strategy"
42 | goal_selection_strategy = KEY_TO_GOAL_STRATEGY[goal_selection_strategy]
43 |
44 | self.n_sampled_goal = n_sampled_goal
45 | self.goal_selection_strategy = goal_selection_strategy
46 |
47 | if self.env is not None:
48 | self._create_replay_wrapper(self.env)
49 |
50 | assert issubclass(model_class, OffPolicyRLModel), \
51 | "Error: HER only works with Off policy model (such as DDPG, SAC, TD3 and DQN)."
52 |
53 | self.model = self.model_class(policy, self.env, *args, **kwargs)
54 | # Patch to support saving/loading
55 | self.model._save_to_file = self._save_to_file
56 |
57 | def _create_replay_wrapper(self, env):
58 | """
59 | Wrap the environment in a HERGoalEnvWrapper
60 | if needed and create the replay buffer wrapper.
61 | """
62 | if not isinstance(env, HERGoalEnvWrapper):
63 | env = HERGoalEnvWrapper(env)
64 |
65 | self.env = env
66 | # NOTE: we cannot do that check directly with VecEnv
67 | # maybe we can try calling `compute_reward()` ?
68 | # assert isinstance(self.env, gym.GoalEnv), "HER only supports gym.GoalEnv"
69 |
70 | # if self.n_workers > 1:
71 | # replay_wrapper = HindsightExperienceReplayWrapper
72 | # else:
73 | # replay_wrapper = SingleHindsightExperienceReplayWrapper
74 | replay_wrapper = HindsightExperienceReplayWrapper
75 | self.replay_wrapper = functools.partial(replay_wrapper,
76 | n_sampled_goal=self.n_sampled_goal,
77 | goal_selection_strategy=self.goal_selection_strategy,
78 | wrapped_env=self.env)
79 |
80 | def set_env(self, env):
81 | assert not isinstance(env, VecEnvWrapper), "HER does not support VecEnvWrapper"
82 | super().set_env(env)
83 | self._create_replay_wrapper(self.env)
84 | self.model.set_env(self.env)
85 |
86 | def get_env(self):
87 | return self.env
88 |
89 | def get_parameter_list(self):
90 | return self.model.get_parameter_list()
91 |
92 | def __getattr__(self, attr):
93 | """
94 | Wrap the RL model.
95 | :param attr: (str)
96 | :return: (Any)
97 | """
98 | if attr in self.__dict__:
99 | return getattr(self, attr)
100 | return getattr(self.model, attr)
101 |
102 | def __set_attr__(self, attr, value):
103 | if attr in self.__dict__:
104 | setattr(self, attr, value)
105 | else:
106 | setattr(self.model, attr, value)
107 |
108 | def _get_pretrain_placeholders(self):
109 | return self.model._get_pretrain_placeholders()
110 |
111 | def setup_model(self):
112 | pass
113 |
114 | def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="HER",
115 | reset_num_timesteps=True):
116 | return self.model.learn(total_timesteps, callback=callback, seed=seed, log_interval=log_interval,
117 | tb_log_name=tb_log_name, reset_num_timesteps=reset_num_timesteps,
118 | replay_wrapper=self.replay_wrapper)
119 |
120 | def _check_obs(self, observation):
121 | if isinstance(observation, dict):
122 | if self.env is not None:
123 | if len(observation['observation'].shape) > 1:
124 | observation = _UnvecWrapper.unvec_obs(observation)
125 | return [self.env.convert_dict_to_obs(observation)]
126 | return self.env.convert_dict_to_obs(observation)
127 | else:
128 | raise ValueError("You must either pass an env to HER or wrap your env using HERGoalEnvWrapper")
129 | return observation
130 |
131 | def predict(self, observation, state=None, mask=None, deterministic=True):
132 | return self.model.predict(self._check_obs(observation), state, mask, deterministic)
133 |
134 | def action_probability(self, observation, state=None, mask=None, actions=None, logp=False):
135 | return self.model.action_probability(self._check_obs(observation), state, mask, actions, logp)
136 |
137 | def _save_to_file(self, save_path, data=None, params=None, cloudpickle=False):
138 | # HACK to save the replay wrapper
139 | # or better to save only the replay strategy and its params?
140 | # it will not work with VecEnv
141 | data['n_sampled_goal'] = self.n_sampled_goal
142 | data['goal_selection_strategy'] = self.goal_selection_strategy
143 | data['model_class'] = self.model_class
144 | data['her_obs_space'] = self.observation_space
145 | data['her_action_space'] = self.action_space
146 | super()._save_to_file(save_path, data, params, cloudpickle=cloudpickle)
147 |
148 | def save(self, save_path, cloudpickle=False):
149 | self.model.save(save_path, cloudpickle=cloudpickle)
150 |
151 | @classmethod
152 | def load(cls, load_path, env=None, custom_objects=None, **kwargs):
153 | data, _ = cls._load_from_file(load_path, custom_objects=custom_objects)
154 |
155 | if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data['policy_kwargs']:
156 | raise ValueError("The specified policy kwargs do not equal the stored policy kwargs. "
157 | "Stored kwargs: {}, specified kwargs: {}".format(data['policy_kwargs'],
158 | kwargs['policy_kwargs']))
159 |
160 | model = cls(policy=data["policy"], env=env, model_class=data['model_class'],
161 | n_sampled_goal=data['n_sampled_goal'],
162 | goal_selection_strategy=data['goal_selection_strategy'],
163 | _init_setup_model=False)
164 | model.__dict__['observation_space'] = data['her_obs_space']
165 | model.__dict__['action_space'] = data['her_action_space']
166 | model.model = data['model_class'].load(load_path, model.get_env(), **kwargs)
167 | model.model._save_to_file = model._save_to_file
168 | return model
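169 | # Minimal usage sketch (an assumption, not part of this file: SAC as the wrapped off-policy model
170 | # and `env` a dict-observation goal env):
171 | #   from stable_baselines import SAC
172 | #   model = HER2('MlpPolicy', env, model_class=SAC, n_sampled_goal=4,
173 | #                goal_selection_strategy='future', verbose=1)
174 | #   model.learn(total_timesteps=int(1e6))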
--------------------------------------------------------------------------------
/run_her_augment.py:
--------------------------------------------------------------------------------
1 | from baselines import HER2, SAC_SIR
2 | from stable_baselines.sac.policies import FeedForwardPolicy as SACPolicy
3 | from stable_baselines.common.policies import register_policy
4 | from utils.parallel_subproc_vec_env import ParallelSubprocVecEnv
5 | from gym.wrappers import FlattenDictWrapper
6 | from stable_baselines.common import set_global_seeds
7 | from stable_baselines import logger
8 | from utils.make_env_utils import make_env, get_env_kwargs, get_train_kwargs, get_policy_kwargs
9 | import os, time
10 | import argparse
11 | import numpy as np
12 | from utils.log_utils import eval_model, log_eval, stack_eval_model, egonav_eval_model
13 |
14 | try:
15 | from mpi4py import MPI
16 | except ImportError:
17 | MPI = None
18 |
19 |
20 | def arg_parse():
21 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
22 | parser.add_argument('--env', default='FetchPushWallObstacle-v4')
23 | parser.add_argument('--seed', type=int, default=42)
24 | parser.add_argument('--policy', type=str, default='CustomSACPolicy')
25 | parser.add_argument('--num_workers', type=int, default=32)
26 | parser.add_argument('--learning_rate', type=float, default=3e-4)
27 | parser.add_argument('--action_noise', type=str, default='none')
28 | parser.add_argument('--num_timesteps', type=float, default=3e6)
29 | parser.add_argument('--log_path', default=None, type=str)
30 | parser.add_argument('--load_path', default=None, type=str)
31 | parser.add_argument('--play', action="store_true", default=False)
32 | parser.add_argument('--batch_size', type=int, default=64)
33 | parser.add_argument('--random_ratio', type=float, default=1.0)
34 | parser.add_argument('--gamma', type=float, default=0.95)
35 | parser.add_argument('--reward_type', type=str, default='sparse')
36 | parser.add_argument('--n_object', type=int, default=2)
37 | parser.add_argument('--start_augment', type=float, default=0)
38 | parser.add_argument('--priority', action="store_true", default=False)
39 | parser.add_argument('--curriculum', action="store_true", default=False)
40 | parser.add_argument('--imitation_coef', type=float, default=5)
41 | parser.add_argument('--sequential', action="store_true", default=False)
42 | parser.add_argument('--export_gif', action="store_true", default=False)
43 | args = parser.parse_args()
44 | return args
45 |
46 |
47 | def configure_logger(log_path, **kwargs):
48 | if log_path is not None:
49 | logger.configure(log_path)
50 | else:
51 | logger.configure(**kwargs)
52 |
53 |
54 | def main(args):
55 | log_dir = args.log_path if (args.log_path is not None) else "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
56 | if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
57 | rank = 0
58 | configure_logger(log_dir)
59 | else:
60 | rank = MPI.COMM_WORLD.Get_rank()
61 | configure_logger(log_dir, format_strs=[])
62 |
63 | set_global_seeds(args.seed)
64 |
65 |     model_class = SAC_SIR  # the off-policy model class that HER2 wraps below
66 |
67 | env_kwargs = get_env_kwargs(args.env, random_ratio=args.random_ratio, sequential=args.sequential,
68 | reward_type=args.reward_type, n_object=args.n_object)
69 |
70 | def make_thunk(rank):
71 | return lambda: make_env(env_id=args.env, rank=rank, log_dir=log_dir, kwargs=env_kwargs)
72 |
73 | env = ParallelSubprocVecEnv([make_thunk(i) for i in range(args.num_workers)], reset_when_done=True)
74 |
75 | def make_thunk_aug(rank):
76 | return lambda: FlattenDictWrapper(make_env(env_id=aug_env_name, rank=rank, kwargs=aug_env_kwargs),
77 | ['observation', 'achieved_goal', 'desired_goal'])
78 |
79 | aug_env_kwargs = env_kwargs.copy()
80 | del aug_env_kwargs['max_episode_steps']
81 | aug_env_name = args.env.split('-')[0] + 'Unlimit-' + args.env.split('-')[1]
82 | aug_env = ParallelSubprocVecEnv([make_thunk_aug(i) for i in range(args.num_workers)], reset_when_done=False)
83 |
84 | if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')):
85 | os.remove(os.path.join(logger.get_dir(), 'eval.csv'))
86 |         print('Removed existing eval.csv')
87 | eval_env_kwargs = env_kwargs.copy()
88 | eval_env_kwargs['random_ratio'] = 0.0
89 | eval_env = make_env(env_id=args.env, rank=0, kwargs=eval_env_kwargs)
90 | eval_env = FlattenDictWrapper(eval_env, ['observation', 'achieved_goal', 'desired_goal'])
91 |
92 | if not args.play:
93 | os.makedirs(log_dir, exist_ok=True)
94 |
95 | # Available strategies (cf paper): future, final, episode, random
96 | goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE
97 |
98 | if not args.play:
99 | from stable_baselines.ddpg.noise import NormalActionNoise
100 | noise_type = args.action_noise.split('_')[0]
101 | if noise_type == 'none':
102 | parsed_action_noise = None
103 | elif noise_type == 'normal':
104 | sigma = float(args.action_noise.split('_')[1])
105 | parsed_action_noise = NormalActionNoise(mean=np.zeros(env.action_space.shape),
106 | sigma=sigma * np.ones(env.action_space.shape))
107 | else:
108 | raise NotImplementedError
109 |
110 | train_kwargs = get_train_kwargs("sac_sir", args, parsed_action_noise, eval_env, aug_env)
111 |
112 | def callback(_locals, _globals):
113 | if _locals['step'] % int(1e3) == 0:
114 | if 'FetchStack' in args.env:
115 | mean_eval_reward = stack_eval_model(eval_env, _locals["self"],
116 | init_on_table=(args.env=='FetchStack-v2'))
117 | elif 'MasspointPushDoubleObstacle-v2' in args.env:
118 | mean_eval_reward = egonav_eval_model(eval_env, _locals["self"], env_kwargs["random_ratio"], fixed_goal=np.array([4., 4., 0.15, 0., 0., 0., 1.]))
119 | mean_eval_reward2 = egonav_eval_model(eval_env, _locals["self"], env_kwargs["random_ratio"],
120 | goal_idx=0, fixed_goal=np.array([4., 4., 0.15, 1., 0., 0., 0.]))
121 | log_eval(_locals['self'].num_timesteps, mean_eval_reward2, file_name="eval_box.csv")
122 | else:
123 | mean_eval_reward = eval_model(eval_env, _locals["self"])
124 | log_eval(_locals['self'].num_timesteps, mean_eval_reward)
125 | if _locals['step'] % int(2e4) == 0:
126 | model_path = os.path.join(log_dir, 'model_' + str(_locals['step'] // int(2e4)))
127 | model.save(model_path)
128 | print('model saved to', model_path)
129 | return True
130 |
131 | class CustomSACPolicy(SACPolicy):
132 | def __init__(self, *model_args, **model_kwargs):
133 | super(CustomSACPolicy, self).__init__(*model_args, **model_kwargs,
134 | layers=[256, 256] if 'MasspointPushDoubleObstacle' in args.env else [256, 256, 256, 256],
135 | feature_extraction="mlp")
136 | register_policy('CustomSACPolicy', CustomSACPolicy)
137 | from utils.sac_attention_policy import AttentionPolicy
138 | register_policy('AttentionPolicy', AttentionPolicy)
139 | policy_kwargs = get_policy_kwargs("sac_sir", args)
140 |
141 | if rank == 0:
142 | print('train_kwargs', train_kwargs)
143 | print('policy_kwargs', policy_kwargs)
144 | # Wrap the model
145 | model = HER2(args.policy, env, model_class, n_sampled_goal=4,
146 | start_augment_time=args.start_augment,
147 | goal_selection_strategy=goal_selection_strategy,
148 | num_workers=args.num_workers,
149 | policy_kwargs=policy_kwargs,
150 | verbose=1,
151 | **train_kwargs)
152 | print(model.get_parameter_list())
153 |
154 | # Train the model
155 | model.learn(int(args.num_timesteps), seed=args.seed, callback=callback, log_interval=100 if not ('MasspointMaze-v3' in args.env) else 10)
156 |
157 | if rank == 0:
158 | model.save(os.path.join(log_dir, 'final'))
159 |
160 |
161 | if __name__ == '__main__':
162 | args = arg_parse()
163 | main(args)
164 |
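165 | # Example invocation (hypothetical paths; flags correspond to arg_parse above):
166 | #   python run_her_augment.py --env FetchPushWallObstacle-v4 --num_workers 32 --num_timesteps 3e6 \
167 | #       --random_ratio 0.7 --log_path logs/sac_sir_push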
--------------------------------------------------------------------------------