├── ppo ├── train.sh ├── models │ ├── pretrained_drone │ │ ├── ckpts │ │ │ └── 100007936.pt │ │ └── config.gin │ ├── pretrained_lunar │ │ ├── ckpts │ │ │ └── 100007936.pt │ │ └── config.gin │ ├── drone_reacher_agent │ │ └── ckpts │ │ │ └── 010485760.pt │ ├── lunar_lander_agent │ │ └── ckpts │ │ │ └── 001000000.pt │ ├── behavioral_cloning_agents_drone │ │ ├── bc0 │ │ │ └── ckpts │ │ │ │ └── 002728350.pt │ │ ├── bc1 │ │ │ └── ckpts │ │ │ │ └── 002556300.pt │ │ ├── bc2 │ │ │ └── ckpts │ │ │ │ └── 002116950.pt │ │ ├── bc3 │ │ │ └── ckpts │ │ │ │ └── 001719600.pt │ │ ├── bc4 │ │ │ └── ckpts │ │ │ │ └── 002935500.pt │ │ ├── bc5 │ │ │ └── ckpts │ │ │ │ └── 002101950.pt │ │ ├── bc6 │ │ │ └── ckpts │ │ │ │ └── 001462650.pt │ │ ├── bc7 │ │ │ └── ckpts │ │ │ │ └── 002159100.pt │ │ ├── bc8 │ │ │ └── ckpts │ │ │ │ └── 002733300.pt │ │ ├── bc9 │ │ │ └── ckpts │ │ │ │ └── 002580450.pt │ │ ├── bc10 │ │ │ └── ckpts │ │ │ │ └── 002958900.pt │ │ ├── bc11 │ │ │ └── ckpts │ │ │ │ └── 001742550.pt │ │ ├── bc12 │ │ │ └── ckpts │ │ │ │ └── 002901450.pt │ │ ├── bc13 │ │ │ └── ckpts │ │ │ │ └── 003014400.pt │ │ └── bc14 │ │ │ └── ckpts │ │ │ └── 001760700.pt │ └── behavioral_cloning_agents_lunar │ │ ├── bc0 │ │ └── ckpts │ │ │ └── 001747200.pt │ │ ├── bc1 │ │ └── ckpts │ │ │ └── 001272700.pt │ │ ├── bc2 │ │ └── ckpts │ │ │ └── 001376400.pt │ │ ├── bc3 │ │ └── ckpts │ │ │ └── 001579400.pt │ │ ├── bc4 │ │ └── ckpts │ │ │ └── 001349200.pt │ │ ├── bc5 │ │ └── ckpts │ │ │ └── 000993600.pt │ │ ├── bc6 │ │ └── ckpts │ │ │ └── 001220200.pt │ │ └── bc7 │ │ └── ckpts │ │ └── 001122700.pt ├── __init__.py ├── joystick_control.py ├── actor.py ├── configs │ ├── ppo_drone_noisy.gin │ ├── ppo_drone_laggy.gin │ ├── ppo_drone.gin │ ├── ppo_lunar_noisy.gin │ ├── ppo_lunar_laggy.gin │ └── ppo_lunar.gin └── constrained_residual_ppo.py ├── drone_sim ├── train.sh ├── train.py ├── watch.py ├── __init__.py ├── ppo.gin ├── actor.py ├── joystick_agent.py ├── sim.py ├── env.py └── rendering.py ├── imitation_learning ├── train.sh ├── __init__.py ├── drone_ob_norm.py ├── bc_lunar.gin ├── bc_drone.gin ├── ob_norm.py ├── watch.py ├── data_collection_lunar.py ├── data_collection_drone.py └── trainer.py ├── lunar_lander ├── train.sh ├── train.py ├── watch.py ├── td3.gin ├── __init__.py ├── actor.py ├── joystick_agent.py ├── fuel_wrapper.py ├── env_reacher.py └── env_lander.py ├── __init__.py ├── launch_docker.sh ├── Dockerfile ├── .gitignore ├── residual_wrapper.py ├── base_actors.py └── README.md /ppo/train.sh: -------------------------------------------------------------------------------- 1 | python -m dl.main $@ 2 | -------------------------------------------------------------------------------- /drone_sim/train.sh: -------------------------------------------------------------------------------- 1 | python -m dl.main logs ppo.gin 2 | -------------------------------------------------------------------------------- /imitation_learning/train.sh: -------------------------------------------------------------------------------- 1 | python -m dl.main $@ 2 | -------------------------------------------------------------------------------- /lunar_lander/train.sh: -------------------------------------------------------------------------------- 1 | python -m dl.main logs td3.gin 2 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from residual_shared_autonomy.residual_wrapper import ResidualWrapper 2 | from 
residual_shared_autonomy.base_actors import * 3 | -------------------------------------------------------------------------------- /ppo/models/pretrained_drone/ckpts/100007936.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/pretrained_drone/ckpts/100007936.pt -------------------------------------------------------------------------------- /ppo/models/pretrained_lunar/ckpts/100007936.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/pretrained_lunar/ckpts/100007936.pt -------------------------------------------------------------------------------- /ppo/models/drone_reacher_agent/ckpts/010485760.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/drone_reacher_agent/ckpts/010485760.pt -------------------------------------------------------------------------------- /ppo/models/lunar_lander_agent/ckpts/001000000.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/lunar_lander_agent/ckpts/001000000.pt -------------------------------------------------------------------------------- /ppo/__init__.py: -------------------------------------------------------------------------------- 1 | from residual_shared_autonomy.ppo.actor import * 2 | from residual_shared_autonomy.ppo.constrained_residual_ppo import ConstrainedResidualPPO 3 | -------------------------------------------------------------------------------- /launch_docker.sh: -------------------------------------------------------------------------------- 1 | sudo x-docker run --gpus all --net=host -it --rm --privileged -v `pwd`:/root/pkgs/residual_shared_autonomy residual_shared_autonomy:latest bash 2 | -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc0/ckpts/002728350.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc0/ckpts/002728350.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc1/ckpts/002556300.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc1/ckpts/002556300.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc2/ckpts/002116950.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc2/ckpts/002116950.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc3/ckpts/001719600.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc3/ckpts/001719600.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc4/ckpts/002935500.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc4/ckpts/002935500.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc5/ckpts/002101950.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc5/ckpts/002101950.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc6/ckpts/001462650.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc6/ckpts/001462650.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc7/ckpts/002159100.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc7/ckpts/002159100.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc8/ckpts/002733300.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc8/ckpts/002733300.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc9/ckpts/002580450.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc9/ckpts/002580450.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_lunar/bc0/ckpts/001747200.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_lunar/bc0/ckpts/001747200.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_lunar/bc1/ckpts/001272700.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_lunar/bc1/ckpts/001272700.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_lunar/bc2/ckpts/001376400.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_lunar/bc2/ckpts/001376400.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_lunar/bc3/ckpts/001579400.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_lunar/bc3/ckpts/001579400.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_lunar/bc4/ckpts/001349200.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_lunar/bc4/ckpts/001349200.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_lunar/bc5/ckpts/000993600.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_lunar/bc5/ckpts/000993600.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_lunar/bc6/ckpts/001220200.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_lunar/bc6/ckpts/001220200.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_lunar/bc7/ckpts/001122700.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_lunar/bc7/ckpts/001122700.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc10/ckpts/002958900.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc10/ckpts/002958900.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc11/ckpts/001742550.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc11/ckpts/001742550.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc12/ckpts/002901450.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc12/ckpts/002901450.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc13/ckpts/003014400.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc13/ckpts/003014400.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc14/ckpts/001760700.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc14/ckpts/001760700.pt -------------------------------------------------------------------------------- /imitation_learning/__init__.py: -------------------------------------------------------------------------------- 1 | from residual_shared_autonomy.imitation_learning.trainer import * 2 | from residual_shared_autonomy.imitation_learning.ob_norm import * 3 | from residual_shared_autonomy.imitation_learning.drone_ob_norm import * 4 | -------------------------------------------------------------------------------- /drone_sim/train.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import dl 3 | 4 | 5 | if __name__=='__main__': 6 | parser = argparse.ArgumentParser(description='Train Agent.') 7 | parser.add_argument('--expdir', type=str, help='expdir', required=True) 8 | parser.add_argument('--gin_config', type=str, help='gin config', required=True) 9 | parser.add_argument('-b', '--gin_bindings', nargs='+', help='gin bindings to overwrite config') 10 | args = parser.parse_args() 11 | dl.load_config(args.gin_config, args.gin_bindings) 12 | dl.train(args.expdir) 13 | -------------------------------------------------------------------------------- /lunar_lander/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import dl 3 | 4 | 5 | if __name__=='__main__': 6 | parser = argparse.ArgumentParser(description='Train Agent.') 7 | parser.add_argument('--expdir', type=str, help='expdir', required=True) 8 | parser.add_argument('--gin_config', type=str, help='gin config', required=True) 9 | parser.add_argument('-b', '--gin_bindings', nargs='+', help='gin bindings to overwrite config') 10 | args = parser.parse_args() 11 | dl.load_config(args.gin_config, args.gin_bindings) 12 | dl.train(args.expdir) 13 | -------------------------------------------------------------------------------- /imitation_learning/drone_ob_norm.py: -------------------------------------------------------------------------------- 1 | """Observation norm used by behavioral cloning agents and the shared autonomy agent.""" 2 | import numpy as np 3 | import gin 4 | 5 | 6 | @gin.configurable 7 | def drone_bc_mean(): 8 | return np.array([0.0043, -0.0065, 0.0965, 0.7479, -0.0983, -2.4260, 9 | -0.0219, -0.0484, 0.3360, -0.0124, -0.0320, 0.0437, 10 | 0.0000, 0.0000, 0.0570], 11 | dtype=np.float32) 12 | 13 | 14 | @gin.configurable 15 | def drone_bc_std(): 16 | return np.array([0.2092, 0.2249, 0.0903, 4.4870, 4.4999, 4.6344, 0.4893, 17 | 0.2624, 2.6024, 0.2528, 0.2541, 0.5348, 0.3333, 0.3333, 18 | 0.0455], dtype=np.float32) 19 | -------------------------------------------------------------------------------- /lunar_lander/watch.py: -------------------------------------------------------------------------------- 1 | """Render trained agents.""" 2 | import dl 3 | import argparse 4 | from dl.rl import rl_record, misc, PPO 5 | import os 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser(description='Train Script.') 9 | parser.add_argument('logdir', type=str, help='logdir') 10 | args = parser.parse_args() 11 | 12 | dl.load_config(os.path.join(args.logdir, 'config.gin')) 13 | 14 | t = PPO(args.logdir, nenv=1) 15 | t.load() 16 | t.pi.eval() 17 | misc.set_env_to_eval_mode(t.env) 18 | os.makedirs(os.path.join(t.logdir, 'video'), exist_ok=True) 19 | outfile = os.path.join(t.logdir, 'video', 20 | t.ckptr.format.format(t.t) + '.mp4') 21 | rl_record(t.env, t.pi, 10, outfile, t.device) 22 | t.close() 23 | -------------------------------------------------------------------------------- /drone_sim/watch.py: -------------------------------------------------------------------------------- 1 | """Render trained agents.""" 2 | import dl 3 | import argparse 4 | from dl.rl import rl_record, misc, PPO, VecFrameStack 5 | import residual_shared_autonomy.drone_sim 6 | import os 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser(description='Train Script.') 10 | parser.add_argument('logdir', type=str, help='logdir') 11 | args = 
parser.parse_args() 12 | 13 | dl.load_config(os.path.join(args.logdir, 'config.gin')) 14 | 15 | t = PPO(args.logdir, nenv=1) 16 | t.load() 17 | t.pi.eval() 18 | env = t.env 19 | misc.set_env_to_eval_mode(env) 20 | os.makedirs(os.path.join(t.logdir, 'video'), exist_ok=True) 21 | outfile = os.path.join(t.logdir, 'video', 22 | t.ckptr.format.format(t.t) + '.mp4') 23 | rl_record(env, t.pi, 10, outfile, t.device) 24 | t.close() 25 | -------------------------------------------------------------------------------- /imitation_learning/bc_lunar.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.imitation_learning 2 | 3 | 4 | train.algorithm = @BCTrainer 5 | train.maxt = 100 6 | train.seed = 0 7 | train.eval = True 8 | train.eval_period = 1 9 | train.save_period = 1 10 | train.maxseconds = None 11 | 12 | optim.Adam.lr = 0.001 13 | optim.Adam.betas = (0.9, 0.999) 14 | optim.Adam.eps = 1e-8 15 | 16 | BCTrainer.opt = @optim.Adam 17 | BCTrainer.model = @BCNet() 18 | BCTrainer.datafile = './data/chip.pt' # Change this to your datafile 19 | BCTrainer.batch_size = 32 20 | BCTrainer.num_workers = 1 21 | BCTrainer.gpu = True 22 | 23 | DemonstrationData.mean = @bc_mean() 24 | DemonstrationData.std = @bc_std() 25 | 26 | BCNet.ob_shape = 9 27 | BCNet.action_shape = 2 28 | BCNet.nunits = 128 29 | 30 | DiagGaussian.constant_log_std = False 31 | 32 | Checkpointer.ckpt_period = 2 33 | -------------------------------------------------------------------------------- /imitation_learning/bc_drone.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.imitation_learning 2 | 3 | 4 | train.algorithm = @BCTrainer 5 | train.maxt = 150 6 | train.seed = 0 7 | train.eval = True 8 | train.eval_period = 1 9 | train.save_period = 1 10 | train.maxseconds = None 11 | 12 | optim.Adam.lr = 0.001 13 | optim.Adam.betas = (0.9, 0.999) 14 | optim.Adam.eps = 1e-8 15 | 16 | BCTrainer.opt = @optim.Adam 17 | BCTrainer.model = @BCNet() 18 | BCTrainer.datafile = './drone_data/chip.pt' # Change this to your data file 19 | BCTrainer.batch_size = 32 20 | BCTrainer.num_workers = 1 21 | BCTrainer.gpu = True 22 | 23 | DemonstrationData.mean = @drone_bc_mean() 24 | DemonstrationData.std = @drone_bc_std() 25 | 26 | BCNet.ob_shape = 15 27 | BCNet.action_shape = 4 28 | BCNet.nunits = 128 29 | 30 | DiagGaussian.constant_log_std = False 31 | 32 | Checkpointer.ckpt_period = 2 33 | -------------------------------------------------------------------------------- /drone_sim/__init__.py: -------------------------------------------------------------------------------- 1 | from residual_shared_autonomy.drone_sim.sim import Drone 2 | from residual_shared_autonomy.drone_sim.rendering import DroneRenderer, DroneFollower 3 | from residual_shared_autonomy.drone_sim.actor import drone_ppo_policy_fn 4 | try: 5 | from residual_shared_autonomy.drone_sim.joystick_agent import DroneJoystickActor 6 | except: 7 | pass 8 | 9 | from gym.envs.registration import register 10 | 11 | register( 12 | id='DroneReacherHuman-v0', 13 | entry_point='residual_shared_autonomy.drone_sim.env:DroneReacherHuman', 14 | max_episode_steps=1000, 15 | reward_threshold=200 16 | ) 17 | 18 | register( 19 | id='DroneReacherBot-v0', 20 | entry_point='residual_shared_autonomy.drone_sim.env:DroneReacherBot', 21 | max_episode_steps=1000, 22 | reward_threshold=200 23 | ) 24 | 25 | register( 26 | id='DroneStabilizer-v0', 27 | 
entry_point='residual_shared_autonomy.drone_sim.env:DroneStabilizer', 28 | max_episode_steps=1000, 29 | reward_threshold=200 30 | ) 31 | -------------------------------------------------------------------------------- /imitation_learning/ob_norm.py: -------------------------------------------------------------------------------- 1 | """Observation norm used by behavioral cloning agents and the shared autonomy agent.""" 2 | import numpy as np 3 | import gin 4 | 5 | 6 | @gin.configurable 7 | def bc_mean(): 8 | return np.array([0.0024, 0.7034, 0.0347, -0.2975, -0.0375, -0.0126, 9 | 0.0528, 0.0550, -0.0248, -1.0000, -0.0947, -0.7991, 10 | -0.0927, -0.6008, -0.0884, -0.3996, -0.0877, -0.1998, 11 | -0.0865, 0.0000, -0.0876, 0.1998, -0.0892, 0.3996, 12 | -0.0901, 0.6008, -0.0882, 0.7991, -0.0827, 1.0000, 13 | -0.0851], dtype=np.float32) 14 | 15 | 16 | @gin.configurable 17 | def bc_std(): 18 | return np.array([0.3812, 0.4952, 0.6652, 0.3537, 0.5616, 0.4115, 0.2236, 19 | 0.2279, 0.5270, 0.0000, 0.1382, 0.0000, 0.1300, 0.0000, 20 | 0.1248, 0.0000, 0.1140, 0.0000, 0.1124, 0.0000, 0.1058, 21 | 0.0000, 0.1059, 0.0000, 0.1081, 0.0000, 0.1262, 0.0000, 22 | 0.1441, 0.0000, 0.1496], dtype=np.float32) 23 | -------------------------------------------------------------------------------- /drone_sim/ppo.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.drone_sim 2 | import residual_shared_autonomy.imitation_learning 3 | 4 | train.algorithm = @PPO 5 | train.maxt = 20000000 6 | train.seed = 0 7 | train.eval = True 8 | train.eval_period = 1000000 9 | train.save_period = 1000000 10 | train.maxseconds = None 11 | 12 | optim.Adam.lr = 0.001 13 | optim.Adam.betas = (0.9, 0.999) 14 | optim.Adam.eps = 1e-5 15 | 16 | PPO.env_fn = @make_env 17 | PPO.policy_fn = @drone_ppo_policy_fn 18 | PPO.nenv = 64 19 | PPO.eval_num_episodes = 100 20 | PPO.record_num_episodes = 0 21 | PPO.rollout_length = 1024 22 | PPO.batch_size = 512 23 | PPO.gamma = 0.99 24 | PPO.lambda_ = 0.95 25 | PPO.norm_advantages = True 26 | PPO.optimizer = @optim.Adam 27 | PPO.clip_param = 0.2 28 | PPO.epochs_per_rollout = 4 29 | PPO.max_grad_norm = 0.5 30 | PPO.ent_coef = 0.01 31 | PPO.vf_coef = 0.5 32 | PPO.gpu = True 33 | 34 | Checkpointer.ckpt_period = 1000000 35 | 36 | make_env.env_id = "DroneReacherBot-v0" 37 | make_env.norm_observations = True 38 | 39 | DiagGaussian.constant_log_std = False 40 | 41 | VecObsNormWrapper.steps = 10000 42 | VecObsNormWrapper.mean = @drone_bc_mean() 43 | VecObsNormWrapper.std = @drone_bc_std() 44 | VecObsNormWrapper.eps = 1e-2 45 | VecObsNormWrapper.log = True 46 | VecObsNormWrapper.log_prob = 0.01 47 | -------------------------------------------------------------------------------- /lunar_lander/td3.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.lunar_lander 2 | import residual_shared_autonomy.imitation_learning 3 | 4 | train.algorithm = @TD3 5 | train.maxt = 1000000 6 | train.seed = 0 7 | train.eval = True 8 | train.eval_period = 100000 9 | train.save_period = 100000 10 | train.maxseconds = None 11 | 12 | TD3.env_fn = @make_env 13 | TD3.policy_fn = @lunar_lander_policy_fn 14 | TD3.qf_fn = @lunar_lander_qf_fn 15 | TD3.nenv = 1 16 | TD3.eval_num_episodes = 20 17 | TD3.gpu = True 18 | TD3.record_num_episodes = 5 19 | TD3.buffer_size = 100000 20 | TD3.frame_stack = 1 21 | TD3.learning_starts = 10000 22 | TD3.update_period = 1 23 | TD3.env_fn = @make_env 24 | TD3.optimizer = 
@optim.Adam 25 | TD3.batch_size = 256 26 | TD3.lr = 3e-4 27 | TD3.gamma = 0.99 28 | TD3.exploration_noise = 0.1 29 | TD3.policy_noise = 0.2 30 | TD3.policy_noise_clip = 0.5 31 | TD3.policy_update_period = 2 32 | TD3.target_smoothing_coef = 0.005 33 | TD3.reward_scale = 1 34 | TD3.log_period = 100 35 | 36 | Checkpointer.ckpt_period = 100000 37 | 38 | optim.Adam.betas = (0.9, 0.999) 39 | 40 | make_env.env_id = "LunarLanderRandomContinuous-v2" 41 | make_env.norm_observations = True 42 | 43 | VecObsNormWrapper.steps = 10000 44 | VecObsNormWrapper.mean = @bc_mean() 45 | VecObsNormWrapper.std = @bc_std() 46 | VecObsNormWrapper.eps = 1e-2 47 | VecObsNormWrapper.log = True 48 | VecObsNormWrapper.log_prob = 0.01 49 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cudagl:10.1-devel-ubuntu18.04 2 | 3 | RUN apt-get update && apt-get upgrade -y 4 | 5 | # Install packages 6 | RUN apt-get install -y wget git vim libsm6 libxext6 libxrender-dev ffmpeg python-opengl 7 | 8 | # install anaconda 9 | RUN wget https://repo.anaconda.com/archive/Anaconda3-2019.10-Linux-x86_64.sh 10 | RUN bash Anaconda3-2019.10-Linux-x86_64.sh -b 11 | RUN rm Anaconda3-2019.10-Linux-x86_64.sh 12 | ENV PATH /root/anaconda3/bin:$PATH 13 | RUN conda update conda 14 | RUN yes | conda update anaconda 15 | RUN yes | conda update --all 16 | RUN conda init 17 | 18 | # Install packages 19 | RUN conda install -y -c pytorch pytorch torchvision 20 | RUN conda install -y tensorflow-gpu==1.14.0 21 | RUN pip install gin-config 22 | RUN pip install gym[atari] 23 | RUN pip install gym[box2d] 24 | 25 | WORKDIR /root 26 | RUN git clone https://github.com/openai/baselines.git 27 | WORKDIR /root/baselines 28 | RUN git checkout adba88b218edbb3f3bd8044dc963f8ca0901d6c6 29 | RUN pip install . 
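# Note: openai/baselines (above) and cbschaff/dl (below) are pinned to fixed commits rather than tracking their default branches.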
30 | 31 | # Add a directory for python packages to be mounted 32 | ENV PYTHONPATH /root/pkgs:$PYTHONPATH 33 | 34 | RUN apt-get install -y freeglut3-dev 35 | RUN conda install -y PyOpenGL 36 | RUN pip install pygame PyOpenGL_accelerate 37 | 38 | WORKDIR /root/pkgs 39 | RUN git clone https://github.com/cbschaff/dl.git 40 | WORKDIR /root/pkgs/dl 41 | RUN git checkout 650db8abc90053305be95e73ce28da624e9092dc 42 | 43 | WORKDIR /root 44 | # Bash entrypoint 45 | ENTRYPOINT /bin/bash 46 | -------------------------------------------------------------------------------- /imitation_learning/watch.py: -------------------------------------------------------------------------------- 1 | """Render trained behavioral cloning agents.""" 2 | import dl 3 | from dl.rl import make_env, VecObsNormWrapper 4 | from residual_shared_autonomy.imitation_learning import bc_mean, bc_std 5 | from residual_shared_autonomy.imitation_learning import drone_bc_mean, drone_bc_std 6 | from residual_shared_autonomy.imitation_learning import BCTrainer 7 | import residual_shared_autonomy.lunar_lander 8 | import residual_shared_autonomy.drone_sim 9 | import argparse 10 | import os 11 | import torch 12 | 13 | if __name__ == '__main__': 14 | parser = argparse.ArgumentParser(description='Train Script.') 15 | parser.add_argument('logdir', type=str, help='logdir') 16 | parser.add_argument('--drone', action='store_true', help='render a drone agent') 17 | args = parser.parse_args() 18 | 19 | dl.load_config(os.path.join(args.logdir, 'config.gin')) 20 | 21 | trainer = BCTrainer(args.logdir) 22 | trainer.load() 23 | 24 | id = "DroneReacherBot-v0" if args.drone else "LunarLanderRandomContinuous-v2" 25 | env = make_env(env_id=id, 26 | nenv=1, norm_observations=False) 27 | if args.drone: 28 | env = VecObsNormWrapper(env, mean=drone_bc_mean(), 29 | std=drone_bc_std(), log=False) 30 | else: 31 | env = VecObsNormWrapper(env, mean=bc_mean(), std=bc_std(), log=False) 32 | 33 | for _ in range(5): 34 | ob = env.reset() 35 | env.render() 36 | done = False 37 | while not done: 38 | dist = trainer.model(torch.from_numpy(ob).to(trainer.device)) 39 | ob, _, done, _ = env.step(dist.sample().cpu().numpy()) 40 | env.render() 41 | -------------------------------------------------------------------------------- /ppo/joystick_control.py: -------------------------------------------------------------------------------- 1 | """Control trained co-pilot agents with a joystick.""" 2 | import dl 3 | import argparse 4 | from residual_shared_autonomy.ppo import ConstrainedResidualPPO 5 | from residual_shared_autonomy.lunar_lander import LunarLanderJoystickActor 6 | from residual_shared_autonomy.drone_sim import DroneJoystickActor 7 | import os 8 | 9 | if __name__ == '__main__': 10 | parser = argparse.ArgumentParser(description='Train Script.') 11 | parser.add_argument('logdir', type=str, help='logdir') 12 | parser.add_argument('--drone', action='store_true', help='control drone env') 13 | parser.add_argument('--reacher', action='store_true', help='control lunar reacher env') 14 | args = parser.parse_args() 15 | 16 | if args.drone: 17 | dl.load_config(os.path.join(args.logdir, 'config.gin'), 18 | ['make_env.env_id="DroneReacherBot-v0"']) 19 | trainer = ConstrainedResidualPPO(args.logdir, nenv=1, 20 | base_actor_cls=DroneJoystickActor) 21 | elif args.reacher: 22 | dl.load_config(os.path.join(args.logdir, 'config.gin'), 23 | ['make_env.env_id="LunarLanderReacher-v2"']) 24 | trainer = ConstrainedResidualPPO(args.logdir, nenv=1, 25 | base_actor_cls=LunarLanderJoystickActor) 26 | else: 27 | 
dl.load_config(os.path.join(args.logdir, 'config.gin'), 28 | ['make_env.env_id="LunarLanderRandomContinuous-v2"']) 29 | trainer = ConstrainedResidualPPO(args.logdir, nenv=1, 30 | base_actor_cls=LunarLanderJoystickActor) 31 | trainer.load() 32 | trainer.evaluate() 33 | trainer.close() 34 | -------------------------------------------------------------------------------- /drone_sim/actor.py: -------------------------------------------------------------------------------- 1 | """Defines networks for PPO experiments.""" 2 | from dl.rl.modules import ActorCriticBase, Policy 3 | from dl.modules import DiagGaussian 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import gin 8 | 9 | 10 | class FeedForwardActorCriticBase(ActorCriticBase): 11 | """Policy and Value networks.""" 12 | 13 | def __init__(self, *args, nunits=128, **kwargs): 14 | """Init.""" 15 | self.nunits = nunits 16 | super().__init__(*args, **kwargs) 17 | 18 | def build(self): 19 | """Build.""" 20 | inshape = self.observation_space.shape[0] 21 | self.fc1 = nn.Linear(inshape, self.nunits) 22 | self.fc2 = nn.Linear(self.nunits, self.nunits) 23 | self.dist = DiagGaussian(self.nunits, self.action_space.shape[0]) 24 | for p in self.dist.fc_mean.parameters(): 25 | nn.init.constant_(p, 0.) 26 | 27 | self.vf_fc1 = nn.Linear(inshape, self.nunits) 28 | self.vf_fc2 = nn.Linear(self.nunits, self.nunits) 29 | self.vf_out = nn.Linear(self.nunits, 1) 30 | 31 | def forward(self, x): 32 | """Forward.""" 33 | net = x 34 | net = F.relu(self.fc1(net)) 35 | net = F.relu(self.fc2(net)) 36 | pi = self.dist(net) 37 | 38 | net = x 39 | net = F.relu(self.vf_fc1(net)) 40 | net = F.relu(self.vf_fc2(net)) 41 | vf = self.vf_out(net) 42 | 43 | return pi, vf 44 | 45 | 46 | @gin.configurable 47 | def drone_ppo_policy_fn(env, nunits=128): 48 | """Create a policy network.""" 49 | return Policy(FeedForwardActorCriticBase(env.observation_space, 50 | env.action_space, 51 | nunits=nunits)) 52 | -------------------------------------------------------------------------------- /lunar_lander/__init__.py: -------------------------------------------------------------------------------- 1 | from residual_shared_autonomy.lunar_lander.actor import lunar_lander_policy_fn, lunar_lander_qf_fn 2 | from residual_shared_autonomy.lunar_lander.fuel_wrapper import LunarLanderFuelWrapper 3 | try: 4 | from residual_shared_autonomy.lunar_lander.joystick_agent import LunarLanderJoystickActor 5 | from residual_shared_autonomy.lunar_lander.keyboard_agent import LunarLanderKeyboardActor 6 | except: 7 | pass 8 | 9 | from gym.envs.registration import register 10 | 11 | register( 12 | id='LunarLanderRandom-v2', 13 | entry_point='residual_shared_autonomy.lunar_lander.env_lander:LunarLander', 14 | max_episode_steps=1000, 15 | reward_threshold=200 16 | ) 17 | 18 | register( 19 | id='LunarLanderRandomContinuous-v2', 20 | entry_point='residual_shared_autonomy.lunar_lander.env_lander:LunarLanderContinuous', 21 | max_episode_steps=1000, 22 | reward_threshold=200 23 | ) 24 | 25 | register( 26 | id='LunarLanderRandomNoGoal-v2', 27 | entry_point='residual_shared_autonomy.lunar_lander.env_lander:LunarLanderNoGoal', 28 | max_episode_steps=1000, 29 | reward_threshold=200 30 | ) 31 | 32 | register( 33 | id='LunarLanderRandomContinuousNoGoal-v2', 34 | entry_point='residual_shared_autonomy.lunar_lander.env_lander:LunarLanderContinuousNoGoal', 35 | max_episode_steps=1000, 36 | reward_threshold=200 37 | ) 38 | 39 | register( 40 | id='LunarLanderRandomConstrained-v2', 41 | 
entry_point='residual_shared_autonomy.lunar_lander.env_lander:LunarLanderConstrained', 42 | max_episode_steps=1000, 43 | reward_threshold=200 44 | ) 45 | 46 | register( 47 | id='LunarLanderReacher-v2', 48 | entry_point='residual_shared_autonomy.lunar_lander.env_reacher:LunarLanderReacher', 49 | max_episode_steps=1000, 50 | reward_threshold=200 51 | ) 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # training logs 2 | logs/ 3 | ppo/logs* 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Environments 89 | .env 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | /site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | -------------------------------------------------------------------------------- /imitation_learning/data_collection_lunar.py: -------------------------------------------------------------------------------- 1 | """Collect human data from LunarLander.""" 2 | import gym 3 | from residual_shared_autonomy.lunar_lander import LunarLanderJoystickActor 4 | from dl.rl import ensure_vec_env 5 | import torch 6 | import time 7 | import argparse 8 | import os 9 | 10 | if __name__ == '__main__': 11 | parser = argparse.ArgumentParser(description='Data collection.') 12 | parser.add_argument('datafile', type=str, help='datafile') 13 | parser.add_argument('neps', type=int, help='# of episodes') 14 | parser.add_argument('--overwrite', action='store_true', 15 | help='overwrite data') 16 | args = parser.parse_args() 17 | 18 | env = gym.make("LunarLanderRandomContinuous-v2") 19 | env = ensure_vec_env(env) 20 | 21 | if os.path.exists(args.datafile): 22 | if not args.overwrite: 23 | raise ValueError('Data file {} already exists. 
Add --overwrite to ' 24 | ' overwrite the file.'.format(args.datafile)) 25 | 26 | actor = LunarLanderJoystickActor(env) 27 | 28 | transitions = [] 29 | for i in range(args.neps): 30 | print(i) 31 | ob = env.reset() 32 | env.render() 33 | done = False 34 | time.sleep(1.) 35 | 36 | while not done: 37 | action = actor(ob) 38 | ob_next, reward, done, _ = env.step(action) 39 | transitions.append([ob, action.copy(), reward, done]) 40 | ob = ob_next 41 | time.sleep(0.5) 42 | 43 | obs, actions, rewards, dones = zip(*transitions) 44 | data = {'obs': torch.Tensor(obs).squeeze(), 45 | 'actions': torch.Tensor(actions).squeeze(), 46 | 'rewards': torch.Tensor(rewards).squeeze(), 47 | 'dones': torch.Tensor(dones).squeeze()} 48 | 49 | torch.save(data, args.datafile) 50 | -------------------------------------------------------------------------------- /lunar_lander/actor.py: -------------------------------------------------------------------------------- 1 | """Defines networks for Lunar Lander experiments.""" 2 | from dl.rl.modules import PolicyBase, ContinuousQFunctionBase 3 | from dl.rl.modules import QFunction, UnnormActionPolicy 4 | from dl.modules import TanhDelta 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import gin 9 | 10 | 11 | class FeedForwardPolicyBase(PolicyBase): 12 | """Policy network.""" 13 | 14 | def build(self): 15 | """Build.""" 16 | self.fc1 = nn.Linear(9, 256) 17 | self.fc2 = nn.Linear(256, 256) 18 | self.dist = TanhDelta(256, self.action_space.shape[0]) 19 | 20 | def forward(self, x): 21 | """Forward.""" 22 | x = x[:, :9] 23 | x = F.relu(self.fc1(x)) 24 | x = F.relu(self.fc2(x)) 25 | return self.dist(x) 26 | 27 | 28 | class AppendActionFeedForwardQFBase(ContinuousQFunctionBase): 29 | """Q network.""" 30 | 31 | def build(self): 32 | """Build.""" 33 | nin = 9 + self.action_space.shape[0] 34 | self.fc1 = nn.Linear(nin, 256) 35 | self.fc2 = nn.Linear(256, 256) 36 | self.qvalue = nn.Linear(256, 1) 37 | 38 | def forward(self, x, a): 39 | """Forward.""" 40 | x = x[:, :9] 41 | x = F.relu(self.fc1(torch.cat([x, a], dim=1))) 42 | x = F.relu(self.fc2(x)) 43 | return self.qvalue(x) 44 | 45 | 46 | @gin.configurable 47 | def lunar_lander_policy_fn(env): 48 | """Create a policy network.""" 49 | return UnnormActionPolicy(FeedForwardPolicyBase(env.observation_space, 50 | env.action_space)) 51 | 52 | 53 | @gin.configurable 54 | def lunar_lander_qf_fn(env): 55 | """Create a qfunction network.""" 56 | return QFunction(AppendActionFeedForwardQFBase(env.observation_space, 57 | env.action_space)) 58 | -------------------------------------------------------------------------------- /ppo/actor.py: -------------------------------------------------------------------------------- 1 | """Defines networks for Residual PPO experiments.""" 2 | from dl.rl.modules import ActorCriticBase, Policy 3 | from dl.modules import DiagGaussian 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import gin 8 | 9 | 10 | class FeedForwardActorCriticBase(ActorCriticBase): 11 | """Policy and Value networks.""" 12 | 13 | def __init__(self, *args, nunits=128, **kwargs): 14 | """Init.""" 15 | self.nunits = nunits 16 | super().__init__(*args, **kwargs) 17 | 18 | def build(self): 19 | """Build.""" 20 | inshape = (self.observation_space.spaces[0].shape[0] 21 | + self.observation_space.spaces[1].shape[0]) 22 | self.fc1 = nn.Linear(inshape, self.nunits) 23 | self.fc2 = nn.Linear(self.nunits, self.nunits) 24 | self.dist = DiagGaussian(self.nunits, 
self.action_space.shape[0]) 25 | for p in self.dist.fc_mean.parameters(): 26 | nn.init.constant_(p, 0.) 27 | 28 | self.vf_fc1 = nn.Linear(inshape, self.nunits) 29 | self.vf_fc2 = nn.Linear(self.nunits, self.nunits) 30 | self.vf_out = nn.Linear(self.nunits, 1) 31 | 32 | def forward(self, x): 33 | """Forward.""" 34 | ob = torch.cat(x, axis=1) 35 | net = ob 36 | net = F.relu(self.fc1(net)) 37 | net = F.relu(self.fc2(net)) 38 | pi = self.dist(net) 39 | 40 | net = ob 41 | net = F.relu(self.vf_fc1(net)) 42 | net = F.relu(self.vf_fc2(net)) 43 | vf = self.vf_out(net) 44 | 45 | return pi, vf 46 | 47 | 48 | @gin.configurable 49 | def ppo_policy_fn(env, nunits=128): 50 | """Create a policy network.""" 51 | return Policy(FeedForwardActorCriticBase(env.observation_space, 52 | env.action_space, 53 | nunits=nunits)) 54 | -------------------------------------------------------------------------------- /imitation_learning/data_collection_drone.py: -------------------------------------------------------------------------------- 1 | """Collect human data from DroneReacher.""" 2 | import gym 3 | import residual_shared_autonomy.drone_sim 4 | from residual_shared_autonomy.drone_sim.joystick_agent import DroneJoystickActor 5 | from dl.rl import ensure_vec_env 6 | import torch 7 | import time 8 | import argparse 9 | import os 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser(description='Data collection.') 13 | parser.add_argument('datafile', type=str, help='datafile') 14 | parser.add_argument('neps', type=int, help='# of episodes') 15 | parser.add_argument('--overwrite', action='store_true', 16 | help='overwrite data') 17 | args = parser.parse_args() 18 | 19 | env = gym.make("DroneReacherBot-v0") 20 | env = ensure_vec_env(env) 21 | 22 | if os.path.exists(args.datafile): 23 | if not args.overwrite: 24 | raise ValueError('Data file {} already exists. Add --overwrite to ' 25 | ' overwrite the file.'.format(args.datafile)) 26 | 27 | actor = DroneJoystickActor(env) 28 | 29 | transitions = [] 30 | for i in range(args.neps): 31 | print(i) 32 | ob = env.reset() 33 | actor.reset() 34 | env.render() 35 | done = False 36 | time.sleep(1.) 
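# Roll out one joystick-controlled episode, storing each (observation, action, reward, done)
# transition. The stacked tensors are saved with torch.save at the end of the script and can
# then be pointed to by BCTrainer.datafile (e.g. in bc_drone.gin) for behavioral cloning.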
37 | 38 | while not done: 39 | action = actor(ob) 40 | ob_next, reward, done, _ = env.step(action) 41 | transitions.append([ob, action.copy(), reward, done]) 42 | ob = ob_next 43 | time.sleep(0.5) 44 | 45 | obs, actions, rewards, dones = zip(*transitions) 46 | data = {'obs': torch.Tensor(obs).squeeze(), 47 | 'actions': torch.Tensor(actions).squeeze(), 48 | 'rewards': torch.Tensor(rewards).squeeze(), 49 | 'dones': torch.Tensor(dones).squeeze()} 50 | 51 | torch.save(data, args.datafile) 52 | -------------------------------------------------------------------------------- /ppo/configs/ppo_drone_noisy.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.ppo 2 | import residual_shared_autonomy.drone_sim 3 | import residual_shared_autonomy.imitation_learning 4 | 5 | train.algorithm = @ConstrainedResidualPPO 6 | train.maxt = 100000000 7 | train.seed = 0 8 | train.eval = True 9 | train.eval_period = 1000000 10 | train.save_period = 1000000 11 | train.maxseconds = None 12 | 13 | optim.Adam.lr = 0.001 14 | optim.Adam.betas = (0.9, 0.999) 15 | optim.Adam.eps = 1e-5 16 | 17 | ConstrainedResidualPPO.policy_training_start = 500000 18 | ConstrainedResidualPPO.lambda_training_start = 5000000 19 | ConstrainedResidualPPO.lambda_lr = 0.001 20 | ConstrainedResidualPPO.lambda_init = 10 21 | ConstrainedResidualPPO.lr_decay_rate = 0.31622776601 22 | ConstrainedResidualPPO.lr_decay_freq = 20000000 23 | ConstrainedResidualPPO.l2_reg = False 24 | ConstrainedResidualPPO.reward_threshold = -0.875 25 | ConstrainedResidualPPO.env_fn = @make_env 26 | ConstrainedResidualPPO.policy_fn = @ppo_policy_fn 27 | ConstrainedResidualPPO.nenv = 64 28 | ConstrainedResidualPPO.eval_num_episodes = 100 29 | ConstrainedResidualPPO.record_num_episodes = 0 30 | ConstrainedResidualPPO.rollout_length = 1024 31 | ConstrainedResidualPPO.batch_size = 512 32 | ConstrainedResidualPPO.gamma = 0.99 33 | ConstrainedResidualPPO.lambda_ = 0.95 34 | ConstrainedResidualPPO.norm_advantages = True 35 | ConstrainedResidualPPO.optimizer = @optim.Adam 36 | ConstrainedResidualPPO.clip_param = 0.2 37 | ConstrainedResidualPPO.epochs_per_rollout = 4 38 | ConstrainedResidualPPO.max_grad_norm = 0.5 39 | ConstrainedResidualPPO.ent_coef = 0.01 40 | ConstrainedResidualPPO.vf_coef = 0.5 41 | ConstrainedResidualPPO.gpu = True 42 | ConstrainedResidualPPO.base_actor_cls = @NoisyActor 43 | 44 | NoisyActor.actor_cls = @DroneReacherActor 45 | NoisyActor.eps = 0.5 46 | DroneReacherActor.device = 'cuda:0' 47 | DroneReacherActor.logdir = './models/drone_reacher_agent' 48 | 49 | 50 | Checkpointer.ckpt_period = 100000 51 | 52 | make_env.env_id = "DroneStabilizer-v0" 53 | make_env.norm_observations = True 54 | 55 | VecObsNormWrapper.steps = 10000 56 | VecObsNormWrapper.mean = @drone_bc_mean() 57 | VecObsNormWrapper.std = @drone_bc_std() 58 | VecObsNormWrapper.eps = 1e-2 59 | VecObsNormWrapper.log = True 60 | VecObsNormWrapper.log_prob = 0.01 61 | 62 | # Model Params 63 | ppo_policy_fn.nunits = 128 64 | DiagGaussian.constant_log_std = False 65 | -------------------------------------------------------------------------------- /ppo/configs/ppo_drone_laggy.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.ppo 2 | import residual_shared_autonomy.drone_sim 3 | import residual_shared_autonomy.imitation_learning 4 | 5 | train.algorithm = @ConstrainedResidualPPO 6 | train.maxt = 100000000 7 | train.seed = 0 8 | train.eval = True 9 | 
train.eval_period = 1000000 10 | train.save_period = 1000000 11 | train.maxseconds = None 12 | 13 | optim.Adam.lr = 0.001 14 | optim.Adam.betas = (0.9, 0.999) 15 | optim.Adam.eps = 1e-5 16 | 17 | ConstrainedResidualPPO.policy_training_start = 500000 18 | ConstrainedResidualPPO.lambda_training_start = 5000000 19 | ConstrainedResidualPPO.lambda_lr = 0.001 20 | ConstrainedResidualPPO.lambda_init = 10 21 | ConstrainedResidualPPO.lr_decay_rate = 0.31622776601 22 | ConstrainedResidualPPO.lr_decay_freq = 20000000 23 | ConstrainedResidualPPO.l2_reg = False 24 | ConstrainedResidualPPO.reward_threshold = -0.875 25 | ConstrainedResidualPPO.env_fn = @make_env 26 | ConstrainedResidualPPO.policy_fn = @ppo_policy_fn 27 | ConstrainedResidualPPO.nenv = 64 28 | ConstrainedResidualPPO.eval_num_episodes = 100 29 | ConstrainedResidualPPO.record_num_episodes = 0 30 | ConstrainedResidualPPO.rollout_length = 1024 31 | ConstrainedResidualPPO.batch_size = 512 32 | ConstrainedResidualPPO.gamma = 0.99 33 | ConstrainedResidualPPO.lambda_ = 0.95 34 | ConstrainedResidualPPO.norm_advantages = True 35 | ConstrainedResidualPPO.optimizer = @optim.Adam 36 | ConstrainedResidualPPO.clip_param = 0.2 37 | ConstrainedResidualPPO.epochs_per_rollout = 4 38 | ConstrainedResidualPPO.max_grad_norm = 0.5 39 | ConstrainedResidualPPO.ent_coef = 0.01 40 | ConstrainedResidualPPO.vf_coef = 0.5 41 | ConstrainedResidualPPO.gpu = True 42 | ConstrainedResidualPPO.base_actor_cls = @LaggyActor 43 | 44 | LaggyActor.actor_cls = @DroneReacherActor 45 | LaggyActor.repeat_prob = 0.8 46 | DroneReacherActor.device = 'cuda:0' 47 | DroneReacherActor.logdir = './models/drone_reacher_agent' 48 | 49 | 50 | Checkpointer.ckpt_period = 100000 51 | 52 | make_env.env_id = "DroneStabilizer-v0" 53 | make_env.norm_observations = True 54 | 55 | VecObsNormWrapper.steps = 10000 56 | VecObsNormWrapper.mean = @drone_bc_mean() 57 | VecObsNormWrapper.std = @drone_bc_std() 58 | VecObsNormWrapper.eps = 1e-2 59 | VecObsNormWrapper.log = True 60 | VecObsNormWrapper.log_prob = 0.01 61 | 62 | # Model Params 63 | ppo_policy_fn.nunits = 128 64 | DiagGaussian.constant_log_std = False 65 | -------------------------------------------------------------------------------- /ppo/configs/ppo_drone.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.ppo 2 | import residual_shared_autonomy.drone_sim 3 | import residual_shared_autonomy.imitation_learning 4 | 5 | train.algorithm = @ConstrainedResidualPPO 6 | train.maxt = 100000000 7 | train.seed = 0 8 | train.eval = True 9 | train.eval_period = 1000000 10 | train.save_period = 1000000 11 | train.maxseconds = None 12 | 13 | optim.Adam.lr = 0.001 14 | optim.Adam.betas = (0.9, 0.999) 15 | optim.Adam.eps = 1e-5 16 | 17 | ConstrainedResidualPPO.policy_training_start = 500000 18 | ConstrainedResidualPPO.lambda_training_start = 5000000 19 | ConstrainedResidualPPO.lambda_lr = 0.001 20 | ConstrainedResidualPPO.lambda_init = 10 21 | ConstrainedResidualPPO.lr_decay_rate = 0.31622776601 22 | ConstrainedResidualPPO.lr_decay_freq = 20000000 23 | ConstrainedResidualPPO.l2_reg = False 24 | ConstrainedResidualPPO.reward_threshold = -0.875 25 | ConstrainedResidualPPO.env_fn = @make_env 26 | ConstrainedResidualPPO.policy_fn = @ppo_policy_fn 27 | ConstrainedResidualPPO.nenv = 64 28 | ConstrainedResidualPPO.eval_num_episodes = 100 29 | ConstrainedResidualPPO.record_num_episodes = 0 30 | ConstrainedResidualPPO.rollout_length = 1024 31 | ConstrainedResidualPPO.batch_size = 512 32 | 
ConstrainedResidualPPO.gamma = 0.99 33 | ConstrainedResidualPPO.lambda_ = 0.95 34 | ConstrainedResidualPPO.norm_advantages = True 35 | ConstrainedResidualPPO.optimizer = @optim.Adam 36 | ConstrainedResidualPPO.clip_param = 0.2 37 | ConstrainedResidualPPO.epochs_per_rollout = 4 38 | ConstrainedResidualPPO.max_grad_norm = 0.5 39 | ConstrainedResidualPPO.ent_coef = 0.01 40 | ConstrainedResidualPPO.vf_coef = 0.5 41 | ConstrainedResidualPPO.gpu = True 42 | ConstrainedResidualPPO.base_actor_cls = @BCMultiActor 43 | 44 | BCMultiActor.logdir = "./models/behavioral_cloning_agents_drone" 45 | BCMultiActor.device = "cuda:0" 46 | BCMultiActor.switch_prob = 0.001 47 | 48 | BCNet.ob_shape = 15 49 | BCNet.action_shape = 4 50 | BCNet.nunits = 128 51 | 52 | 53 | Checkpointer.ckpt_period = 100000 54 | 55 | make_env.env_id = "DroneStabilizer-v0" 56 | make_env.norm_observations = True 57 | 58 | VecObsNormWrapper.steps = 10000 59 | VecObsNormWrapper.mean = @drone_bc_mean() 60 | VecObsNormWrapper.std = @drone_bc_std() 61 | VecObsNormWrapper.eps = 1e-2 62 | VecObsNormWrapper.log = True 63 | VecObsNormWrapper.log_prob = 0.01 64 | 65 | # Model Params 66 | ppo_policy_fn.nunits = 128 67 | DiagGaussian.constant_log_std = False 68 | -------------------------------------------------------------------------------- /ppo/configs/ppo_lunar_noisy.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.ppo 2 | import residual_shared_autonomy.imitation_learning 3 | import residual_shared_autonomy.lunar_lander 4 | 5 | train.algorithm = @ConstrainedResidualPPO 6 | train.maxt = 100000000 7 | train.seed = 0 8 | train.eval = True 9 | train.eval_period = 1000000 10 | train.save_period = 1000000 11 | train.maxseconds = None 12 | 13 | optim.Adam.lr = 0.001 14 | optim.Adam.betas = (0.9, 0.999) 15 | optim.Adam.eps = 1e-5 16 | 17 | ConstrainedResidualPPO.policy_training_start = 100000 18 | ConstrainedResidualPPO.lambda_training_start = 2000000 19 | ConstrainedResidualPPO.lambda_lr = 0.001 20 | ConstrainedResidualPPO.lambda_init = 10 21 | ConstrainedResidualPPO.lr_decay_rate = 0.31622776601 22 | ConstrainedResidualPPO.lr_decay_freq = 20000000 23 | ConstrainedResidualPPO.l2_reg = False 24 | ConstrainedResidualPPO.reward_threshold = -50.0 25 | ConstrainedResidualPPO.env_fn = @make_env 26 | ConstrainedResidualPPO.policy_fn = @ppo_policy_fn 27 | ConstrainedResidualPPO.nenv = 32 28 | ConstrainedResidualPPO.eval_num_episodes = 100 29 | ConstrainedResidualPPO.record_num_episodes = 0 30 | ConstrainedResidualPPO.rollout_length = 1024 31 | ConstrainedResidualPPO.batch_size = 256 32 | ConstrainedResidualPPO.gamma = 0.99 33 | ConstrainedResidualPPO.lambda_ = 0.95 34 | ConstrainedResidualPPO.norm_advantages = True 35 | ConstrainedResidualPPO.optimizer = @optim.Adam 36 | ConstrainedResidualPPO.clip_param = 0.2 37 | ConstrainedResidualPPO.epochs_per_rollout = 4 38 | ConstrainedResidualPPO.max_grad_norm = 0.5 39 | ConstrainedResidualPPO.ent_coef = 0.01 40 | ConstrainedResidualPPO.vf_coef = 0.5 41 | ConstrainedResidualPPO.gpu = True 42 | ConstrainedResidualPPO.base_actor_cls = @NoisyActor 43 | ConstrainedResidualPPO.wrapper_fn = @lunar_lander_fuel_wrapper 44 | 45 | NoisyActor.actor_cls = @LunarLanderActor 46 | NoisyActor.eps = 0.5 47 | LunarLanderActor.logdir = "./models/lunar_lander_agent" 48 | LunarLanderActor.device = "cuda:0" 49 | 50 | Checkpointer.ckpt_period = 100000 51 | 52 | make_env.env_id = "LunarLanderRandomContinuousNoGoal-v2" 53 | make_env.norm_observations = True 54 | 55 | 
VecObsNormWrapper.steps = 10000 56 | VecObsNormWrapper.mean = @bc_mean() 57 | VecObsNormWrapper.std = @bc_std() 58 | VecObsNormWrapper.eps = 1e-2 59 | VecObsNormWrapper.log = True 60 | VecObsNormWrapper.log_prob = 0.01 61 | 62 | # Model Params 63 | ppo_policy_fn.nunits = 128 64 | DiagGaussian.constant_log_std = False 65 | -------------------------------------------------------------------------------- /ppo/configs/ppo_lunar_laggy.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.ppo 2 | import residual_shared_autonomy.imitation_learning 3 | import residual_shared_autonomy.lunar_lander 4 | 5 | train.algorithm = @ConstrainedResidualPPO 6 | train.maxt = 100000000 7 | train.seed = 0 8 | train.eval = True 9 | train.eval_period = 1000000 10 | train.save_period = 1000000 11 | train.maxseconds = None 12 | 13 | optim.Adam.lr = 0.001 14 | optim.Adam.betas = (0.9, 0.999) 15 | optim.Adam.eps = 1e-5 16 | 17 | ConstrainedResidualPPO.policy_training_start = 100000 18 | ConstrainedResidualPPO.lambda_training_start = 2000000 19 | ConstrainedResidualPPO.lambda_lr = 0.001 20 | ConstrainedResidualPPO.lambda_init = 10 21 | ConstrainedResidualPPO.lr_decay_rate = 0.31622776601 22 | ConstrainedResidualPPO.lr_decay_freq = 20000000 23 | ConstrainedResidualPPO.l2_reg = False 24 | ConstrainedResidualPPO.reward_threshold = -40.0 25 | ConstrainedResidualPPO.env_fn = @make_env 26 | ConstrainedResidualPPO.policy_fn = @ppo_policy_fn 27 | ConstrainedResidualPPO.nenv = 32 28 | ConstrainedResidualPPO.eval_num_episodes = 100 29 | ConstrainedResidualPPO.record_num_episodes = 0 30 | ConstrainedResidualPPO.rollout_length = 1024 31 | ConstrainedResidualPPO.batch_size = 256 32 | ConstrainedResidualPPO.gamma = 0.99 33 | ConstrainedResidualPPO.lambda_ = 0.95 34 | ConstrainedResidualPPO.norm_advantages = True 35 | ConstrainedResidualPPO.optimizer = @optim.Adam 36 | ConstrainedResidualPPO.clip_param = 0.2 37 | ConstrainedResidualPPO.epochs_per_rollout = 4 38 | ConstrainedResidualPPO.max_grad_norm = 0.5 39 | ConstrainedResidualPPO.ent_coef = 0.01 40 | ConstrainedResidualPPO.vf_coef = 0.5 41 | ConstrainedResidualPPO.gpu = True 42 | ConstrainedResidualPPO.base_actor_cls = @LaggyActor 43 | ConstrainedResidualPPO.wrapper_fn = @lunar_lander_fuel_wrapper 44 | 45 | LaggyActor.actor_cls = @LunarLanderActor 46 | LaggyActor.repeat_prob = 0.8 47 | LunarLanderActor.logdir = "./models/lunar_lander_agent" 48 | LunarLanderActor.device = "cuda:0" 49 | 50 | Checkpointer.ckpt_period = 100000 51 | 52 | make_env.env_id = "LunarLanderRandomContinuousNoGoal-v2" 53 | make_env.norm_observations = True 54 | 55 | VecObsNormWrapper.steps = 10000 56 | VecObsNormWrapper.mean = @bc_mean() 57 | VecObsNormWrapper.std = @bc_std() 58 | VecObsNormWrapper.eps = 1e-2 59 | VecObsNormWrapper.log = True 60 | VecObsNormWrapper.log_prob = 0.01 61 | 62 | # Model Params 63 | ppo_policy_fn.nunits = 128 64 | DiagGaussian.constant_log_std = False 65 | -------------------------------------------------------------------------------- /ppo/configs/ppo_lunar.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.ppo 2 | import residual_shared_autonomy.imitation_learning 3 | import residual_shared_autonomy.lunar_lander 4 | 5 | train.algorithm = @ConstrainedResidualPPO 6 | train.maxt = 100000000 7 | train.seed = 0 8 | train.eval = True 9 | train.eval_period = 1000000 10 | train.save_period = 1000000 11 | train.maxseconds = None 12 | 13 | optim.Adam.lr 
= 0.001 14 | optim.Adam.betas = (0.9, 0.999) 15 | optim.Adam.eps = 1e-5 16 | 17 | ConstrainedResidualPPO.policy_training_start = 100000 18 | ConstrainedResidualPPO.lambda_training_start = 2000000 19 | ConstrainedResidualPPO.lambda_lr = 0.003 20 | ConstrainedResidualPPO.lambda_init = 20 21 | ConstrainedResidualPPO.lr_decay_rate = 0.31622776601 22 | ConstrainedResidualPPO.lr_decay_freq = 20000000 23 | ConstrainedResidualPPO.l2_reg = False 24 | ConstrainedResidualPPO.reward_threshold = -155 25 | ConstrainedResidualPPO.env_fn = @make_env 26 | ConstrainedResidualPPO.policy_fn = @ppo_policy_fn 27 | ConstrainedResidualPPO.nenv = 32 28 | ConstrainedResidualPPO.eval_num_episodes = 100 29 | ConstrainedResidualPPO.record_num_episodes = 0 30 | ConstrainedResidualPPO.rollout_length = 1024 31 | ConstrainedResidualPPO.batch_size = 256 32 | ConstrainedResidualPPO.gamma = 0.99 33 | ConstrainedResidualPPO.lambda_ = 0.95 34 | ConstrainedResidualPPO.norm_advantages = True 35 | ConstrainedResidualPPO.optimizer = @optim.Adam 36 | ConstrainedResidualPPO.clip_param = 0.2 37 | ConstrainedResidualPPO.epochs_per_rollout = 4 38 | ConstrainedResidualPPO.max_grad_norm = 0.5 39 | ConstrainedResidualPPO.ent_coef = 0.01 40 | ConstrainedResidualPPO.vf_coef = 0.5 41 | ConstrainedResidualPPO.gpu = True 42 | ConstrainedResidualPPO.base_actor_cls = @BCMultiActor 43 | ConstrainedResidualPPO.wrapper_fn = @lunar_lander_fuel_wrapper 44 | 45 | BCMultiActor.logdir = "./models/behavioral_cloning_agents_lunar" 46 | BCMultiActor.device = "cuda:0" 47 | BCMultiActor.switch_prob = 0.001 48 | 49 | BCNet.ob_shape = 9 50 | BCNet.action_shape = 2 51 | BCNet.nunits = 128 52 | 53 | 54 | Checkpointer.ckpt_period = 100000 55 | 56 | make_env.env_id = "LunarLanderRandomContinuousNoGoal-v2" 57 | make_env.norm_observations = True 58 | 59 | VecObsNormWrapper.steps = 10000 60 | VecObsNormWrapper.mean = @bc_mean() 61 | VecObsNormWrapper.std = @bc_std() 62 | VecObsNormWrapper.eps = 1e-2 63 | VecObsNormWrapper.log = True 64 | VecObsNormWrapper.log_prob = 0.01 65 | 66 | # Model Params 67 | ppo_policy_fn.nunits = 128 68 | DiagGaussian.constant_log_std = False 69 | -------------------------------------------------------------------------------- /drone_sim/joystick_agent.py: -------------------------------------------------------------------------------- 1 | """Drone Joystick Agent.""" 2 | import pygame 3 | import numpy as np 4 | import time 5 | 6 | 7 | ##################################### 8 | # Change these to match your joystick 9 | THRUST_AXIS = 1 10 | ROLL_AXIS = 3 11 | PITCH_AXIS = 4 12 | YAW_AXIS = 0 13 | ##################################### 14 | 15 | 16 | class DroneJoystickActor(object): 17 | """Joystick Controller for Drone tasks. 18 | 19 | The left stick controls upward force and yaw. 20 | The right stick controls roll and pitch. 21 | """ 22 | 23 | def __init__(self, env, fps=50): 24 | """Init.""" 25 | if env.num_envs > 1: 26 | raise ValueError("Only one env can be controlled with the joystick") 27 | self.env = env 28 | self.human_agent_action = np.zeros((1, 4), dtype=np.float32) 29 | pygame.joystick.init() 30 | joysticks = [pygame.joystick.Joystick(x) 31 | for x in range(pygame.joystick.get_count())] 32 | if len(joysticks) != 1: 33 | raise ValueError("There must be exactly 1 joystick connected. "
34 | f"Found {len(joysticks)}") 35 | self.joy = joysticks[0] 36 | self.joy.init() 37 | pygame.init() 38 | self.axes = [THRUST_AXIS, ROLL_AXIS, PITCH_AXIS, YAW_AXIS] 39 | self.t = None 40 | self.fps = fps 41 | 42 | def _get_human_action(self): 43 | for event in pygame.event.get(): 44 | if event.type == pygame.JOYAXISMOTION: 45 | for i, ax in enumerate(self.axes): 46 | if event.axis == ax: 47 | self.human_agent_action[0, i] = event.value 48 | break 49 | return self.human_agent_action 50 | 51 | def __call__(self, ob): 52 | """Act.""" 53 | if self.t and (time.time() - self.t) < 1. / self.fps: 54 | st = 1. / self.fps - (time.time() - self.t) 55 | if st > 0.: 56 | time.sleep(st) 57 | self.t = time.time() 58 | self.env.render() 59 | action = self._get_human_action() 60 | return action 61 | 62 | def reset(self): 63 | self.human_agent_action[:] = 0. 64 | 65 | 66 | if __name__ == '__main__': 67 | import gym 68 | import residual_shared_autonomy.drone_sim 69 | from dl.rl import ensure_vec_env 70 | 71 | env = gym.make("DroneReacherHuman-v0") 72 | env = ensure_vec_env(env) 73 | 74 | actor = DroneJoystickActor(env) 75 | 76 | for _ in range(10): 77 | ob = env.reset() 78 | actor.reset() 79 | env.render() 80 | done = False 81 | reward = 0.0 82 | 83 | while not done: 84 | ob, r, done, _ = env.step(actor(ob)) 85 | reward += r 86 | print(reward) 87 | -------------------------------------------------------------------------------- /lunar_lander/joystick_agent.py: -------------------------------------------------------------------------------- 1 | """LunarLander Joystick Agent.""" 2 | import pygame 3 | import numpy as np 4 | import time 5 | 6 | ##################################### 7 | # Change these to match your joystick 8 | UP_AXIS = 4 9 | SIDE_AXIS = 3 10 | ##################################### 11 | 12 | 13 | class LunarLanderJoystickActor(object): 14 | """Joystick Controller for Lunar Lander.""" 15 | 16 | def __init__(self, env, fps=50): 17 | """Init.""" 18 | if env.num_envs > 1: 19 | raise ValueError("Only one env can be controlled with the joystick.") 20 | self.env = env 21 | self.human_agent_action = np.array([[0., 0.]], dtype=np.float32) # noop 22 | pygame.joystick.init() 23 | joysticks = [pygame.joystick.Joystick(x) 24 | for x in range(pygame.joystick.get_count())] 25 | if len(joysticks) != 1: 26 | raise ValueError("There must be exactly 1 joystick connected." 27 | f"Found {len(joysticks)}") 28 | self.joy = joysticks[0] 29 | self.joy.init() 30 | pygame.init() 31 | self.t = None 32 | self.fps = fps 33 | 34 | def _get_human_action(self): 35 | for event in pygame.event.get(): 36 | if event.type == pygame.JOYAXISMOTION: 37 | if event.axis == SIDE_AXIS: 38 | self.human_agent_action[0, 1] = event.value 39 | elif event.axis == UP_AXIS: 40 | self.human_agent_action[0, 0] = -1.0 * event.value 41 | if abs(self.human_agent_action[0, 0]) < 0.1: 42 | self.human_agent_action[0, 0] = 0.0 43 | return self.human_agent_action 44 | 45 | def __call__(self, ob): 46 | """Act.""" 47 | self.env.render() 48 | action = self._get_human_action() 49 | if self.t and (time.time() - self.t) < 1. / self.fps: 50 | st = 1. / self.fps - (time.time() - self.t) 51 | if st > 0.: 52 | time.sleep(st) 53 | self.t = time.time() 54 | return action 55 | 56 | def reset(self): 57 | self.human_agent_action[:] = 0. 
58 | 59 | 60 | if __name__ == '__main__': 61 | import gym 62 | import residual_shared_autonomy.lunar_lander 63 | from dl.rl import ensure_vec_env 64 | import argparse 65 | 66 | parser = argparse.ArgumentParser(description='play') 67 | parser.add_argument('--reacher', action='store_true', help='play lunar reacher') 68 | args = parser.parse_args() 69 | 70 | 71 | if args.reacher: 72 | env = gym.make("LunarLanderReacher-v2") 73 | else: 74 | env = gym.make("LunarLanderRandomContinuous-v2") 75 | env = ensure_vec_env(env) 76 | 77 | actor = LunarLanderJoystickActor(env) 78 | 79 | for _ in range(10): 80 | ob = env.reset() 81 | env.render() 82 | done = False 83 | reward = 0.0 84 | 85 | while not done: 86 | ob, r, done, _ = env.step(actor(ob)) 87 | reward += r 88 | print(reward) 89 | -------------------------------------------------------------------------------- /lunar_lander/fuel_wrapper.py: -------------------------------------------------------------------------------- 1 | from baselines.common.vec_env import VecEnvWrapper 2 | from residual_shared_autonomy import ResidualWrapper 3 | from residual_shared_autonomy.lunar_lander import env_lander 4 | import numpy as np 5 | import gin 6 | 7 | 8 | class LunarLanderFuelWrapper(VecEnvWrapper): 9 | """Wrap the ResidualWrapper to modify fuel cost for the co-pilot. 10 | 11 | Don't make the co-pilot pay for fuel that the pilot wants to use. 12 | """ 13 | 14 | def __init__(self, venv): 15 | assert isinstance(venv, ResidualWrapper) 16 | super().__init__(venv) 17 | 18 | def step(self, actions): 19 | pilot_actions = self.venv._action 20 | applied_actions = self.venv._add_actions(np.asarray(actions), 21 | pilot_actions) 22 | m_pow_pilot, s_pow_pilot = self._get_power_from_action(pilot_actions) 23 | m_pow, s_pow = self._get_power_from_action(applied_actions) 24 | 25 | obs, rews, dones, infos = self.venv.step(actions) 26 | 27 | fuel_diff = self._fuel_cost(m_pow - m_pow_pilot, s_pow - s_pow_pilot) 28 | 29 | # print(fuel_diff.shape, rews.shape) 30 | # print(actions, pilot_actions) 31 | # print(fuel_diff, m_pow, m_pow_pilot, s_pow, s_pow_pilot) 32 | # print(lunar_lander_gym.FUEL_COST_MAIN) 33 | 34 | # Don't incentivise co-pilot for reducing the fuel costs of pilot. 35 | # The co pilot is only incentivised to not add extra fuel costs. 36 | rews = rews + np.minimum(0.0, fuel_diff) 37 | return obs, rews, dones, infos 38 | 39 | def _get_power_from_action(self, ac): 40 | main, side = ac[:, 0], ac[:, 1] 41 | main_on = main > 0. 
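        # The thresholds and scaling below mirror the engine model of Gym's
        # continuous LunarLander: the main engine fires only when main > 0,
        # with power scaled into [0.5, 1.0], and a side engine fires only when
        # |side| > 0.5, with power clipped into [0.5, 1.0].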
42 | side_on = np.abs(side) > 0.5 43 | m_power = main_on * (np.clip(main, 0.0, 1.0) + 1.0) * 0.5 44 | s_power = side_on * np.clip(np.abs(side), 0.5, 1.0) 45 | return m_power, s_power 46 | 47 | def _fuel_cost(self, main_pow, side_pow): 48 | return (env_lander.FUEL_COST_MAIN * main_pow 49 | + env_lander.FUEL_COST_SIDE * side_pow) 50 | 51 | def reset(self): 52 | """Reset.""" 53 | return self.venv.reset() 54 | 55 | def step_wait(self): 56 | """Step wait.""" 57 | return self.venv.step_wait() 58 | 59 | 60 | @gin.configurable 61 | def lunar_lander_fuel_wrapper(env): 62 | return LunarLanderFuelWrapper(env) 63 | 64 | 65 | if __name__ == '__main__': 66 | import unittest 67 | import gym 68 | from dl.rl import ensure_vec_env, VecFrameStack 69 | 70 | class RandomActor(object): 71 | """Output random actions.""" 72 | 73 | def __init__(self, action_space): 74 | """Init.""" 75 | self.action_space = action_space 76 | 77 | def __call__(self, ob): 78 | """Act.""" 79 | batch_size = ob.shape[0] 80 | return np.asarray([self.action_space.sample() 81 | for _ in range(batch_size)]) 82 | 83 | class Test(unittest.TestCase): 84 | """Tests.""" 85 | 86 | def test_vec_env_wrapper(self): 87 | """Test.""" 88 | env = gym.make("LunarLanderContinuous-v2") 89 | env = ensure_vec_env(env) 90 | actor = RandomActor(env.action_space) 91 | env = ResidualWrapper(env, actor) 92 | env = LunarLanderFuelWrapper(env) 93 | env = VecFrameStack(env, 4) 94 | ob, ac = env.reset() 95 | assert ac.shape == (1, 4*env.action_space.shape[0]) 96 | assert ob.shape == (1, *env.observation_space.spaces[0].shape) 97 | assert ac.shape == (1, *env.observation_space.spaces[1].shape) 98 | 99 | for _ in range(10): 100 | residual_ac = [env.action_space.sample()] 101 | (ob, ac_next), _, _, infos = env.step(residual_ac) 102 | rac = np.minimum(np.maximum(residual_ac[0] + ac[0][-2:], -1), 1) 103 | assert np.allclose(infos[0]['action'], rac) 104 | ac = ac_next 105 | 106 | unittest.main() 107 | -------------------------------------------------------------------------------- /ppo/models/pretrained_drone/config.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.drone_sim 2 | import residual_shared_autonomy.imitation_learning 3 | import residual_shared_autonomy.ppo 4 | 5 | # Parameters for Adam: 6 | # ============================================================================== 7 | Adam.amsgrad = False 8 | Adam.betas = (0.9, 0.999) 9 | Adam.eps = 1e-05 10 | Adam.lr = 0.001 11 | Adam.weight_decay = 0 12 | 13 | # Parameters for BCMultiActor: 14 | # ============================================================================== 15 | BCMultiActor.device = 'cuda:0' 16 | BCMultiActor.logdir = \ 17 | '/home-nfs/cbschaff/projects/residual_shared_autonomy/ppo/bc_agents_drone' 18 | BCMultiActor.switch_prob = 0.001 19 | 20 | # Parameters for BCNet: 21 | # ============================================================================== 22 | BCNet.action_shape = 4 23 | BCNet.nunits = 128 24 | BCNet.ob_shape = 15 25 | 26 | # Parameters for Checkpointer: 27 | # ============================================================================== 28 | Checkpointer.ckpt_period = 100000 29 | Checkpointer.format = '{:09d}' 30 | 31 | # Parameters for ConstrainedResidualPPO: 32 | # ============================================================================== 33 | ConstrainedResidualPPO.base_actor_cls = @BCMultiActor 34 | ConstrainedResidualPPO.batch_size = 512 35 | ConstrainedResidualPPO.clip_param = 0.2 36 | 
ConstrainedResidualPPO.ent_coef = 0.01 37 | ConstrainedResidualPPO.env_fn = @make_env 38 | ConstrainedResidualPPO.epochs_per_rollout = 4 39 | ConstrainedResidualPPO.eval_num_episodes = 100 40 | ConstrainedResidualPPO.gamma = 0.99 41 | ConstrainedResidualPPO.gpu = True 42 | ConstrainedResidualPPO.l2_reg = False 43 | ConstrainedResidualPPO.lambda_ = 0.95 44 | ConstrainedResidualPPO.lambda_init = 10.0 45 | ConstrainedResidualPPO.lambda_lr = 0.001 46 | ConstrainedResidualPPO.lambda_training_start = 5000000 47 | ConstrainedResidualPPO.lr_decay_freq = 20000000 48 | ConstrainedResidualPPO.lr_decay_rate = 0.31622776601 49 | ConstrainedResidualPPO.max_grad_norm = 0.5 50 | ConstrainedResidualPPO.nenv = 64 51 | ConstrainedResidualPPO.norm_advantages = True 52 | ConstrainedResidualPPO.optimizer = @optim.Adam 53 | ConstrainedResidualPPO.policy_fn = @ppo_policy_fn 54 | ConstrainedResidualPPO.policy_training_start = 500000 55 | ConstrainedResidualPPO.record_num_episodes = 5 56 | ConstrainedResidualPPO.reward_threshold = -0.875 57 | ConstrainedResidualPPO.rollout_length = 1024 58 | ConstrainedResidualPPO.vf_coef = 0.5 59 | ConstrainedResidualPPO.wrapper_fn = None 60 | 61 | # Parameters for DiagGaussian: 62 | # ============================================================================== 63 | DiagGaussian.constant_log_std = False 64 | DiagGaussian.log_std_max = 2 65 | DiagGaussian.log_std_min = -20 66 | 67 | # Parameters for drone_bc_mean: 68 | # ============================================================================== 69 | # None. 70 | 71 | # Parameters for drone_bc_std: 72 | # ============================================================================== 73 | # None. 74 | 75 | # Parameters for make_env: 76 | # ============================================================================== 77 | make_env.env_id = 'DroneStabilizer-v0' 78 | make_env.norm_observations = True 79 | make_env.seed = 0 80 | 81 | # Parameters for Policy: 82 | # ============================================================================== 83 | # None. 
84 | 85 | # Parameters for ppo_policy_fn: 86 | # ============================================================================== 87 | ppo_policy_fn.nunits = 128 88 | 89 | # Parameters for train: 90 | # ============================================================================== 91 | train.algorithm = @ConstrainedResidualPPO 92 | train.eval = True 93 | train.eval_period = 1000000 94 | train.maxseconds = None 95 | train.maxt = 100000000 96 | train.save_period = 1000000 97 | train.seed = 0 98 | 99 | # Parameters for VecObsNormWrapper: 100 | # ============================================================================== 101 | VecObsNormWrapper.eps = 0.01 102 | VecObsNormWrapper.log = True 103 | VecObsNormWrapper.log_prob = 0.01 104 | VecObsNormWrapper.mean = @drone_bc_mean() 105 | VecObsNormWrapper.std = @drone_bc_std() 106 | VecObsNormWrapper.steps = 10000 107 | -------------------------------------------------------------------------------- /ppo/models/pretrained_lunar/config.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.lunar_lander 2 | import residual_shared_autonomy.imitation_learning 3 | import residual_shared_autonomy.ppo 4 | 5 | # Parameters for Adam: 6 | # ============================================================================== 7 | Adam.amsgrad = False 8 | Adam.betas = (0.9, 0.999) 9 | Adam.eps = 1e-05 10 | Adam.lr = 0.001 11 | Adam.weight_decay = 0 12 | 13 | # Parameters for bc_mean: 14 | # ============================================================================== 15 | # None. 16 | 17 | # Parameters for bc_std: 18 | # ============================================================================== 19 | # None. 20 | 21 | # Parameters for BCMultiActor: 22 | # ============================================================================== 23 | BCMultiActor.device = 'cuda:0' 24 | BCMultiActor.logdir = \ 25 | '/home-nfs/cbschaff/projects/residual_shared_autonomy/ppo/bc_agents_no_terrain' 26 | BCMultiActor.switch_prob = 0.001 27 | 28 | # Parameters for BCNet: 29 | # ============================================================================== 30 | BCNet.action_shape = 2 31 | BCNet.nunits = 128 32 | BCNet.ob_shape = 9 33 | 34 | # Parameters for Checkpointer: 35 | # ============================================================================== 36 | Checkpointer.ckpt_period = 100000 37 | Checkpointer.format = '{:09d}' 38 | 39 | # Parameters for ConstrainedResidualPPO: 40 | # ============================================================================== 41 | ConstrainedResidualPPO.base_actor_cls = @BCMultiActor 42 | ConstrainedResidualPPO.batch_size = 256 43 | ConstrainedResidualPPO.clip_param = 0.2 44 | ConstrainedResidualPPO.ent_coef = 0.01 45 | ConstrainedResidualPPO.env_fn = @make_env 46 | ConstrainedResidualPPO.epochs_per_rollout = 4 47 | ConstrainedResidualPPO.eval_num_episodes = 100 48 | ConstrainedResidualPPO.gamma = 0.99 49 | ConstrainedResidualPPO.gpu = True 50 | ConstrainedResidualPPO.l2_reg = False 51 | ConstrainedResidualPPO.lambda_ = 0.95 52 | ConstrainedResidualPPO.lambda_init = 20.0 53 | ConstrainedResidualPPO.lambda_lr = 0.003 54 | ConstrainedResidualPPO.lambda_training_start = 2000000 55 | ConstrainedResidualPPO.lr_decay_freq = 20000000 56 | ConstrainedResidualPPO.lr_decay_rate = 0.31622776601 57 | ConstrainedResidualPPO.max_grad_norm = 0.5 58 | ConstrainedResidualPPO.nenv = 32 59 | ConstrainedResidualPPO.norm_advantages = True 60 | ConstrainedResidualPPO.optimizer = @optim.Adam 61 | 
ConstrainedResidualPPO.policy_fn = @ppo_policy_fn 62 | ConstrainedResidualPPO.policy_training_start = 100000 63 | ConstrainedResidualPPO.record_num_episodes = 5 64 | ConstrainedResidualPPO.reward_threshold = -155.0 65 | ConstrainedResidualPPO.vf_coef = 0.5 66 | ConstrainedResidualPPO.wrapper_fn = @lunar_lander_fuel_wrapper 67 | 68 | # Parameters for DiagGaussian: 69 | # ============================================================================== 70 | DiagGaussian.constant_log_std = False 71 | DiagGaussian.log_std_max = 2 72 | DiagGaussian.log_std_min = -20 73 | 74 | # Parameters for lunar_lander_fuel_wrapper: 75 | # ============================================================================== 76 | # None. 77 | 78 | # Parameters for make_env: 79 | # ============================================================================== 80 | make_env.env_id = 'LunarLanderRandomContinuousNoGoal-v2' 81 | make_env.norm_observations = True 82 | make_env.seed = 0 83 | 84 | # Parameters for Policy: 85 | # ============================================================================== 86 | # None. 87 | 88 | # Parameters for ppo_policy_fn: 89 | # ============================================================================== 90 | ppo_policy_fn.nunits = 128 91 | 92 | # Parameters for train: 93 | # ============================================================================== 94 | train.algorithm = @ConstrainedResidualPPO 95 | train.eval = True 96 | train.eval_period = 1000000 97 | train.maxseconds = None 98 | train.maxt = 100000000 99 | train.save_period = 1000000 100 | train.seed = 0 101 | 102 | # Parameters for VecObsNormWrapper: 103 | # ============================================================================== 104 | VecObsNormWrapper.eps = 0.01 105 | VecObsNormWrapper.log = True 106 | VecObsNormWrapper.log_prob = 0.01 107 | VecObsNormWrapper.mean = @bc_mean() 108 | VecObsNormWrapper.std = @bc_std() 109 | VecObsNormWrapper.steps = 10000 110 | -------------------------------------------------------------------------------- /imitation_learning/trainer.py: -------------------------------------------------------------------------------- 1 | """Defines trainer and network for MNIST.""" 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import gin 6 | import time 7 | from dl import logger, Checkpointer, nest 8 | from dl.util import StatefulSampler 9 | from dl.modules import DiagGaussian 10 | from torch.utils.data import DataLoader 11 | import os 12 | 13 | 14 | @gin.configurable 15 | class BCNet(nn.Module): 16 | """Imitation network.""" 17 | 18 | def __init__(self, ob_shape, action_shape, nunits): 19 | """Init.""" 20 | super().__init__() 21 | self.nunits = nunits 22 | self.fc1 = nn.Linear(ob_shape, self.nunits) 23 | self.fc2 = nn.Linear(self.nunits, self.nunits) 24 | self.dist = DiagGaussian(self.nunits, action_shape) 25 | self.ob_shape = ob_shape 26 | 27 | def forward(self, ob): 28 | """Forward.""" 29 | if ob.shape[-1] > self.ob_shape: 30 | # HACK. While iterating on the project, I changed the 31 | # observation space of the environments. To use human 32 | # data collected before that, I manually crop out 33 | # newer observation data. 
34 | ob = ob[:, :self.ob_shape] 35 | net = F.relu(self.fc1(ob)) 36 | net = F.relu(self.fc2(net)) 37 | return self.dist(net) 38 | 39 | 40 | @gin.configurable(blacklist=['datafile']) 41 | class DemonstrationData(object): 42 | def __init__(self, datafile, mean=None, std=None): 43 | data = torch.load(datafile) 44 | self.obs = data['obs'] 45 | self.actions = data['actions'] 46 | self.n = self.obs.shape[0] 47 | if mean is None or std is None: 48 | self.mean = self.obs.mean(dim=0) 49 | self.std = self.obs.std(dim=0) 50 | else: 51 | self.mean = torch.from_numpy(mean) 52 | self.std = torch.from_numpy(std) 53 | self.obs = (self.obs - self.mean) / (self.std + 1e-5) 54 | 55 | def __len__(self): 56 | return self.n 57 | 58 | def __getitem__(self, idx): 59 | return self.obs[idx], self.actions[idx] 60 | 61 | 62 | @gin.configurable(blacklist=['logdir']) 63 | class BCTrainer(object): 64 | """Behavioral cloning.""" 65 | 66 | def __init__(self, logdir, model, opt, datafile, batch_size, num_workers, 67 | gpu=True): 68 | self.logdir = logdir 69 | self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts')) 70 | self.data = DemonstrationData(datafile) 71 | self.sampler = StatefulSampler(self.data, shuffle=True) 72 | self.dtrain = DataLoader(self.data, sampler=self.sampler, 73 | batch_size=batch_size, 74 | num_workers=num_workers) 75 | self._diter = None 76 | self.t = 0 77 | self.epochs = 0 78 | self.batch_size = batch_size 79 | 80 | self.device = torch.device('cuda:0' if gpu and torch.cuda.is_available() 81 | else 'cpu') 82 | self.model = model 83 | self.model.to(self.device) 84 | self.opt = opt(self.model.parameters()) 85 | 86 | def step(self): 87 | # Get batch. 88 | if self._diter is None: 89 | self._diter = self.dtrain.__iter__() 90 | try: 91 | batch = self._diter.__next__() 92 | except StopIteration: 93 | self.epochs += 1 94 | self._diter = None 95 | return self.epochs 96 | batch = nest.map_structure(lambda x: x.to(self.device), batch) 97 | 98 | # compute loss 99 | ob, ac = batch 100 | self.model.train() 101 | loss = -self.model(ob).log_prob(ac).mean() 102 | 103 | logger.add_scalar('train/loss', loss.detach().cpu().numpy(), 104 | self.t, time.time()) 105 | 106 | # update model 107 | self.opt.zero_grad() 108 | loss.backward() 109 | self.opt.step() 110 | 111 | # increment step 112 | self.t += min(len(self.data) - (self.t % len(self.data)), 113 | self.batch_size) 114 | return self.epochs 115 | 116 | def evaluate(self): 117 | """Evaluate model.""" 118 | self.model.eval() 119 | 120 | nll = 0. 
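        # Total negative log-likelihood of the demonstration actions under the
        # current policy, accumulated over the full training set; the
        # per-sample average is logged below as well.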
121 | with torch.no_grad(): 122 | for batch in self.dtrain: 123 | ob, ac = nest.map_structure(lambda x: x.to(self.device), batch) 124 | nll += -self.model(ob).log_prob(ac).sum() 125 | avg_nll = nll / len(self.data) 126 | 127 | logger.add_scalar('train/NLL', nll, self.epochs, time.time()) 128 | logger.add_scalar('train/AVG_NLL', avg_nll, self.epochs, 129 | time.time()) 130 | 131 | def save(self): 132 | state_dict = {} 133 | state_dict['model'] = self.model.state_dict() 134 | state_dict['opt'] = self.opt.state_dict() 135 | state_dict['sampler'] = self.sampler.state_dict(self._diter) 136 | state_dict['t'] = self.t 137 | state_dict['epochs'] = self.epochs 138 | self.ckptr.save(state_dict, self.t) 139 | 140 | def load(self, t=None): 141 | state_dict = self.ckptr.load() 142 | if state_dict is None: 143 | self.t = 0 144 | self.epochs = 0 145 | return self.epochs 146 | self.model.load_state_dict(state_dict['model']) 147 | self.opt.load_state_dict(state_dict['opt']) 148 | self.sampler.load_state_dict(state_dict['sampler']) 149 | self.t = state_dict['t'] 150 | self.epochs = state_dict['epochs'] 151 | if self._diter is not None: 152 | self._diter.__del__() 153 | self._diter = None 154 | 155 | def close(self): 156 | """Close data iterator.""" 157 | if self._diter is not None: 158 | self._diter.__del__() 159 | self._diter = None 160 | -------------------------------------------------------------------------------- /residual_wrapper.py: -------------------------------------------------------------------------------- 1 | """Residual Policy environment wrapper.""" 2 | from gym.spaces import Tuple, Box 3 | from baselines.common.vec_env import VecEnvWrapper 4 | import numpy as np 5 | 6 | 7 | class ResidualWrapper(VecEnvWrapper): 8 | """Wrapper for residual policy learning. 9 | 10 | https://arxiv.org/abs/1812.06298 11 | 12 | Requires a callable which returns an action. The chosen action is added to 13 | the observation. 
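
    At each step the wrapper queries act_fn for a base action a, adds the
    learned residual r, and applies clip(a + r, low, high) to the wrapped
    environment. The observation passed to the learner is the tuple
    (ob, 2 * (a - low) / (high - low) - 1), i.e. the base action rescaled to
    [-1, 1].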
14 | """ 15 | 16 | def __init__(self, venv, act_fn): 17 | """Init.""" 18 | super().__init__(venv) 19 | self.act_fn = act_fn 20 | self._ob = None 21 | if not isinstance(self.action_space, Box): 22 | raise ValueError("ResidualWrapper can only be used with continuous " 23 | "action spaces.") 24 | self.observation_space = Tuple([self.observation_space, 25 | self.action_space]) 26 | bound = self.action_space.high - self.action_space.low 27 | self.action_space = Box(-bound, bound) 28 | self._action = None 29 | 30 | def reset(self): 31 | """Reset.""" 32 | ob = self.venv.reset() 33 | self._action = np.asarray(self.act_fn(ob)) 34 | if hasattr(self.act_fn, 'reset'): 35 | # reset joystick to zero action 36 | self.act_fn.reset() 37 | return (ob, self._norm_action(self._action)) 38 | 39 | def step(self, action): 40 | """Step.""" 41 | action = self._add_actions(np.asarray(action), self._action) 42 | ob, rs, dones, infos = self.venv.step(action) 43 | for i, info in enumerate(infos): 44 | info['action'] = action[i] 45 | info['assistant_action'] = np.asarray(action)[i] 46 | info['player_action'] = self._action[i] 47 | self._action = self.act_fn(ob) 48 | return (ob, self._norm_action(self._action)), rs, dones, infos 49 | 50 | def _clip_action(self, ac): 51 | return np.maximum(np.minimum(ac, self.venv.action_space.high), 52 | self.venv.action_space.low) 53 | 54 | def _add_actions(self, ac1, ac2): 55 | return self._clip_action(ac1 + ac2) 56 | 57 | def _norm_action(self, ac): 58 | high = self.venv.action_space.high 59 | low = self.venv.action_space.low 60 | return 2 * (ac - low) / (high - low) - 1.0 61 | 62 | def step_wait(self): 63 | """Step wait.""" 64 | return self.venv.step_wait() 65 | 66 | 67 | if __name__ == '__main__': 68 | import unittest 69 | import gym 70 | from dl.rl import ensure_vec_env, VecFrameStack 71 | 72 | class ZeroActor(object): 73 | """Output zeros.""" 74 | 75 | def __init__(self, action_space): 76 | """Init.""" 77 | self.action_space = action_space 78 | 79 | def __call__(self, ob): 80 | """Act.""" 81 | batch_size = ob.shape[0] 82 | return np.zeros_like([self.action_space.sample() 83 | for _ in range(batch_size)]) 84 | 85 | class RandomActor(object): 86 | """Output random actions.""" 87 | 88 | def __init__(self, action_space): 89 | """Init.""" 90 | self.action_space = action_space 91 | 92 | def __call__(self, ob): 93 | """Act.""" 94 | batch_size = ob.shape[0] 95 | return np.asarray([self.action_space.sample() 96 | for _ in range(batch_size)]) 97 | 98 | class Test(unittest.TestCase): 99 | """Tests.""" 100 | 101 | def test_zero_actor(self): 102 | """Test.""" 103 | env = gym.make("LunarLanderContinuous-v2") 104 | env = ensure_vec_env(env) 105 | actor = ZeroActor(env.action_space) 106 | env = ResidualWrapper(env, actor) 107 | ob, ac = env.reset() 108 | assert np.allclose(ac, 0) 109 | assert ac.shape == (1, *env.action_space.shape) 110 | assert ob.shape == (1, *env.observation_space.spaces[0].shape) 111 | assert ac.shape == (1, *env.observation_space.spaces[1].shape) 112 | assert isinstance(env.observation_space, Tuple) 113 | 114 | residual_ac = [env.action_space.sample()] 115 | (ob, ac), _, _, infos = env.step(residual_ac) 116 | rac = np.minimum(np.maximum(residual_ac[0], -1), 1) 117 | assert np.allclose(infos[0]['action'], rac) 118 | assert np.allclose(ac, 0) 119 | assert ac.shape == (1, *env.action_space.shape) 120 | assert ob.shape == (1, *env.observation_space.spaces[0].shape) 121 | assert ac.shape == (1, *env.observation_space.spaces[1].shape) 122 | 123 | def test_random_actor(self): 124 
| """Test.""" 125 | env = gym.make("LunarLanderContinuous-v2") 126 | env = ensure_vec_env(env) 127 | actor = RandomActor(env.action_space) 128 | env = ResidualWrapper(env, actor) 129 | ob, ac = env.reset() 130 | assert ac.shape == (1, *env.action_space.shape) 131 | assert ob.shape == (1, *env.observation_space.spaces[0].shape) 132 | assert ac.shape == (1, *env.observation_space.spaces[1].shape) 133 | assert isinstance(env.observation_space, Tuple) 134 | 135 | for _ in range(10): 136 | residual_ac = [env.action_space.sample()] 137 | (ob, ac_next), _, _, infos = env.step(residual_ac) 138 | rac = np.minimum(np.maximum(residual_ac[0] + ac[0], -1), 1) 139 | assert np.allclose(infos[0]['action'], rac) 140 | ac = ac_next 141 | 142 | def test_vec_env_wrapper(self): 143 | """Test.""" 144 | env = gym.make("LunarLanderContinuous-v2") 145 | env = ensure_vec_env(env) 146 | actor = RandomActor(env.action_space) 147 | env = ResidualWrapper(env, actor) 148 | env = VecFrameStack(env, 4) 149 | ob, ac = env.reset() 150 | assert ac.shape == (1, 4*env.action_space.shape[0]) 151 | assert ob.shape == (1, *env.observation_space.spaces[0].shape) 152 | assert ac.shape == (1, *env.observation_space.spaces[1].shape) 153 | assert isinstance(env.observation_space, Tuple) 154 | 155 | for _ in range(10): 156 | residual_ac = [env.action_space.sample()] 157 | (ob, ac_next), _, _, infos = env.step(residual_ac) 158 | rac = np.minimum(np.maximum(residual_ac[0] + ac[0][-2:], -1), 1) 159 | assert np.allclose(infos[0]['action'], rac) 160 | ac = ac_next 161 | 162 | unittest.main() 163 | -------------------------------------------------------------------------------- /base_actors.py: -------------------------------------------------------------------------------- 1 | """Base actors on which residuals are learned.""" 2 | import numpy as np 3 | import torch 4 | from residual_shared_autonomy.imitation_learning import BCNet 5 | from residual_shared_autonomy.lunar_lander import lunar_lander_policy_fn 6 | from residual_shared_autonomy.drone_sim import drone_ppo_policy_fn 7 | from dl import Checkpointer 8 | import gin 9 | import os 10 | 11 | 12 | @gin.configurable 13 | class ZeroActor(object): 14 | """Output random actions.""" 15 | 16 | def __init__(self, env): 17 | """Init.""" 18 | self.action_space = env.action_space 19 | self.batch_size = env.num_envs 20 | 21 | def __call__(self, ob): 22 | """Act.""" 23 | return np.zeros([self.batch_size] + list(self.action_space.shape), 24 | dtype=np.float32) 25 | 26 | 27 | @gin.configurable 28 | class RandomActor(object): 29 | """Output random actions.""" 30 | 31 | def __init__(self, env): 32 | """Init.""" 33 | self.action_space = env.action_space 34 | self.batch_size = env.num_envs 35 | 36 | def __call__(self, ob): 37 | """Act.""" 38 | return np.asarray([self.action_space.sample() 39 | for _ in range(self.batch_size)]) 40 | 41 | 42 | @gin.configurable 43 | class PolicyActor(object): 44 | """policy actor""" 45 | 46 | def __init__(self, pi, device): 47 | self.pi = pi 48 | self.device = device 49 | 50 | def __call__(self, ob): 51 | """Act.""" 52 | if isinstance(ob, np.ndarray): 53 | ob = torch.from_numpy(ob).to(self.device) 54 | return self.pi(ob).action.cpu().numpy() 55 | 56 | 57 | @gin.configurable 58 | class LunarLanderActor(object): 59 | """Lunar Lander actor.""" 60 | 61 | def __init__(self, env, logdir, device): 62 | self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts')) 63 | if not torch.cuda.is_available(): 64 | device = 'cpu' 65 | self.pi = lunar_lander_policy_fn(env) 66 | self.pi.to(device) 
67 | self.pi.load_state_dict(self.ckptr.load()['pi']) 68 | self.pi.eval() 69 | self.device = device 70 | 71 | def __call__(self, ob): 72 | """Act.""" 73 | with torch.no_grad(): 74 | if isinstance(ob, np.ndarray): 75 | ob = torch.from_numpy(ob).to(self.device) 76 | return self.pi(ob).action.cpu().numpy() 77 | 78 | 79 | @gin.configurable 80 | class DroneReacherActor(object): 81 | """DroneReacher actor.""" 82 | 83 | def __init__(self, env, logdir, device): 84 | self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts')) 85 | if not torch.cuda.is_available(): 86 | device = 'cpu' 87 | self.pi = drone_ppo_policy_fn(env) 88 | self.pi.to(device) 89 | self.pi.load_state_dict(self.ckptr.load()['pi']) 90 | self.pi.eval() 91 | self.device = device 92 | 93 | def __call__(self, ob): 94 | """Act.""" 95 | with torch.no_grad(): 96 | if isinstance(ob, np.ndarray): 97 | ob = torch.from_numpy(ob).to(self.device) 98 | return self.pi(ob).action.cpu().numpy() 99 | 100 | 101 | @gin.configurable 102 | class LaggyActor(object): 103 | """Laggy actor""" 104 | 105 | def __init__(self, env, actor_cls, repeat_prob): 106 | self.actor = actor_cls(env) 107 | self.repeat_prob = repeat_prob 108 | self.action = None 109 | 110 | def __call__(self, ob): 111 | """Act.""" 112 | if self.action is None or np.random.rand() > self.repeat_prob: 113 | self.action = self.actor(ob) 114 | return self.action 115 | 116 | 117 | @gin.configurable 118 | class NoisyActor(object): 119 | """Noisy actor""" 120 | 121 | def __init__(self, env, actor_cls, eps): 122 | self.actor = actor_cls(env) 123 | self.eps = eps 124 | 125 | def __call__(self, ob): 126 | """Act.""" 127 | action = self.actor(ob) 128 | if np.random.rand() < self.eps: 129 | action = np.random.uniform(-1, 1, action.shape).astype(action.dtype) 130 | return action 131 | 132 | 133 | @gin.configurable 134 | class BCActor(object): 135 | """Actor trained with Behavioral cloning""" 136 | 137 | def __init__(self, env, logdir, device): 138 | self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts')) 139 | if not torch.cuda.is_available(): 140 | device = 'cpu' 141 | self.device = device 142 | self.net = BCNet() 143 | self.net.to(device) 144 | self.net.load_state_dict(self.ckptr.load()['model']) 145 | 146 | def __call__(self, ob): 147 | """Act.""" 148 | with torch.no_grad(): 149 | dist = self.net(torch.from_numpy(ob).to(self.device)) 150 | ac = dist.sample().cpu().numpy() 151 | return np.clip(ac, -1., 1.) 
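

# How these pilot models are composed in the training configs (an equivalent,
# non-gin sketch; `env` is a vectorized environment and the eps value is only
# illustrative -- see ppo/configs for the values actually used):
#
#     expert_fn = lambda e: LunarLanderActor(
#         e, logdir="./models/lunar_lander_agent", device="cuda:0")
#     laggy_pilot = LaggyActor(env, actor_cls=expert_fn, repeat_prob=0.8)
#     noisy_pilot = NoisyActor(env, actor_cls=expert_fn, eps=0.1)
#     bc_pilots = BCMultiActor(env, logdir="./models/behavioral_cloning_agents_lunar",
#                              device="cuda:0", switch_prob=0.001)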
152 | 153 | 154 | @gin.configurable 155 | class BCMultiActor(object): 156 | """Use multiple actors trained with Behavioral cloning""" 157 | 158 | def __init__(self, env, logdir, device, switch_prob=0.001): 159 | dirs = [x for x in os.listdir(logdir) if os.path.isdir( 160 | os.path.join(logdir, x, 'ckpts'))] 161 | 162 | self.ckptrs = [Checkpointer(os.path.join(logdir, x, 'ckpts')) 163 | for x in dirs] 164 | if not torch.cuda.is_available(): 165 | device = 'cpu' 166 | self.device = device 167 | self.nets = [BCNet() for _ in dirs] 168 | for net, ckptr in zip(self.nets, self.ckptrs): 169 | net.to(device) 170 | net.load_state_dict(ckptr.load()['model']) 171 | self.current_actor = np.random.choice(self.nets) 172 | self.switch_prob = switch_prob 173 | 174 | def __call__(self, ob): 175 | """Act.""" 176 | if np.random.rand() < self.switch_prob: 177 | self.current_actor = np.random.choice(self.nets) 178 | with torch.no_grad(): 179 | if isinstance(ob, np.ndarray): 180 | ob = torch.from_numpy(ob) 181 | dist = self.current_actor(ob.to(self.device)) 182 | ac = dist.sample().cpu().numpy() 183 | return np.clip(ac, -1., 1.) 184 | 185 | 186 | if __name__ == '__main__': 187 | import gym 188 | import residual_shared_autonomy.envs 189 | from dl.rl import ensure_vec_env 190 | import time 191 | 192 | env = gym.make("LunarLanderRandomContinuous-v2") 193 | env = ensure_vec_env(env) 194 | 195 | # actor = OrnsteinUhlenbeckActor(env, 0.5) 196 | actor = RandomActor(env) 197 | 198 | for _ in range(10): 199 | ob = env.reset() 200 | env.render() 201 | done = False 202 | reward = 0.0 203 | time.sleep(1.) 204 | 205 | while not done: 206 | ob, r, done, _ = env.step(actor(ob)) 207 | env.render() 208 | reward += r 209 | print(reward) 210 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Residual Policy Learning for Shared Autonomy 2 | 3 | This repository accompanies the paper [Residual Policy Learning for Shared Autonomy](https://arxiv.org/abs/2004.05097). 4 | 5 | This repository contains code to train our shared autonomy agent with several human models, including agents trained to 6 | imitate humans and corrupted versions of optimal policies. We have provided all of the human models used in our paper 7 | as well as code for you to make your own human models. Trained shared autonomy agents are also provided. 8 | 9 | You can interact with the models through a joystick and the pygame interface. 10 | 11 | The code is based on my [deep reinforcement learning library](https://github.com/cbschaff/dl) which builds on PyTorch, 12 | OpenAI Gym, gin-config, and Tensorboard. 13 | 14 | ## Installation 15 | 16 | 1. Install [docker](https://docs.docker.com/get-docker/). 17 | 2. Install [x-docker](https://github.com/afdaniele/x-docker), a wrapper around docker for running GUI applications inside a container. 18 | 3. In the top level directory, build the docker image by running: 19 | ```./build_docker.sh``` 20 | 4. Launch the docker container by running: 21 | ```./launch_docker.sh``` 22 | This will start a container and mount the code at ```/root/pkgs/residual_shared_autonomy```. 23 | 24 | All commands, unless otherwise stated, should be run from inside a docker container. 25 | 26 | ## Repository structure 27 | The code is organized into the following directories: 28 | 29 | * ```ppo/```: The code for training and interacting with shared autonomy agents. 
30 | * ```lunar_lander/```: The code for LunarLander, LunarReacher, and training RL agents for these environments. 31 | * ```drone_sim/```: The code for DroneReacher and training RL agents for that environment. 32 | * ```imitation_learning/```: The code for collecting human data and training behavioral cloning agents. 33 | 34 | ## Playing the games with and without our assistants 35 | 36 | You can interact with the environments using a joystick controller (Xbox, etc.). 37 | 38 | NOTE: The controls are hard-coded for our particular joystick (Xbox One controller). 39 | You may need to edit ```lunar_lander/joystick_agent.py``` and ```drone_sim/joystick_agent.py``` 40 | to match your joystick. 41 | 42 | NOTE: Docker seems to only recognize a joystick if it is connected to your computer before you start the container. 43 | 44 | ### Lunar Lander and Lunar Reacher 45 | 46 | In these environments, you control the thrusters of a spaceship. The controls are as follows: 47 | * Tilting the RIGHT joystick VERTICALLY will control the main thruster. 48 | * Tilting the RIGHT joystick HORIZONTALLY will control the side thrusters. 49 | 50 | To play the game, run: 51 | 52 | ``` 53 | cd /root/pkgs/residual_shared_autonomy/lunar_lander 54 | python joystick_agent.py 55 | ``` 56 | 57 | To play the game with our pretrained assistant, run: 58 | 59 | ``` 60 | cd /root/pkgs/residual_shared_autonomy/ppo 61 | python joystick_control.py models/pretrained_lunar 62 | ``` 63 | To play LunarReacher, add a ```--reacher``` flag to the python commands above. 64 | 65 | ### Drone Reacher 66 | 67 | In this environment, you will control a simulated drone to reach a red floating target. You will control thrust and 68 | angular acceleration. The controls are as follows: 69 | * Tilt the LEFT joystick VERTICALLY to control thrust (the middle of the range outputs 1 g of thrust) 70 | * Tilt the LEFT joystick HORIZONTALLY to control yaw 71 | * Tilt the RIGHT joystick VERTICALLY to control pitch 72 | * Tilt the RIGHT joystick HORIZONTALLY to control roll 73 | 74 | To play the game, run: 75 | 76 | ``` 77 | cd /root/pkgs/residual_shared_autonomy/drone_sim 78 | python joystick_agent.py 79 | ``` 80 | 81 | To play the game with our pretrained assistant, run: 82 | 83 | ``` 84 | cd /root/pkgs/residual_shared_autonomy/ppo 85 | python joystick_control.py models/pretrained_drone --drone 86 | ``` 87 | 88 | ## Training shared autonomy agents 89 | 90 | Shared autonomy agents can be trained using the following commands: 91 | 92 | ``` 93 | cd /root/pkgs/residual_shared_autonomy/ppo 94 | ./train.sh logs configs/ppo_lunar.gin 95 | ``` 96 | 97 | Replace ```configs/ppo_lunar.gin``` with ```configs/ppo_drone.gin``` to train the agent for the DroneReacher environment. 98 | 99 | Experiments can be visualized by running: 100 | 101 | ```tensorboard --logdir /path/to/log/directory``` 102 | 103 | To change hyperparameters, edit the ".gin" files. 104 | 105 | To play with your assistants, follow the instructions above. 106 | 107 | To train assistants with laggy or noisy models, call ```train.sh``` with the 'laggy' and 'noisy' gin files. 108 | 109 | ## Creating human models 110 | 111 | We provide the human models used in our paper, but you can create your own as well. 112 | 113 | ### Imitating human behavior 114 | 115 | To create your own models, you will need to collect data and then train an agent to imitate that data. 
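For reference, the trainer loads the demonstration file with ```torch.load``` and expects a dictionary with ```obs``` and ```actions``` tensors of equal length (see ```DemonstrationData``` in ```imitation_learning/trainer.py```). As a rough sketch, a compatible file for Lunar Lander (9-dimensional observations, 2-dimensional actions; the sizes and filename below are only illustrative) would look like:

```
import torch

demos = {
    "obs": torch.zeros(1000, 9),      # recorded observations
    "actions": torch.zeros(1000, 2),  # recorded joystick actions
}
torch.save(demos, "lunar_demos.pt")
```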
116 | 117 | To collect data, run: 118 | 119 | ``` 120 | cd /root/pkgs/residual_shared_autonomy/imitation_learning 121 | python data_collection_lunar.py path/to/save/data num_episodes 122 | ``` 123 | 124 | Replace 'lunar' with 'drone' to collect data for the drone simulator. 125 | 126 | To train a model with the collected data, first edit ```bc_lunar.gin``` by changing 127 | BCTrainer.datafile to your data. Then run: 128 | 129 | ``` 130 | cd /root/pkgs/residual_shared_autonomy/imitation_learning 131 | ./train.sh /path/to/log/directory bc_lunar.gin 132 | ``` 133 | 134 | To view logs, run: ```tensorboard --logdir /path/to/logdir``` 135 | 136 | To observe your trained agent, run: ```python watch.py /path/to/logdir```. Add a ```--drone``` flag for the drone simulator. 137 | 138 | To use your models in training a shared autonomy agent, place all of their log directories in a common directory 139 | (or add them to the correct "behavioral_cloning_agents" directory in ```ppo/models```). Then change ```ppo/configs/ppo_lunar.gin``` or 140 | ```ppo/configs/ppo_drone.gin``` so that ```BCMultiActor.logdir = "path/to/your/directory"```. 141 | 142 | ### Modelling humans as corrupted experts 143 | 144 | We can also model humans as corrupted experts. In our paper, we explore laggy and noisy experts, where the expert is 145 | a policy trained using RL. We have included pretrained policies, but you can train your own using the following instructions. 146 | 147 | For Lunar Lander, run: 148 | 149 | ``` 150 | cd /root/pkgs/residual_shared_autonomy/lunar_lander 151 | ./train.sh 152 | ``` 153 | 154 | For Drone Reacher, run: 155 | 156 | ``` 157 | cd /root/pkgs/residual_shared_autonomy/drone_sim 158 | ./train.sh 159 | ``` 160 | 161 | To train shared autonomy agents with these models, edit the '.gin' files in ```ppo/configs``` to point to your model 162 | instead of the pretrained one by setting ```LunarLanderActor.logdir = "/path/to/your/log/directory"``` or 163 | ```DroneReacherActor.logdir = "/path/to/your/log/directory"```. 164 | -------------------------------------------------------------------------------- /drone_sim/sim.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Drone(object): 5 | """Drone state.""" 6 | 7 | def __init__(self, p_n, p_e, h, u, v, w, phi, theta, psi, p, q, r, 8 | m_center, r_center, m_rotor, l): 9 | """State and dynamics of the drone. 10 | 11 | Notation and equations from "Quadrotor Dynamics and Control Rev 0.1" 12 | by Randal W. Beard. 13 | https://scholarsarchive.byu.edu/cgi/viewcontent.cgi?article=2324&context=facpub 14 | 15 | The drone is assumed to consist of a spherical dense center of mass 16 | m_center and radius r_center and 4 point mass rotors with mass m_rotor 17 | located a distance of l from the center. 
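
        With this geometry, the moments of inertia computed below are those of
        a solid sphere plus point-mass rotors:
            jx = jy = (2/5) * m_center * r_center**2 + 2 * l**2 * m_rotor
            jz = (2/5) * m_center * r_center**2 + 4 * l**2 * m_rotor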
18 | 19 | position in inertial frame 20 | -------------------------- 21 | p_n: position north measured along i_i 22 | p_e: position east measured along j_i 23 | h: height measured along k_i 24 | 25 | velocity in body frame 26 | ---------------------- 27 | u: velocity measured along i_b 28 | v: velocity measured along j_b 29 | w: velocity measured along k_b 30 | 31 | orientation in vehicle, vehicle_1, and vehicle_2 frames 32 | ------------------------------------------------------- 33 | phi: roll angle in vehicle_2 frame 34 | theta: pitch angle in vehicle_1 frame 35 | psi: yaw angle in vehicle frame 36 | 37 | angular rate in body frame 38 | -------------------------- 39 | p: roll rate measured along i_b 40 | q: pitch rate measured along j_b 41 | r: yaw rate measured along k_b 42 | 43 | drone parameters 44 | ---------------- 45 | m_center: mass of the center of the drone 46 | r_center: radius of the center of the drone 47 | m_rotor: mass of each rotor 48 | l: distance of rotors from center 49 | """ 50 | self.pos_i = np.array([p_n, p_e, h]) 51 | self.vel_b = np.array([u, v, w]) 52 | self.ori_v = np.array([phi, theta, psi]) 53 | self.angvel_b = np.array([p, q, r]) 54 | 55 | self.m = m_center + 4 * m_rotor 56 | self.m_center = m_center 57 | self.m_rotor = m_rotor 58 | self.r_center = r_center 59 | self.lengh = l 60 | 61 | # compute moment of inertia 62 | self.jx = 2 * m_center * (r_center ** 2) / 5 + 2 * (l ** 2) * m_rotor 63 | self.jy = 2 * m_center * (r_center ** 2) / 5 + 2 * (l ** 2) * m_rotor 64 | self.jz = 2 * m_center * (r_center ** 2) / 5 + 4 * (l ** 2) * m_rotor 65 | 66 | # compute useful constants 67 | self._jc = np.array([(self.jy - self.jz) / self.jx, 68 | (self.jz - self.jx) / self.jy, 69 | (self.jx - self.jy) / self.jz]) 70 | self._jinv = np.array([1/self.jx, 1/self.jy, 1/self.jz]) 71 | self._g = 9.80665 72 | 73 | def state(self): 74 | """Get the state of the drone. 75 | 76 | State consists of position, velocity, orientation, and angular velocity. 77 | """ 78 | return np.concatenate([self.pos_i, self.vel_b, self.ori_v, 79 | self.angvel_b]) 80 | 81 | def step(self, torques, f, time_step): 82 | """Apply forces and torques for time_step amount of time. 83 | 84 | torques: a numpy array of roll, pitch, yaw torques in the body frame. 85 | f: a numpy array of the x, y, z forces in the body frame, 86 | excluding gravity. 87 | time_step: the step size for approximating new state. 
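
        The state is advanced with a single explicit Euler step: body-frame
        acceleration (Coriolis terms, gravity, and f / m), Euler-angle rates,
        and body-frame angular acceleration (gyroscopic terms plus torque
        divided by inertia) are each evaluated once and integrated as
        x <- x + time_step * dx/dt.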
88 | """ 89 | c = np.cos(self.ori_v) 90 | s = np.sin(self.ori_v) 91 | t = np.tan(self.ori_v) 92 | p, q, r = self.angvel_b 93 | u, v, w = self.ori_v 94 | g = self._g 95 | 96 | # compute velocity in inertial frame 97 | rot_b_i = np.array([ 98 | [c[1]*c[2], s[0]*s[1]*c[2] - c[0]*s[2], c[0]*s[1]*c[2] + s[0]*s[2]], 99 | [c[1]*s[2], s[0]*s[1]*s[2] + c[0]*c[2], c[0]*s[1]*s[2] - s[0]*c[2]], 100 | [-s[1], s[0]*c[1], c[0]*c[1]] 101 | ]) 102 | vel_i = rot_b_i.dot(self.vel_b) 103 | 104 | # compute acceleration in body frame 105 | acc_b = (np.array([r*v - q*w, p*w - r*u, q*u - p*v]) 106 | + np.array([-g*s[1], g*c[1]*s[0], g*c[1]*c[0]]) 107 | + f / self.m) 108 | 109 | # compute angular velocity in vehicle frames 110 | rot_b_v = np.array([ 111 | [1., s[0]*t[1], c[0]*t[1]], 112 | [0., c[0], -s[0]], 113 | [0., s[0]/c[1], c[0]/c[1]] 114 | ]) 115 | angvel_v = rot_b_v.dot(self.angvel_b) 116 | 117 | # compute angular acceleration in body frame 118 | angacc_b = self._jc * np.array([q*r, p*r, p*q]) + self._jinv * torques 119 | 120 | # update state 121 | self.pos_i += time_step * vel_i 122 | self.vel_b += time_step * acc_b 123 | self.ori_v += time_step * angvel_v 124 | self.angvel_b += time_step * angacc_b 125 | self.ori_v = ((self.ori_v + np.pi) % (2 * np.pi)) - np.pi 126 | return self.state() 127 | 128 | def rotate_i_to_b(self, v): 129 | c = np.cos(self.ori_v) 130 | s = np.sin(self.ori_v) 131 | rot_b_i = np.array([ 132 | [c[1]*c[2], s[0]*s[1]*c[2] - c[0]*s[2], c[0]*s[1]*c[2] + s[0]*s[2]], 133 | [c[1]*s[2], s[0]*s[1]*s[2] + c[0]*c[2], c[0]*s[1]*s[2] - s[0]*c[2]], 134 | [-s[1], s[0]*c[1], c[0]*c[1]] 135 | ]) 136 | return rot_b_i.T.dot(v) 137 | 138 | 139 | 140 | if __name__ == '__main__': 141 | # Test the simulator by applying random torques for roll and pitch 142 | # with a constant upward force, then turning off all motors for a bit, 143 | # then continuing. 144 | d = Drone(0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 10., 1., 1., 2.) 145 | print(d.state()) 146 | torques = np.zeros(3) 147 | forces = np.array([0., 0., -200.]) 148 | states = [] 149 | for _ in range(100): 150 | torques = np.random.uniform(-1., 1., (3,)) 151 | s = d.step(torques, forces, 1./50.) 152 | print(s) 153 | states.append(s) 154 | h_off = states[-1][2] 155 | while states[-1][2] - 1e-5 < h_off: 156 | s = d.step(0*torques, 0*forces, 1./50.) 157 | print(s) 158 | states.append(s) 159 | for _ in range(100): 160 | torques = np.random.uniform(-1., 1., (3,)) 161 | s = d.step(torques, forces, 1./50.) 162 | print(s) 163 | states.append(s) 164 | 165 | from matplotlib import pyplot as plt 166 | states = np.array(states) 167 | x = np.arange(states.shape[0]) / 50. 
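    # Time axis in seconds: the simulation above was stepped at 50 Hz.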
168 | names = ['x', 'y', 'h', 'u', 'v', 'w'] 169 | names2 = ['phi', 'theta', 'psi', 'p', 'q', 'r'] 170 | f, (ax1, ax2) = plt.subplots(1, 2) 171 | for i, n in enumerate(names): 172 | ax1.plot(x, states[:, i], label=n) 173 | ax1.legend() 174 | for i, n in enumerate(names2): 175 | ax2.plot(x, states[:, 6 + i], label=n) 176 | ax2.legend() 177 | plt.show() 178 | -------------------------------------------------------------------------------- /drone_sim/env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | from gym.utils import seeding, EzPickle 4 | from residual_shared_autonomy.drone_sim import Drone 5 | from residual_shared_autonomy.drone_sim import DroneRenderer, DroneFollower 6 | 7 | import numpy as np 8 | 9 | 10 | FPS = 50 11 | N_OBS_DIM = 12 12 | MAX_NUM_STEPS = 1500 13 | SIMS_PER_STEP = 1 14 | DRONE_RADIUS = 0.5 15 | DRONE_LENGTH = 2.0 16 | DRONE_MASS_CENTER = 6.0 17 | DRONE_MASS_ROTOR = 1.0 18 | CAMERA_OFFSET = (-20, 0, -10) 19 | CAMERA_ALPHA_POS = 0.5 20 | CAMERA_ALPHA_ROT = 0.9 21 | MAX_ACC = (20, 1.0) # upward acceleration, angular acceleration 22 | BOUNDS = (40, 40, -100.) 23 | TARGET_RADIUS = 3. 24 | ACTION_NOISE = 0.05 25 | 26 | 27 | class DroneEnv(gym.Env, EzPickle): 28 | """Reacher task in simple drone simulator.""" 29 | 30 | goal_in_state = True 31 | goal_in_reward = True 32 | 33 | metadata = { 34 | 'render.modes': ['human', 'rgb_array'], 35 | } 36 | 37 | def __init__(self): 38 | """Init.""" 39 | EzPickle.__init__(self) 40 | self.seed() 41 | 42 | self.renderer = DroneRenderer() 43 | self.camera = DroneFollower(CAMERA_OFFSET, CAMERA_ALPHA_POS, 44 | CAMERA_ALPHA_ROT) 45 | self.observation_space = spaces.Box(-1., +1, 46 | (N_OBS_DIM + 3 47 | * self.goal_in_state,), 48 | dtype=np.float32) 49 | self.action_space = spaces.Box(-1, +1, (4,), dtype=np.float32) 50 | 51 | self.reset() 52 | 53 | def _out_of_bounds(self, s): 54 | bx, by, bz = BOUNDS 55 | if np.abs(s[0]) >= bx: 56 | return True 57 | if np.abs(s[1]) >= by: 58 | return True 59 | if s[2] < bz or s[2] >= 0.0: 60 | return True 61 | return False 62 | 63 | def _at_target(self, s): 64 | d = np.sqrt(np.sum((s[:3] - self.target) ** 2)) 65 | return d < TARGET_RADIUS 66 | 67 | def _ac_to_force_torque(self, ac): 68 | range = MAX_ACC[0] - self.drone._g 69 | # center upward acceleration at -self.drone.g 70 | force = np.array([ 71 | 0.0, 72 | 0.0, 73 | -self.drone.m * (range * -ac[0] + self.drone._g) 74 | ]) 75 | torque = np.array([ 76 | MAX_ACC[1] * ac[1] * self.drone.jx, 77 | MAX_ACC[1] * ac[2] * self.drone.jy, 78 | MAX_ACC[1] * ac[3] * self.drone.jz, 79 | ]) 80 | return force, torque 81 | 82 | def _norm_state(self, state): 83 | bx, by, bz = BOUNDS 84 | state[0] /= bx 85 | state[1] /= by 86 | state[2] /= bz 87 | if self.goal_in_state: 88 | state[-3] /= bx 89 | state[-2] /= by 90 | state[-1] /= bz 91 | return state.astype(np.float32) 92 | 93 | def seed(self, seed=None): 94 | """Set random seed.""" 95 | self.np_random, seed = seeding.np_random(seed) 96 | return [seed] 97 | 98 | def reset(self): 99 | """Reset environment.""" 100 | self.drone = Drone(0., 0., -1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 101 | DRONE_MASS_CENTER, DRONE_RADIUS, DRONE_MASS_ROTOR, 102 | DRONE_LENGTH) 103 | self.camera.reset(self.drone.pos_i, self.drone.ori_v) 104 | self.t = 0 105 | self.shaping = None 106 | state = self.drone.state() 107 | self.target = np.random.uniform(-40., 0., (3,)) 108 | self.target[:2] += 20. 109 | self.target[2] /= 4. 
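        # After the rescaling above and the shift below, the target's x and y
        # are uniform in [-20, 20] m and its altitude (negative z is up) is
        # uniform between TARGET_RADIUS and 10 + TARGET_RADIUS above the
        # ground.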
110 | self.target[2] -= TARGET_RADIUS 111 | if self.goal_in_state: 112 | target_v = self.target - state[:3] 113 | # target_v = self.drone.rotate_i_to_b(target_v) 114 | state = np.concatenate([state, target_v]) 115 | return self._norm_state(state) 116 | 117 | def step(self, action): 118 | """Step environment.""" 119 | action = action + np.random.normal(scale=ACTION_NOISE, 120 | size=action.shape) 121 | action = np.clip(action, -1.0, 1.0) 122 | force, torque = self._ac_to_force_torque(action) 123 | for _ in range(SIMS_PER_STEP): 124 | state = self.drone.step(torque, force, 1.0/FPS) 125 | self.camera.step(self.drone.pos_i, self.drone.ori_v) 126 | self.t += 1 127 | reward = 0.0 128 | if self.goal_in_state: 129 | target_v = self.target - state[:3] 130 | # target_v = self.drone.rotate_i_to_b(target_v) 131 | state = np.concatenate([state, target_v]) 132 | dist_to_targ = np.sqrt(np.sum(state[-3:] ** 2)) 133 | 134 | # shape reward if goal is to reach target, penalize if goal 135 | # is to stabilize 136 | # shaping for low velocity and staying upright and distance to target. 137 | shaping = (-np.sqrt((state ** 2)[3:6].sum()) # linear velocities 138 | - np.sqrt((state ** 2)[9:12].sum()) # angular rates 139 | - np.sqrt((state ** 2)[6:8].sum())) # roll and pitch 140 | if self.goal_in_reward: 141 | reward -= 0.01 * dist_to_targ 142 | 143 | if self.shaping is None: 144 | self.shaping = shaping 145 | else: 146 | reward += shaping - self.shaping 147 | self.shaping = shaping 148 | 149 | done = False 150 | success = False 151 | crash = False 152 | timeout = False 153 | if self._out_of_bounds(state): 154 | done = True 155 | crash = True 156 | if self.goal_in_reward: 157 | reward -= 100. 158 | else: 159 | reward -= 10. 160 | elif self._at_target(state) and self.goal_in_reward: 161 | done = True 162 | success = True 163 | reward += 10. 
164 | elif self.t >= MAX_NUM_STEPS: 165 | done = True 166 | timeout = True 167 | 168 | info = {'success': success, 'crash': crash, 'timeout': timeout} 169 | return self._norm_state(state), reward, done, info 170 | 171 | def render(self, mode='human'): 172 | """Render.""" 173 | if self.goal_in_reward: 174 | self.renderer.set_target(self.target, TARGET_RADIUS) 175 | self.renderer.set_camera(self.camera.camera_pos, 176 | self.camera.camera_angle) 177 | self.renderer.draw_world(self.drone.pos_i, self.drone.ori_v, 178 | DRONE_RADIUS, DRONE_LENGTH, BOUNDS[:2]) 179 | if mode == 'rgb_array': 180 | return self.renderer.get_pixels() 181 | 182 | def close(self): 183 | import pygame 184 | pygame.display.quit() 185 | self.renderer.initialized = False 186 | 187 | def state_dict(self): 188 | """Get env state.""" 189 | return {'rng': self.np_random.get_state()} 190 | 191 | def load_state_dict(self, state_dict): 192 | """Load env state.""" 193 | self.np_random.set_state(state_dict['rng']) 194 | 195 | 196 | class DroneReacherBot(DroneEnv): 197 | """Reach the target.""" 198 | 199 | goal_in_state = True 200 | goal_in_reward = True 201 | 202 | 203 | class DroneReacherHuman(DroneEnv): 204 | """Reach the target.""" 205 | 206 | goal_in_state = False 207 | goal_in_reward = True 208 | 209 | 210 | class DroneStabilizer(DroneEnv): 211 | """Don't Crash.""" 212 | 213 | goal_in_state = True 214 | goal_in_reward = False 215 | 216 | 217 | if __name__ == '__main__': 218 | import time 219 | env = DroneStabilizer() 220 | done = False 221 | env.render() 222 | while not done: 223 | ac = env.action_space.sample() 224 | ac[0] = -1.0 # np.random.uniform(0.5, 1.0) 225 | s, r, done, _ = env.step(ac) 226 | env.render() 227 | time.sleep(0.02) 228 | -------------------------------------------------------------------------------- /drone_sim/rendering.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | from pygame.locals import * 3 | import OpenGL 4 | from OpenGL.GL import * 5 | from OpenGL.GLU import * 6 | import numpy as np 7 | from scipy.spatial.transform import Rotation as R 8 | import time 9 | import sys 10 | 11 | 12 | class DroneRenderer(object): 13 | def __init__(self): 14 | self.q = gluNewQuadric() 15 | self.camera_xyz = [0., 0., 0.] 16 | self.camera_rpy = [0., 0., 0.] 
17 | self.color1 = [0.7, 0.7, 0.7] 18 | self.color2 = [0.4, 0.4, 0.4] 19 | self.ground_color1 = [0.2, 1.0, 0.2] 20 | self.ground_color2 = [1.0, 0.2, 0.2] 21 | self.target = None 22 | self.initialized = False 23 | 24 | def _init_window(self): 25 | pygame.init() 26 | display = (800, 600) 27 | pygame.display.set_mode(display, DOUBLEBUF|OPENGL) 28 | glEnable(GL_DEPTH_TEST); 29 | glMatrixMode(GL_PROJECTION) 30 | gluPerspective(45.0, display[0]/display[1], 0.1, 100.0) 31 | glMatrixMode(GL_MODELVIEW) 32 | self.initialized = True 33 | 34 | def set_camera(self, xyz, rpy): 35 | self.camera_xyz = xyz 36 | self.camera_rpy = rpy 37 | 38 | def set_target(self, xyz, radius): 39 | self.target = (xyz, radius) 40 | 41 | def camera_transform(self): 42 | x, y, z = self.camera_xyz 43 | ar, ap, ay = self.camera_rpy 44 | glRotatef(-ar,1.0,0.0,0.0) 45 | glRotatef(-ap,0.0,1.0,0.0) 46 | glRotatef(-ay,0.,0.0,1.0) 47 | glTranslatef(-x, -y, -z) 48 | 49 | def draw_target(self, xyz, radius): 50 | glLoadIdentity() 51 | self.camera_transform() 52 | glTranslatef(*xyz) 53 | glColor3fv((1.0, 0., 0.)) 54 | gluSphere(self.q, radius, 32, 32) 55 | 56 | def _apply_rpy(self, rpy): 57 | r, p, y = rpy 58 | axes = np.eye(3) 59 | axes[:2] = R.from_rotvec(y * axes[2]).apply(axes[:2]) 60 | axes[:1] = R.from_rotvec(p * axes[1]).apply(axes[:1]) 61 | glRotatef(180. / np.pi * r, axes[0,0], axes[0,1], axes[0,2]) 62 | glRotatef(180. / np.pi * p, axes[1,0], axes[1,1], axes[1,2]) 63 | glRotatef(180. / np.pi * y, axes[2,0], axes[2,1], axes[2,2]) 64 | 65 | def draw_body(self, pos, orientation, radius): 66 | glLoadIdentity() 67 | self.camera_transform() 68 | glColor3fv(self.color1) 69 | glTranslatef(pos[0], pos[1], pos[2]) 70 | # self._apply_rpy(orientation) 71 | # glTranslatef(0., 0., -radius) 72 | # gluCylinder(self.q, radius, radius, 2*radius, 32, 32) 73 | gluSphere(self.q, radius, 32, 32) 74 | 75 | def draw_arm(self, arm, pos, orientation, length, rod_radius, 76 | rotor_radius): 77 | glLoadIdentity() 78 | if arm == 0: 79 | glColor3fv(self.color2) 80 | else: 81 | glColor3fv(self.color1) 82 | self.camera_transform() 83 | glTranslatef(pos[0], pos[1], pos[2]) 84 | self._apply_rpy(orientation) 85 | glRotatef(90*arm,0.0,0.0,1.0) 86 | glRotatef(90,0.0,1.0,0.0) 87 | gluCylinder(self.q, rod_radius, rod_radius, length, 32, 32) 88 | 89 | glLoadIdentity() 90 | self.camera_transform() 91 | glTranslatef(pos[0], pos[1], pos[2]) 92 | self._apply_rpy(orientation) 93 | glRotatef(90*arm,0.0,0.0,1.0) 94 | glTranslatef(length, 0., -3*rod_radius) 95 | gluCylinder(self.q, rotor_radius, 0., 4*rod_radius, 32, 32) 96 | 97 | def draw_ground(self, boundx=100, boundy=100): 98 | glLoadIdentity() 99 | self.camera_transform() 100 | glBegin(GL_LINES) 101 | for x in range(-100, 100): 102 | if np.abs(x) >= boundx: 103 | glColor3fv(self.ground_color2) 104 | glVertex3fv((x, -100., 0.)) 105 | glVertex3fv((x, 100., 0.)) 106 | else: 107 | glColor3fv(self.ground_color2) 108 | glVertex3fv((x, -100., 0.)) 109 | glVertex3fv((x, -boundx, 0.)) 110 | glVertex3fv((x, boundx, 0.)) 111 | glVertex3fv((x, 100., 0.)) 112 | glColor3fv(self.ground_color1) 113 | glVertex3fv((x, -boundx, 0.)) 114 | glVertex3fv((x, boundx, 0.)) 115 | for y in range(-100, 100): 116 | if np.abs(y) >= boundy: 117 | glColor3fv(self.ground_color2) 118 | glVertex3fv((-100., y, 0.)) 119 | glVertex3fv((100., y, 0.)) 120 | else: 121 | glColor3fv(self.ground_color2) 122 | glVertex3fv((-100., y, 0.)) 123 | glVertex3fv((-boundx, y, 0.)) 124 | glVertex3fv((boundx, y, 0.)) 125 | glVertex3fv((100., y, 0.)) 126 | 
glColor3fv(self.ground_color1) 127 | glVertex3fv((-boundx, y, 0.)) 128 | glVertex3fv((boundx, y, 0.)) 129 | glEnd() 130 | 131 | def draw_world(self, pos, orientation, radius, length, bounds=(100, 100)): 132 | if not self.initialized: 133 | self._init_window() 134 | glClear(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT) 135 | 136 | rotor_radius = (length - radius) / 2 137 | rod_radius = radius / 10. 138 | 139 | self.draw_ground(*bounds) 140 | if self.target: 141 | self.draw_target(*self.target) 142 | for i in range(4): 143 | self.draw_arm(i, pos, orientation, length, rod_radius, rotor_radius) 144 | self.draw_body(pos, orientation, radius) 145 | pygame.display.flip() 146 | 147 | def get_pixels(self): 148 | data = glReadPixels(0, 0, 800, 600, GL_RGB, GL_UNSIGNED_BYTE) 149 | return np.frombuffer(data, dtype=np.uint8).reshape(600, 800, 3)[::-1] 150 | 151 | 152 | class DroneFollower(object): 153 | def __init__(self, offset, alpha_pos, alpha_rot): 154 | self.offset = np.array(offset) 155 | self.alpha_pos = alpha_pos 156 | self.alpha_rot = alpha_rot 157 | 158 | self.camera_pos = np.zeros(3) 159 | self.camera_angle = np.zeros(3) 160 | 161 | def _get_angle_diff(self, a1, a2): 162 | d = a2 - a1 163 | for i, dd in enumerate(d): 164 | if dd < -180.: 165 | d[i] += 360. 166 | elif dd >= 180.: 167 | d[i] -= 360. 168 | return d 169 | 170 | def _get_rotation_from_vecs(self, v1, v2): 171 | v1 = v1 / np.sqrt(np.sum(v1 ** 2)) 172 | v2 = v2 / np.sqrt(np.sum(v2 ** 2)) 173 | rot_axis = np.cross(v1, v2) 174 | if np.allclose(rot_axis, 0.): 175 | return R.from_euler('xyz', [0., 0., 0.]) 176 | rot_axis /= np.sqrt((rot_axis ** 2).sum()) 177 | angle = np.arccos(v1.dot(v2)) 178 | return R.from_rotvec(angle * rot_axis) 179 | 180 | def _look_at_drone(self, camera_pos, drone_pos, drone_ori): 181 | dir = drone_pos - camera_pos 182 | dir /= np.sqrt(np.sum(dir ** 2)) 183 | camera_start = np.array([0., 0., -1.]) 184 | rot = self._get_rotation_from_vecs(camera_start, dir) 185 | 186 | # orient camera up 187 | camera_up = rot.apply(np.array([0., 1., 0.])) 188 | z_body = self._apply_rpy(np.array([0., 0., -1.]), drone_ori) 189 | 190 | # project z_body onto dir 191 | proj = z_body.dot(dir) * dir 192 | 193 | # find vector orthogonal to dir in the plane of (z_body, dir) 194 | dir_ortho = z_body - proj 195 | dir_ortho /= np.sqrt(np.sum(dir_ortho ** 2)) 196 | 197 | # find angle between dir_ortho and rotated camera_up 198 | # and the direction to rotate it 199 | angle = np.arccos(camera_up.dot(dir_ortho)) 200 | sign = np.sign(np.cross(dir_ortho, camera_up).dot(dir)) 201 | 202 | # combine rotations for looking at drone and orienting camera 203 | rot_up = rot.from_rotvec(- sign * angle * dir) 204 | rot_tot = rot_up * rot 205 | return rot_tot.as_euler('xyz', degrees=True) 206 | 207 | def _apply_rpy(self, v, rpy): 208 | xyz = np.eye(3) 209 | yaw = R.from_rotvec(rpy[2] * xyz[2]) 210 | xyz[:2] = yaw.apply(xyz[:2]) 211 | pitch = R.from_rotvec(rpy[1] * xyz[1]) 212 | xyz[:1] = pitch.apply(xyz[:1]) 213 | roll = R.from_rotvec(rpy[0] * xyz[0]) 214 | v = yaw.apply(v) 215 | v = pitch.apply(v) 216 | v = roll.apply(v) 217 | return v 218 | 219 | def _set_camera_pose(self, drone_pos, drone_ori, alpha_pos, alpha_ori): 220 | drone_pos = np.array(drone_pos) 221 | drone_ori = np.array(drone_ori) 222 | cp = drone_pos + self._apply_rpy(self.offset, drone_ori) 223 | self.camera_pos = alpha_pos * self.camera_pos + (1.0 - alpha_pos) * cp 224 | ca = self._look_at_drone(self.camera_pos, drone_pos, drone_ori) 225 | angle_diff = self._get_angle_diff(self.camera_angle, ca) 
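# Low-pass filter the orientation: step toward the look-at angles by
# (1 - alpha_ori) of the shortest signed angular difference, then the loop
# below wraps each component back into [-180, 180). camera_pos was blended
# the same way with alpha_pos a few lines above.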
226 | self.camera_angle += (1.0 - alpha_ori) * angle_diff 227 | for i, c in enumerate(self.camera_angle): 228 | if c < -180.: 229 | self.camera_angle[i] += 360. 230 | elif c >= 180.: 231 | self.camera_angle[i] -= 360. 232 | 233 | def reset(self, drone_pos, drone_ori): 234 | self._set_camera_pose(drone_pos, drone_ori, 0.0, 0.0) 235 | return self.camera_pos, self.camera_angle 236 | 237 | def step(self, drone_pos, drone_ori): 238 | self._set_camera_pose(drone_pos, drone_ori, self.alpha_pos, 239 | self.alpha_rot) 240 | return self.camera_pos, self.camera_angle 241 | 242 | 243 | def main(): 244 | from residual_shared_autonomy.drone_sim import Drone 245 | np.random.seed(0) 246 | 247 | d = Drone(0., 0., 0., 0., 0., 0., 0., 248 | 0., 0., 0., 0., 0., 6, 0.5, 1, 2) 249 | torques = np.zeros(3) 250 | forces = np.array([0., 0., -200.]) 251 | states = [] 252 | r = DroneRenderer() 253 | f = DroneFollower([-10., 0., -10.], 0.9, 0.9) 254 | r.set_target([50., 0., 0.], 1.) 255 | cp, ca = f.reset(d.pos_i, d.ori_v) 256 | 257 | for _ in range(100): 258 | r.set_camera(cp, ca) 259 | r.draw_world(d.pos_i, d.ori_v, d.r_center, d.lengh) 260 | torques = 50 * np.random.uniform(-1., 1., (3,)) 261 | s = d.step(torques, forces, 1./50.) 262 | states.append(s) 263 | cp, ca = f.step(d.pos_i, d.ori_v) 264 | time.sleep(0.02) 265 | h_off = states[-1][2] 266 | while states[-1][2] - 1e-5 < h_off: 267 | r.set_camera(cp, ca) 268 | r.draw_world(d.pos_i, d.ori_v, d.r_center, d.lengh) 269 | s = d.step(0*torques, 0*forces, 1./50.) 270 | states.append(s) 271 | cp, ca = f.step(d.pos_i, d.ori_v) 272 | time.sleep(0.02) 273 | for _ in range(100): 274 | r.set_camera(cp, ca) 275 | r.draw_world(d.pos_i, d.ori_v, d.r_center, d.lengh) 276 | torques = 50 * np.random.uniform(-1., 1., (3,)) 277 | s = d.step(torques, forces, 1./50.) 278 | states.append(s) 279 | cp, ca = f.step(d.pos_i, d.ori_v) 280 | time.sleep(0.02) 281 | 282 | import matplotlib 283 | matplotlib.use('TkAgg') 284 | from matplotlib import pyplot as plt 285 | states = np.array(states) 286 | x = np.arange(states.shape[0]) / 50. 287 | names = ['x', 'y', 'h', 'u', 'v', 'w'] 288 | names2 = ['phi', 'theta', 'psi', 'p', 'q', 'r'] 289 | f, (ax1, ax2) = plt.subplots(1, 2) 290 | for i, n in enumerate(names): 291 | ax1.plot(x, states[:, i], label=n) 292 | ax1.legend() 293 | for i, n in enumerate(names2): 294 | ax2.plot(x, states[:, 6 + i], label=n) 295 | ax2.legend() 296 | plt.show() 297 | 298 | 299 | if __name__ == '__main__': 300 | main() 301 | -------------------------------------------------------------------------------- /ppo/constrained_residual_ppo.py: -------------------------------------------------------------------------------- 1 | """Residual Policy Learning with PPO. 
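
A base controller supplied via base_actor_cls acts through ResidualWrapper,
and PPO trains a residual correction on top of its actions. A
softplus-parameterized Lagrange multiplier (see ConstrainedResidualPPO.loss)
trades off keeping the residual small against keeping the average episode
reward above reward_threshold.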
2 | 3 | https://arxiv.org/abs/1812.06298 4 | """ 5 | from dl.rl.envs import VecEpisodeLogger 6 | from dl.rl.data_collection import RolloutDataManager 7 | from dl.rl.modules import Policy 8 | from dl.rl.util import rl_evaluate, rl_record, misc 9 | from dl import logger, Algorithm, Checkpointer 10 | from residual_shared_autonomy import ResidualWrapper 11 | import gin 12 | import numpy as np 13 | import os 14 | import torch 15 | import torch.nn as nn 16 | import torch.nn.functional as F 17 | import time 18 | 19 | 20 | class ResidualPPOActor(object): 21 | """Actor.""" 22 | 23 | def __init__(self, pi, policy_training_start): 24 | """Init.""" 25 | self.pi = pi 26 | self.policy_training_start = policy_training_start 27 | self.t = 0 28 | 29 | def __call__(self, ob, state_in=None, mask=None): 30 | """Produce decision from model.""" 31 | if self.t < self.policy_training_start: 32 | outs = self.pi(ob, state_in, mask, deterministic=True) 33 | if not torch.allclose(outs.action, torch.zeros_like(outs.action)): 34 | raise ValueError("Pi should be initialized to output zero " 35 | "actions so that an acurate value function " 36 | "can be learned for the base policy.") 37 | else: 38 | outs = self.pi(ob, state_in, mask) 39 | residual_norm = torch.mean(torch.sum(torch.abs(outs.action), dim=1)) 40 | logger.add_scalar('actor/l1_residual_norm', residual_norm, self.t, 41 | time.time()) 42 | self.t += outs.action.shape[0] 43 | data = {'action': outs.action, 44 | 'value': outs.value, 45 | 'logp': outs.dist.log_prob(outs.action)} 46 | if outs.state_out: 47 | data['state'] = outs.state_out 48 | return data 49 | 50 | def state_dict(self): 51 | """State dict.""" 52 | return {'t': self.t} 53 | 54 | def load_state_dict(self, state_dict): 55 | """Load state dict.""" 56 | self.t = state_dict['t'] 57 | 58 | 59 | @gin.configurable(blacklist=['logdir']) 60 | class ConstrainedResidualPPO(Algorithm): 61 | """Constrained Residual PPO algorithm.""" 62 | 63 | def __init__(self, 64 | logdir, 65 | env_fn, 66 | policy_fn, 67 | nenv=1, 68 | optimizer=torch.optim.Adam, 69 | lambda_lr=1e-4, 70 | lambda_init=100., 71 | lr_decay_rate=1./3.16227766017, 72 | lr_decay_freq=20000000, 73 | l2_reg=True, 74 | reward_threshold=-0.05, 75 | rollout_length=128, 76 | batch_size=32, 77 | gamma=0.99, 78 | lambda_=0.95, 79 | norm_advantages=False, 80 | epochs_per_rollout=10, 81 | max_grad_norm=None, 82 | ent_coef=0.01, 83 | vf_coef=0.5, 84 | clip_param=0.2, 85 | base_actor_cls=None, 86 | policy_training_start=10000, 87 | lambda_training_start=100000, 88 | eval_num_episodes=1, 89 | record_num_episodes=1, 90 | wrapper_fn=None, # additional wrappers for the env 91 | gpu=True): 92 | """Init.""" 93 | self.logdir = logdir 94 | self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts')) 95 | self.env_fn = env_fn 96 | self.nenv = nenv 97 | self.eval_num_episodes = eval_num_episodes 98 | self.record_num_episodes = record_num_episodes 99 | self.epochs_per_rollout = epochs_per_rollout 100 | self.max_grad_norm = max_grad_norm 101 | self.ent_coef = ent_coef 102 | self.vf_coef = vf_coef 103 | self.clip_param = clip_param 104 | self.base_actor_cls = base_actor_cls 105 | self.policy_training_start = policy_training_start 106 | self.lambda_training_start = lambda_training_start 107 | self.lambda_lr = lambda_lr 108 | self.lr_decay_rate = lr_decay_rate 109 | self.lr_decay_freq = lr_decay_freq 110 | self.l2_reg = l2_reg 111 | self.reward_threshold = reward_threshold 112 | self.device = torch.device('cuda:0' if gpu and torch.cuda.is_available() 113 | else 'cpu') 114 | 115 | 
self.env = VecEpisodeLogger(env_fn(nenv=nenv)) 116 | self.env = ResidualWrapper(self.env, self.base_actor_cls(self.env)) 117 | if wrapper_fn: 118 | self.env = wrapper_fn(self.env) 119 | 120 | self.pi = policy_fn(self.env).to(self.device) 121 | self.opt = optimizer(self.pi.parameters()) 122 | self.pi_lr = self.opt.param_groups[0]['lr'] 123 | if lambda_init < 10: 124 | lambda_init = np.log(np.exp(lambda_init) - 1) 125 | self.log_lambda_ = nn.Parameter( 126 | torch.Tensor([lambda_init]).to(self.device)) 127 | self.opt_l = optimizer([self.log_lambda_], lr=lambda_lr) 128 | self._actor = ResidualPPOActor(self.pi, policy_training_start) 129 | self.data_manager = RolloutDataManager( 130 | self.env, 131 | self._actor, 132 | self.device, 133 | rollout_length=rollout_length, 134 | batch_size=batch_size, 135 | gamma=gamma, 136 | lambda_=lambda_, 137 | norm_advantages=norm_advantages) 138 | 139 | self.mse = nn.MSELoss(reduction='none') 140 | self.huber = nn.SmoothL1Loss() 141 | 142 | self.t = 0 143 | 144 | def loss(self, batch): 145 | """Compute loss.""" 146 | if self.data_manager.recurrent: 147 | outs = self.pi(batch['obs'], batch['state'], batch['mask']) 148 | else: 149 | outs = self.pi(batch['obs']) 150 | loss = {} 151 | 152 | # compute policy loss 153 | if self.t < self.policy_training_start: 154 | pi_loss = torch.Tensor([0.0]).to(self.device) 155 | else: 156 | logp = outs.dist.log_prob(batch['action']) 157 | assert logp.shape == batch['logp'].shape 158 | ratio = torch.exp(logp - batch['logp']) 159 | assert ratio.shape == batch['atarg'].shape 160 | ploss1 = ratio * batch['atarg'] 161 | ploss2 = torch.clamp(ratio, 1.0-self.clip_param, 162 | 1.0+self.clip_param) * batch['atarg'] 163 | pi_loss = -torch.min(ploss1, ploss2).mean() 164 | loss['pi'] = pi_loss 165 | 166 | # compute value loss 167 | vloss1 = 0.5 * self.mse(outs.value, batch['vtarg']) 168 | vpred_clipped = batch['vpred'] + ( 169 | outs.value - batch['vpred']).clamp(-self.clip_param, 170 | self.clip_param) 171 | vloss2 = 0.5 * self.mse(vpred_clipped, batch['vtarg']) 172 | vf_loss = torch.max(vloss1, vloss2).mean() 173 | loss['value'] = vf_loss 174 | 175 | # compute entropy loss 176 | if self.t < self.policy_training_start: 177 | ent_loss = torch.Tensor([0.0]).to(self.device) 178 | else: 179 | ent_loss = outs.dist.entropy().mean() 180 | loss['entropy'] = ent_loss 181 | 182 | # compute residual regularizer 183 | if self.t < self.policy_training_start: 184 | reg_loss = torch.Tensor([0.0]).to(self.device) 185 | else: 186 | if self.l2_reg: 187 | reg_loss = outs.dist.rsample().pow(2).sum(dim=-1).mean() 188 | else: # huber loss 189 | ac_norm = torch.norm(outs.dist.rsample(), dim=-1) 190 | reg_loss = self.huber(ac_norm, torch.zeros_like(ac_norm)) 191 | loss['reg'] = reg_loss 192 | 193 | ############################### 194 | # Constrained loss added here. 195 | ############################### 196 | 197 | # soft plus on lambda to constrain it to be positive. 
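# lambda_ is a Lagrange multiplier. Below, the policy loss is mixed as
# (reg_loss + lambda_ * pi_loss) / (1 + lambda_), so a large lambda_ favors
# the PPO surrogate (task reward) while a small lambda_ favors keeping the
# residual actions small. loss['lambda'] is the dual term: minimizing it with
# opt_l in step() raises lambda_ whenever the batch's average per-episode
# reward falls below reward_threshold and lowers it once the constraint is
# satisfied. neps counts episode boundaries from the rollout masks.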
198 | lambda_ = F.softplus(self.log_lambda_) 199 | logger.add_scalar('alg/lambda', lambda_, self.t, time.time()) 200 | logger.add_scalar('alg/lambda_', self.log_lambda_, self.t, time.time()) 201 | if self.t < max(self.policy_training_start, self.lambda_training_start): 202 | loss['lambda'] = torch.Tensor([0.0]).to(self.device) 203 | else: 204 | neps = (1.0 - batch['mask']).sum() 205 | loss['lambda'] = (lambda_ * (batch['reward'].sum() 206 | - self.reward_threshold * neps) 207 | / batch['reward'].size()[0]) 208 | if self.t >= self.policy_training_start: 209 | loss['pi'] = (reg_loss + lambda_ * loss['pi']) / (1. + lambda_) 210 | loss['total'] = (loss['pi'] + self.vf_coef * vf_loss 211 | - self.ent_coef * ent_loss) 212 | return loss 213 | 214 | def step(self): 215 | """Compute rollout, loss, and update model.""" 216 | self.pi.train() 217 | # adjust learning rate 218 | lr_frac = self.lr_decay_rate ** (self.t // self.lr_decay_freq) 219 | for g in self.opt.param_groups: 220 | g['lr'] = self.pi_lr * lr_frac 221 | for g in self.opt_l.param_groups: 222 | g['lr'] = self.lambda_lr * lr_frac 223 | 224 | self.data_manager.rollout() 225 | self.t += self.data_manager.rollout_length * self.nenv 226 | losses = {} 227 | for _ in range(self.epochs_per_rollout): 228 | for batch in self.data_manager.sampler(): 229 | loss = self.loss(batch) 230 | if losses == {}: 231 | losses = {k: [] for k in loss} 232 | for k, v in loss.items(): 233 | losses[k].append(v.detach().cpu().numpy()) 234 | if self.t >= max(self.policy_training_start, 235 | self.lambda_training_start): 236 | self.opt_l.zero_grad() 237 | loss['lambda'].backward(retain_graph=True) 238 | self.opt_l.step() 239 | self.opt.zero_grad() 240 | loss['total'].backward() 241 | if self.max_grad_norm: 242 | nn.utils.clip_grad_norm_(self.pi.parameters(), 243 | self.max_grad_norm) 244 | self.opt.step() 245 | for k, v in losses.items(): 246 | logger.add_scalar(f'loss/{k}', np.mean(v), self.t, time.time()) 247 | logger.add_scalar('alg/lr_pi', self.opt.param_groups[0]['lr'], self.t, 248 | time.time()) 249 | logger.add_scalar('alg/lr_lambda', self.opt_l.param_groups[0]['lr'], 250 | self.t, time.time()) 251 | return self.t 252 | 253 | def evaluate(self): 254 | """Evaluate model.""" 255 | self.pi.eval() 256 | misc.set_env_to_eval_mode(self.env) 257 | 258 | # Eval policy 259 | os.makedirs(os.path.join(self.logdir, 'eval'), exist_ok=True) 260 | outfile = os.path.join(self.logdir, 'eval', 261 | self.ckptr.format.format(self.t) + '.json') 262 | stats = rl_evaluate(self.env, self.pi, self.eval_num_episodes, 263 | outfile, self.device) 264 | logger.add_scalar('eval/mean_episode_reward', stats['mean_reward'], 265 | self.t, time.time()) 266 | logger.add_scalar('eval/mean_episode_length', stats['mean_length'], 267 | self.t, time.time()) 268 | 269 | # Record policy 270 | # os.makedirs(os.path.join(self.logdir, 'video'), exist_ok=True) 271 | # outfile = os.path.join(self.logdir, 'video', 272 | # self.ckptr.format.format(self.t) + '.mp4') 273 | # rl_record(self.env, self.pi, self.record_num_episodes, outfile, 274 | # self.device) 275 | 276 | self.pi.train() 277 | misc.set_env_to_train_mode(self.env) 278 | 279 | def save(self): 280 | """State dict.""" 281 | state_dict = { 282 | 'pi': self.pi.state_dict(), 283 | 'opt': self.opt.state_dict(), 284 | 'lambda_': self.log_lambda_, 285 | 'opt_l': self.opt_l.state_dict(), 286 | 'env': misc.env_state_dict(self.env), 287 | '_actor': self._actor.state_dict(), 288 | 't': self.t 289 | } 290 | self.ckptr.save(state_dict, self.t) 291 | 292 | def 
load(self, t=None): 293 | """Load state dict.""" 294 | state_dict = self.ckptr.load(t) 295 | if state_dict is None: 296 | self.t = 0 297 | return self.t 298 | self.pi.load_state_dict(state_dict['pi']) 299 | self.opt.load_state_dict(state_dict['opt']) 300 | self.opt_l.load_state_dict(state_dict['opt_l']) 301 | self.log_lambda_.data.copy_(state_dict['lambda_']) 302 | misc.env_load_state_dict(self.env, state_dict['env']) 303 | self._actor.load_state_dict(state_dict['_actor']) 304 | self.t = state_dict['t'] 305 | return self.t 306 | 307 | def close(self): 308 | """Close environment.""" 309 | try: 310 | self.env.close() 311 | except Exception: 312 | pass 313 | 314 | 315 | if __name__ == '__main__': 316 | import unittest 317 | import shutil 318 | from dl.rl.modules import ActorCriticBase 319 | from dl.rl import make_env 320 | from dl import train 321 | import residual_shared_autonomy.envs 322 | from dl.modules import DiagGaussian 323 | import torch.nn.functional as F 324 | from functools import partial 325 | 326 | class FeedForwardActorCriticBase(ActorCriticBase): 327 | """Policy and Value networks.""" 328 | 329 | def build(self): 330 | """Build.""" 331 | inshape = (self.observation_space.spaces[0].shape[0] 332 | + self.observation_space.spaces[1].shape[0]) 333 | self.fc1 = nn.Linear(inshape, 32) 334 | self.fc2 = nn.Linear(32, 32) 335 | self.fc3 = nn.Linear(32, 32) 336 | self.dist = DiagGaussian(32, self.action_space.shape[0]) 337 | for p in self.dist.fc_mean.parameters(): 338 | nn.init.constant_(p, 0.) 339 | 340 | self.vf_fc1 = nn.Linear(inshape, 32) 341 | self.vf_fc2 = nn.Linear(32, 32) 342 | self.vf_fc3 = nn.Linear(32, 32) 343 | self.vf_out = nn.Linear(32, 1) 344 | 345 | def forward(self, x): 346 | """Forward.""" 347 | ob = torch.cat(x, axis=1) 348 | net = ob 349 | net = F.relu(self.fc1(net)) 350 | net = F.relu(self.fc2(net)) 351 | net = F.relu(self.fc3(net)) 352 | pi = self.dist(net) 353 | 354 | net = ob 355 | net = F.relu(self.vf_fc1(net)) 356 | net = F.relu(self.vf_fc2(net)) 357 | net = F.relu(self.vf_fc3(net)) 358 | vf = self.vf_out(net) 359 | 360 | return pi, vf 361 | 362 | class RandomActor(object): 363 | """Output random actions.""" 364 | 365 | def __init__(self, env): 366 | """Init.""" 367 | self.action_space = env.action_space 368 | 369 | def __call__(self, ob): 370 | """Act.""" 371 | batch_size = ob.shape[0] 372 | return np.asarray([self.action_space.sample() 373 | for _ in range(batch_size)]) 374 | 375 | class TestResidualPPO(unittest.TestCase): 376 | """Test case.""" 377 | 378 | def test_feed_forward_ppo(self): 379 | """Test feed forward ppo.""" 380 | def env_fn(nenv): 381 | return make_env(env_id="LunarLanderRandomConstrained-v2", 382 | nenv=nenv) 383 | 384 | def policy_fn(env): 385 | return Policy(FeedForwardActorCriticBase(env.observation_space, 386 | env.action_space)) 387 | 388 | ppo = partial(ConstrainedResidualPPO, 389 | env_fn=env_fn, 390 | nenv=32, 391 | policy_fn=policy_fn, 392 | base_actor_cls=RandomActor, 393 | policy_training_start=500) 394 | train('test', ppo, maxt=1000, eval=True, eval_period=1000) 395 | alg = ppo('test') 396 | alg.load() 397 | shutil.rmtree('test') 398 | 399 | unittest.main() 400 | -------------------------------------------------------------------------------- /lunar_lander/env_reacher.py: -------------------------------------------------------------------------------- 1 | import sys, math 2 | import numpy as np 3 | 4 | import Box2D 5 | from Box2D.b2 import (edgeShape, circleShape, fixtureDef, polygonShape, revoluteJointDef, contactListener) 6 | 7 
| import gym 8 | from gym import spaces 9 | from gym.utils import seeding, EzPickle 10 | 11 | # Rocket trajectory optimization is a classic topic in Optimal Control. 12 | # 13 | # According to Pontryagin's maximum principle it's optimal to fire engine full throttle or 14 | # turn it off. That's the reason this environment is OK to have discreet actions (engine on or off). 15 | # 16 | # Landing pad is always at coordinates (0,0). Coordinates are the first two numbers in state vector. 17 | # Reward for moving from the top of the screen to landing pad and zero speed is about 100..140 points. 18 | # If lander moves away from landing pad it loses reward back. Episode finishes if the lander crashes or 19 | # comes to rest, receiving additional -100 or +100 points. Each leg ground contact is +10. Firing main 20 | # engine is -0.3 points each frame. Firing side engine is -0.03 points each frame. Solved is 200 points. 21 | # 22 | # Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land 23 | # on its first attempt. Please see source code for details. 24 | # 25 | # To see heuristic landing, run: 26 | # 27 | # python gym/envs/box2d/lunar_lander.py 28 | # 29 | # To play yourself, run: 30 | # 31 | # python examples/agents/keyboard_agent.py LunarLander-v2 32 | # 33 | # Created by Oleg Klimov. Licensed on the same terms as the rest of OpenAI Gym. 34 | 35 | FPS = 50 36 | SCALE = 30.0 # affects how fast-paced the game is, forces should be adjusted as well 37 | 38 | N_OBS_DIM = 9 39 | MAX_NUM_STEPS = 1000 40 | 41 | MAIN_ENGINE_POWER = 13.0 42 | SIDE_ENGINE_POWER = 0.6 43 | 44 | INITIAL_RANDOM = 1000.0 # Set 1500 to make game harder 45 | 46 | LANDER_POLY =[ 47 | (-14,+17), (-17,0), (-17,-10), 48 | (+17,-10), (+17,0), (+14,+17) 49 | ] 50 | LEG_AWAY = 20 51 | LEG_DOWN = 18 52 | LEG_W, LEG_H = 2, 8 53 | LEG_SPRING_TORQUE = 40 54 | TARGET_R = 25 55 | 56 | SIDE_ENGINE_HEIGHT = 14.0 57 | SIDE_ENGINE_AWAY = 12.0 58 | 59 | VIEWPORT_W = 600 60 | VIEWPORT_H = 400 61 | 62 | class ContactDetector(contactListener): 63 | def __init__(self, env): 64 | contactListener.__init__(self) 65 | self.env = env 66 | def BeginContact(self, contact): 67 | if self.env.lander==contact.fixtureA.body or self.env.lander==contact.fixtureB.body: 68 | self.env.game_over = True 69 | for i in range(2): 70 | if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]: 71 | self.env.legs[i].ground_contact = True 72 | def EndContact(self, contact): 73 | for i in range(2): 74 | if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]: 75 | self.env.legs[i].ground_contact = False 76 | 77 | class LunarLander(gym.Env, EzPickle): 78 | metadata = { 79 | 'render.modes': ['human', 'rgb_array'], 80 | 'video.frames_per_second' : FPS 81 | } 82 | 83 | continuous = False 84 | terrain_in_ob = True 85 | human_test = False 86 | 87 | def __init__(self): 88 | EzPickle.__init__(self) 89 | self.seed() 90 | self.viewer = None 91 | 92 | self.world = Box2D.b2World() 93 | self.moon = None 94 | self.lander = None 95 | self.particles = [] 96 | 97 | self.prev_reward = None 98 | self.curr_step = None 99 | 100 | # useful range is -1 .. +1, but spikes can be higher 101 | ob_dim = N_OBS_DIM + 22 if self.terrain_in_ob else N_OBS_DIM 102 | self.observation_space = spaces.Box(-np.inf, np.inf, shape=(ob_dim,), dtype=np.float32) 103 | 104 | if self.continuous: 105 | # Action is two floats [main engine, left-right engines]. 106 | # Main engine: -1..0 off, 0..+1 throttle from 50% to 100% power. 
Engine can't work with less than 50% power. 107 | # Left-right: -1.0..-0.5 fire left engine, +0.5..+1.0 fire right engine, -0.5..0.5 off 108 | self.action_space = spaces.Box(-1, +1, (2,), dtype=np.float32) 109 | else: 110 | # Nop, fire left engine, main engine, right engine 111 | self.action_space = spaces.Discrete(4) 112 | 113 | self.reset() 114 | 115 | def seed(self, seed=None): 116 | self.np_random, seed = seeding.np_random(seed) 117 | return [seed] 118 | 119 | def _destroy(self): 120 | if not self.moon: return 121 | self.world.contactListener = None 122 | self._clean_particles(True) 123 | self.world.DestroyBody(self.moon) 124 | self.moon = None 125 | self.world.DestroyBody(self.lander) 126 | self.lander = None 127 | self.world.DestroyBody(self.legs[0]) 128 | self.world.DestroyBody(self.legs[1]) 129 | 130 | def reset(self): 131 | self._destroy() 132 | self.world.contactListener_keepref = ContactDetector(self) 133 | self.world.contactListener = self.world.contactListener_keepref 134 | self.game_over = False 135 | self.prev_shaping = None 136 | self.curr_step = 0 137 | 138 | W = VIEWPORT_W/SCALE 139 | H = VIEWPORT_H/SCALE 140 | 141 | # terrain 142 | CHUNKS = 11 143 | height = self.np_random.uniform(0, H/2, size=(CHUNKS+1,) ) 144 | chunk_x = [W/(CHUNKS-1)*i for i in range(CHUNKS)] 145 | # randomize helipad x coord 146 | helipad_chunk = np.random.choice(range(1, CHUNKS-1)) 147 | 148 | self.helipad_x1 = chunk_x[helipad_chunk-1] 149 | self.helipad_x2 = chunk_x[helipad_chunk+1] 150 | self.helipad_y = H/4 151 | height[helipad_chunk-2] = self.helipad_y 152 | height[helipad_chunk-1] = self.helipad_y 153 | height[helipad_chunk+0] = self.helipad_y 154 | height[helipad_chunk+1] = self.helipad_y 155 | height[helipad_chunk+2] = self.helipad_y 156 | smooth_y = [0.33*(height[i-1] + height[i+0] + height[i+1]) for i in range(CHUNKS)] 157 | self.terrain_x = chunk_x 158 | self.terrain_y = smooth_y 159 | 160 | # target 161 | radius = TARGET_R / SCALE 162 | # exclude middle half of the screen 163 | self.target_cx = self.np_random.uniform(radius, W/2. - radius) 164 | if self.target_cx >= W / 4.: 165 | self.target_cx += W / 2. 
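# After the shift above, target_cx lies in [radius, W/4) or [3W/4, W - radius),
# keeping the target away from the screen centre where the lander spawns
# (x = VIEWPORT_W/SCALE/2). The next line places target_cy in the upper half
# of the screen.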
166 | self.target_cy = self.np_random.uniform(H/2 + radius, H - radius) 167 | 168 | self.moon = self.world.CreateStaticBody( shapes=edgeShape(vertices=[(0, 0), (W, 0)]) ) 169 | self.sky_polys = [] 170 | for i in range(CHUNKS-1): 171 | p1 = (chunk_x[i], smooth_y[i]) 172 | p2 = (chunk_x[i+1], smooth_y[i+1]) 173 | self.moon.CreateEdgeFixture( 174 | vertices=[p1,p2], 175 | density=0, 176 | friction=0.1) 177 | self.sky_polys.append( [p1, p2, (p2[0],H), (p1[0],H)] ) 178 | 179 | self.moon.color1 = (0.0,0.0,0.0) 180 | self.moon.color2 = (0.0,0.0,0.0) 181 | 182 | initial_y = VIEWPORT_H/SCALE 183 | self.lander = self.world.CreateDynamicBody( 184 | position = (VIEWPORT_W/SCALE/2, initial_y), 185 | angle=0.0, 186 | fixtures = fixtureDef( 187 | shape=polygonShape(vertices=[ (x/SCALE,y/SCALE) for x,y in LANDER_POLY ]), 188 | density=5.0, 189 | friction=0.1, 190 | categoryBits=0x0010, 191 | maskBits=0x001, # collide only with ground 192 | restitution=0.0) # 0.99 bouncy 193 | ) 194 | self.lander.color1 = (0.5,0.4,0.9) 195 | self.lander.color2 = (0.3,0.3,0.5) 196 | self.lander.ApplyForceToCenter( ( 197 | self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM), 198 | self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM) 199 | ), True) 200 | 201 | self.legs = [] 202 | for i in [-1,+1]: 203 | leg = self.world.CreateDynamicBody( 204 | position = (VIEWPORT_W/SCALE/2 - i*LEG_AWAY/SCALE, initial_y), 205 | angle = (i*0.05), 206 | fixtures = fixtureDef( 207 | shape=polygonShape(box=(LEG_W/SCALE, LEG_H/SCALE)), 208 | density=1.0, 209 | restitution=0.0, 210 | categoryBits=0x0020, 211 | maskBits=0x001) 212 | ) 213 | leg.ground_contact = False 214 | leg.color1 = (0.5,0.4,0.9) 215 | leg.color2 = (0.3,0.3,0.5) 216 | rjd = revoluteJointDef( 217 | bodyA=self.lander, 218 | bodyB=leg, 219 | localAnchorA=(0, 0), 220 | localAnchorB=(i*LEG_AWAY/SCALE, LEG_DOWN/SCALE), 221 | enableMotor=True, 222 | enableLimit=True, 223 | maxMotorTorque=LEG_SPRING_TORQUE, 224 | motorSpeed=+0.3*i # low enough not to jump back into the sky 225 | ) 226 | if i==-1: 227 | rjd.lowerAngle = +0.9 - 0.5 # Yes, the most esoteric numbers here, angles legs have freedom to travel within 228 | rjd.upperAngle = +0.9 229 | else: 230 | rjd.lowerAngle = -0.9 231 | rjd.upperAngle = -0.9 + 0.5 232 | leg.joint = self.world.CreateJoint(rjd) 233 | self.legs.append(leg) 234 | 235 | self.drawlist = [self.lander] + self.legs 236 | 237 | return self.step(np.array([0,0]) if self.continuous else 0)[0] 238 | 239 | def _create_particle(self, mass, x, y, ttl): 240 | p = self.world.CreateDynamicBody( 241 | position = (x,y), 242 | angle=0.0, 243 | fixtures = fixtureDef( 244 | shape=circleShape(radius=2/SCALE, pos=(0,0)), 245 | density=mass, 246 | friction=0.1, 247 | categoryBits=0x0100, 248 | maskBits=0x001, # collide only with ground 249 | restitution=0.3) 250 | ) 251 | p.ttl = ttl 252 | self.particles.append(p) 253 | self._clean_particles(False) 254 | return p 255 | 256 | def _clean_particles(self, all): 257 | while self.particles and (all or self.particles[0].ttl<0): 258 | self.world.DestroyBody(self.particles.pop(0)) 259 | 260 | def step(self, action): 261 | if self.continuous: 262 | action = np.clip(action, -1, +1).astype(np.float32) 263 | else: 264 | assert self.action_space.contains(action), "%r (%s) invalid " % (action, type(action)) 265 | 266 | # Engines 267 | tip = (math.sin(self.lander.angle), math.cos(self.lander.angle)) 268 | side = (-tip[1], tip[0]); 269 | dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)] 270 | 271 | m_power = 0.0 272 | 
if (self.continuous and action[0] > 0.0) or (not self.continuous and action==2): 273 | # Main engine 274 | if self.continuous: 275 | m_power = (np.clip(action[0], 0.0,1.0) + 1.0)*0.5 # 0.5..1.0 276 | assert m_power>=0.5 and m_power <= 1.0 277 | else: 278 | m_power = 1.0 279 | ox = tip[0]*(4/SCALE + 2*dispersion[0]) + side[0]*dispersion[1] # 4 is move a bit downwards, +-2 for randomness 280 | oy = -tip[1]*(4/SCALE + 2*dispersion[0]) - side[1]*dispersion[1] 281 | impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy) 282 | p = self._create_particle(3.5, impulse_pos[0], impulse_pos[1], m_power) # particles are just a decoration, 3.5 is here to make particle speed adequate 283 | p.ApplyLinearImpulse( ( ox*MAIN_ENGINE_POWER*m_power, oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) 284 | self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER*m_power, -oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) 285 | 286 | s_power = 0.0 287 | if (self.continuous and np.abs(action[1]) > 0.5) or (not self.continuous and action in [1,3]): 288 | # Orientation engines 289 | if self.continuous: 290 | direction = np.sign(action[1]) 291 | s_power = np.clip(np.abs(action[1]), 0.5,1.0) 292 | assert s_power>=0.5 and s_power <= 1.0 293 | else: 294 | direction = action-2 295 | s_power = 1.0 296 | ox = tip[0]*dispersion[0] + side[0]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) 297 | oy = -tip[1]*dispersion[0] - side[1]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) 298 | impulse_pos = (self.lander.position[0] + ox - tip[0]*17/SCALE, self.lander.position[1] + oy + tip[1]*SIDE_ENGINE_HEIGHT/SCALE) 299 | p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power) 300 | p.ApplyLinearImpulse( ( ox*SIDE_ENGINE_POWER*s_power, oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) 301 | self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER*s_power, -oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) 302 | 303 | self.world.Step(1.0/FPS, 6*30, 2*30) 304 | 305 | pos = self.lander.position 306 | vel = self.lander.linearVelocity 307 | helipad_x = (self.helipad_x1 + self.helipad_x2) / 2 308 | state = [ 309 | (pos.x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2), 310 | (pos.y - (self.helipad_y+LEG_DOWN/SCALE)) / (VIEWPORT_H/SCALE/2), 311 | vel.x*(VIEWPORT_W/SCALE/2)/FPS, 312 | vel.y*(VIEWPORT_H/SCALE/2)/FPS, 313 | self.lander.angle, 314 | 20.0*self.lander.angularVelocity/FPS, 315 | 1.0 if self.legs[0].ground_contact else 0.0, 316 | 1.0 if self.legs[1].ground_contact else 0.0, 317 | (helipad_x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2) 318 | ] 319 | if self.terrain_in_ob: 320 | for x, y in zip(self.terrain_x, self.terrain_y): 321 | state.append((x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2)) 322 | state.append((y - (self.helipad_y+LEG_DOWN/SCALE)) / (VIEWPORT_H/SCALE/2)) 323 | assert len(state)==N_OBS_DIM + 22 * self.terrain_in_ob 324 | self.curr_step += 1 325 | 326 | reward = 0 327 | dx = pos.x - self.target_cx 328 | dy = pos.y - self.target_cy 329 | shaping = \ 330 | - 100*np.sqrt(dx*dx + dy*dy)/2 \ 331 | - 100*np.sqrt(state[2]*state[2] + state[3]*state[3]) \ 332 | - 100*abs(state[4]) + 10*state[6] + 10*state[7] # And ten points for legs contact, the idea is if you 333 | # lose contact again after landing, you get negative reward 334 | 335 | if self.prev_shaping is not None: 336 | reward = shaping - self.prev_shaping 337 | self.prev_shaping = shaping 338 | 339 | reward -= m_power*0.30 # less fuel spent is better, about -30 for heurisic landing 340 | reward -= s_power*0.03 341 | 342 | oob = 
abs(state[0]) >= 1.0 343 | timeout = self.curr_step >= MAX_NUM_STEPS 344 | at_target = np.sqrt(dx*dx + dy*dy) < TARGET_R/SCALE 345 | 346 | done = self.game_over or oob or at_target or timeout 347 | success = False 348 | crash = False 349 | if done: 350 | if self.game_over or oob: 351 | reward -= 100 352 | crash = True 353 | self.lander.color1 = (1.0,0.0,0.0) 354 | elif at_target: 355 | success = True 356 | reward += 100 357 | self.lander.color1 = (0.0,1.0,0.0) 358 | if self.human_test: 359 | self.render() 360 | import time 361 | time.sleep(1.0) 362 | 363 | info = {'success': success, 'crash': crash} 364 | return np.array(state, dtype=np.float32), reward, done, info 365 | 366 | def render(self, mode='human'): 367 | from gym.envs.classic_control import rendering 368 | if self.viewer is None: 369 | self.viewer = rendering.Viewer(VIEWPORT_W, VIEWPORT_H) 370 | self.viewer.set_bounds(0, VIEWPORT_W/SCALE, 0, VIEWPORT_H/SCALE) 371 | 372 | for obj in self.particles: 373 | obj.ttl -= 0.15 374 | obj.color1 = (max(0.2,0.2+obj.ttl), max(0.2,0.5*obj.ttl), max(0.2,0.5*obj.ttl)) 375 | obj.color2 = (max(0.2,0.2+obj.ttl), max(0.2,0.5*obj.ttl), max(0.2,0.5*obj.ttl)) 376 | 377 | self._clean_particles(False) 378 | 379 | for p in self.sky_polys: 380 | self.viewer.draw_polygon(p, color=(0,0,0)) 381 | 382 | for obj in self.particles + self.drawlist: 383 | for f in obj.fixtures: 384 | trans = f.body.transform 385 | if type(f.shape) is circleShape: 386 | t = rendering.Transform(translation=trans*f.shape.pos) 387 | self.viewer.draw_circle(f.shape.radius, 20, color=obj.color1).add_attr(t) 388 | self.viewer.draw_circle(f.shape.radius, 20, color=obj.color2, filled=False, linewidth=2).add_attr(t) 389 | else: 390 | path = [trans*v for v in f.shape.vertices] 391 | self.viewer.draw_polygon(path, color=obj.color1) 392 | path.append(path[0]) 393 | self.viewer.draw_polyline(path, color=obj.color2, linewidth=2) 394 | 395 | # draw target 396 | t = rendering.Transform(translation=np.array([self.target_cx, self.target_cy])) 397 | self.viewer.draw_circle(TARGET_R / SCALE, 20, color=[255, 0, 0], filled=False, linewidth=5).add_attr(t) 398 | 399 | return self.viewer.render(return_rgb_array = mode=='rgb_array') 400 | 401 | def close(self): 402 | if self.viewer is not None: 403 | self.viewer.close() 404 | self.viewer = None 405 | 406 | def state_dict(self): 407 | return {'rng': self.np_random.get_state()} 408 | 409 | def load_state_dict(self, state_dict): 410 | self.np_random.set_state(state_dict['rng']) 411 | 412 | 413 | class LunarLanderReacher(LunarLander): 414 | continuous = True 415 | 416 | def heuristic(env, s): 417 | # Heuristic for: 418 | # 1. Testing. 419 | # 2. Demonstration rollout. 
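# angle_targ is a saturated proportional target built from horizontal position
# and velocity; angle_todo and hover_todo below are PD-style corrections. In
# the continuous case they are scaled by 20 and clipped to [-1, 1], so the
# main engine (which needs action[0] > 0) only fires when hover_todo exceeds
# 0.05, matching the 0.05 thresholds used for the discrete actions.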
420 | angle_targ = s[0]*0.5 + s[2]*1.0 # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed) 421 | if angle_targ > 0.4: angle_targ = 0.4 # more than 0.4 radians (22 degrees) is bad 422 | if angle_targ < -0.4: angle_targ = -0.4 423 | hover_targ = 0.55*np.abs(s[0]) # target y should be proporional to horizontal offset 424 | 425 | # PID controller: s[4] angle, s[5] angularSpeed 426 | angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0 427 | #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo)) 428 | 429 | # PID controller: s[1] vertical coordinate s[3] vertical speed 430 | hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5 431 | #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo)) 432 | 433 | if s[6] or s[7]: # legs have contact 434 | angle_todo = 0 435 | hover_todo = -(s[3])*0.5 # override to reduce fall speed, that's all we need after contact 436 | 437 | if env.continuous: 438 | a = np.array( [hover_todo*20 - 1, -angle_todo*20] ) 439 | a = np.clip(a, -1, +1) 440 | else: 441 | a = 0 442 | if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2 443 | elif angle_todo < -0.05: a = 3 444 | elif angle_todo > +0.05: a = 1 445 | return a 446 | 447 | def demo_heuristic_lander(env, seed=None, render=False): 448 | env.seed(seed) 449 | total_reward = 0 450 | steps = 0 451 | s = env.reset() 452 | while True: 453 | a = heuristic(env, s) 454 | s, r, done, info = env.step(a) 455 | total_reward += r 456 | 457 | if render: 458 | still_open = env.render() 459 | if still_open == False: break 460 | 461 | if steps % 20 == 0 or done: 462 | print("observations:", " ".join(["{:+0.2f}".format(x) for x in s])) 463 | print("step {} total_reward {:+0.2f}".format(steps, total_reward)) 464 | steps += 1 465 | if done: break 466 | return total_reward 467 | 468 | 469 | if __name__ == '__main__': 470 | demo_heuristic_lander(LunarLander(), render=True) 471 | -------------------------------------------------------------------------------- /lunar_lander/env_lander.py: -------------------------------------------------------------------------------- 1 | import sys, math 2 | import numpy as np 3 | 4 | import Box2D 5 | from Box2D.b2 import (edgeShape, circleShape, fixtureDef, polygonShape, revoluteJointDef, contactListener) 6 | 7 | import gym 8 | from gym import spaces 9 | from gym.utils import seeding, EzPickle 10 | 11 | # Rocket trajectory optimization is a classic topic in Optimal Control. 12 | # 13 | # According to Pontryagin's maximum principle it's optimal to fire engine full throttle or 14 | # turn it off. That's the reason this environment is OK to have discreet actions (engine on or off). 15 | # 16 | # Landing pad is always at coordinates (0,0). Coordinates are the first two numbers in state vector. 17 | # Reward for moving from the top of the screen to landing pad and zero speed is about 100..140 points. 18 | # If lander moves away from landing pad it loses reward back. Episode finishes if the lander crashes or 19 | # comes to rest, receiving additional -100 or +100 points. Each leg ground contact is +10. Firing main 20 | # engine is -0.3 points each frame. Firing side engine is -0.03 points each frame. Solved is 200 points. 21 | # 22 | # Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land 23 | # on its first attempt. Please see source code for details. 
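# NOTE: the comment block above comes from the upstream Gym LunarLander that
# this file adapts. This variant additionally defines a goal_seeking class
# flag, explicit fuel-cost constants (FUEL_COST_MAIN, FUEL_COST_SIDE), and a
# landed-detection window (GROUND_SLACK, LANDED_SLACK) used in step() to end
# an episode once the lander has settled on the pad.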
24 | # 25 | # To see heuristic landing, run: 26 | # 27 | # python gym/envs/box2d/lunar_lander.py 28 | # 29 | # To play yourself, run: 30 | # 31 | # python examples/agents/keyboard_agent.py LunarLander-v2 32 | # 33 | # Created by Oleg Klimov. Licensed on the same terms as the rest of OpenAI Gym. 34 | 35 | FPS = 50 36 | SCALE = 30.0 # affects how fast-paced the game is, forces should be adjusted as well 37 | 38 | N_OBS_DIM = 9 39 | MAX_NUM_STEPS = 1000 40 | GROUND_SLACK = 0.01 41 | LANDED_SLACK = 50 42 | 43 | MAIN_ENGINE_POWER = 13.0 44 | SIDE_ENGINE_POWER = 0.6 45 | 46 | INITIAL_RANDOM = 1000.0 # Set 1500 to make game harder 47 | 48 | LANDER_POLY =[ 49 | (-14,+17), (-17,0), (-17,-10), 50 | (+17,-10), (+17,0), (+14,+17) 51 | ] 52 | LEG_AWAY = 20 53 | LEG_DOWN = 18 54 | LEG_W, LEG_H = 2, 8 55 | LEG_SPRING_TORQUE = 40 56 | 57 | SIDE_ENGINE_HEIGHT = 14.0 58 | SIDE_ENGINE_AWAY = 12.0 59 | 60 | VIEWPORT_W = 600 61 | VIEWPORT_H = 400 62 | 63 | FUEL_COST_MAIN = 0.3 64 | FUEL_COST_SIDE = 0.03 65 | 66 | class ContactDetector(contactListener): 67 | def __init__(self, env): 68 | contactListener.__init__(self) 69 | self.env = env 70 | def BeginContact(self, contact): 71 | if self.env.lander==contact.fixtureA.body or self.env.lander==contact.fixtureB.body: 72 | self.env.game_over = True 73 | for i in range(2): 74 | if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]: 75 | self.env.legs[i].ground_contact = True 76 | def EndContact(self, contact): 77 | for i in range(2): 78 | if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]: 79 | self.env.legs[i].ground_contact = False 80 | 81 | class LunarLander(gym.Env, EzPickle): 82 | metadata = { 83 | 'render.modes': ['human', 'rgb_array'], 84 | 'video.frames_per_second' : FPS 85 | } 86 | 87 | continuous = False 88 | goal_seeking = True 89 | terrain_in_ob = True 90 | human_test = False 91 | 92 | def __init__(self): 93 | EzPickle.__init__(self) 94 | self.seed() 95 | self.viewer = None 96 | 97 | self.world = Box2D.b2World() 98 | self.moon = None 99 | self.lander = None 100 | self.particles = [] 101 | 102 | self.prev_reward = None 103 | self.curr_step = None 104 | self.at_site = None 105 | self.grounded = None 106 | 107 | # useful range is -1 .. +1, but spikes can be higher 108 | ob_dim = N_OBS_DIM + 22 if self.terrain_in_ob else N_OBS_DIM 109 | self.observation_space = spaces.Box(-np.inf, np.inf, shape=(ob_dim,), dtype=np.float32) 110 | 111 | if self.continuous: 112 | # Action is two floats [main engine, left-right engines]. 113 | # Main engine: -1..0 off, 0..+1 throttle from 50% to 100% power. Engine can't work with less than 50% power. 
114 | # Left-right: -1.0..-0.5 fire left engine, +0.5..+1.0 fire right engine, -0.5..0.5 off 115 | self.action_space = spaces.Box(-1, +1, (2,), dtype=np.float32) 116 | else: 117 | # Nop, fire left engine, main engine, right engine 118 | self.action_space = spaces.Discrete(4) 119 | 120 | self.reset() 121 | 122 | def seed(self, seed=None): 123 | self.np_random, seed = seeding.np_random(seed) 124 | return [seed] 125 | 126 | def _destroy(self): 127 | if not self.moon: return 128 | self.world.contactListener = None 129 | self._clean_particles(True) 130 | self.world.DestroyBody(self.moon) 131 | self.moon = None 132 | self.world.DestroyBody(self.lander) 133 | self.lander = None 134 | self.world.DestroyBody(self.legs[0]) 135 | self.world.DestroyBody(self.legs[1]) 136 | 137 | def reset(self): 138 | self._destroy() 139 | self.world.contactListener_keepref = ContactDetector(self) 140 | self.world.contactListener = self.world.contactListener_keepref 141 | self.game_over = False 142 | self.prev_shaping = None 143 | self.curr_step = 0 144 | self.at_site = [False for _ in range(LANDED_SLACK)] 145 | self.grounded = [False for _ in range(LANDED_SLACK)] 146 | 147 | W = VIEWPORT_W/SCALE 148 | H = VIEWPORT_H/SCALE 149 | 150 | # terrain 151 | CHUNKS = 11 152 | height = self.np_random.uniform(0, H/2, size=(CHUNKS+1,) ) 153 | chunk_x = [W/(CHUNKS-1)*i for i in range(CHUNKS)] 154 | # randomize helipad x coord 155 | helipad_chunk = np.random.choice(range(1, CHUNKS-1)) 156 | 157 | self.helipad_x1 = chunk_x[helipad_chunk-1] 158 | self.helipad_x2 = chunk_x[helipad_chunk+1] 159 | self.helipad_y = H/4 160 | height[helipad_chunk-2] = self.helipad_y 161 | height[helipad_chunk-1] = self.helipad_y 162 | height[helipad_chunk+0] = self.helipad_y 163 | height[helipad_chunk+1] = self.helipad_y 164 | height[helipad_chunk+2] = self.helipad_y 165 | smooth_y = [0.33*(height[i-1] + height[i+0] + height[i+1]) for i in range(CHUNKS)] 166 | self.terrain_x = chunk_x 167 | self.terrain_y = smooth_y 168 | 169 | self.moon = self.world.CreateStaticBody( shapes=edgeShape(vertices=[(0, 0), (W, 0)]) ) 170 | self.sky_polys = [] 171 | for i in range(CHUNKS-1): 172 | p1 = (chunk_x[i], smooth_y[i]) 173 | p2 = (chunk_x[i+1], smooth_y[i+1]) 174 | self.moon.CreateEdgeFixture( 175 | vertices=[p1,p2], 176 | density=0, 177 | friction=0.1) 178 | self.sky_polys.append( [p1, p2, (p2[0],H), (p1[0],H)] ) 179 | 180 | self.moon.color1 = (0.0,0.0,0.0) 181 | self.moon.color2 = (0.0,0.0,0.0) 182 | 183 | initial_y = VIEWPORT_H/SCALE 184 | self.lander = self.world.CreateDynamicBody( 185 | position = (VIEWPORT_W/SCALE/2, initial_y), 186 | angle=0.0, 187 | fixtures = fixtureDef( 188 | shape=polygonShape(vertices=[ (x/SCALE,y/SCALE) for x,y in LANDER_POLY ]), 189 | density=5.0, 190 | friction=0.1, 191 | categoryBits=0x0010, 192 | maskBits=0x001, # collide only with ground 193 | restitution=0.0) # 0.99 bouncy 194 | ) 195 | self.lander.color1 = (0.5,0.4,0.9) 196 | self.lander.color2 = (0.3,0.3,0.5) 197 | self.lander.ApplyForceToCenter( ( 198 | self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM), 199 | self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM) 200 | ), True) 201 | 202 | self.legs = [] 203 | for i in [-1,+1]: 204 | leg = self.world.CreateDynamicBody( 205 | position = (VIEWPORT_W/SCALE/2 - i*LEG_AWAY/SCALE, initial_y), 206 | angle = (i*0.05), 207 | fixtures = fixtureDef( 208 | shape=polygonShape(box=(LEG_W/SCALE, LEG_H/SCALE)), 209 | density=1.0, 210 | restitution=0.0, 211 | categoryBits=0x0020, 212 | maskBits=0x001) 213 | ) 214 | leg.ground_contact = 
False 215 | leg.color1 = (0.5,0.4,0.9) 216 | leg.color2 = (0.3,0.3,0.5) 217 | rjd = revoluteJointDef( 218 | bodyA=self.lander, 219 | bodyB=leg, 220 | localAnchorA=(0, 0), 221 | localAnchorB=(i*LEG_AWAY/SCALE, LEG_DOWN/SCALE), 222 | enableMotor=True, 223 | enableLimit=True, 224 | maxMotorTorque=LEG_SPRING_TORQUE, 225 | motorSpeed=+0.3*i # low enough not to jump back into the sky 226 | ) 227 | if i==-1: 228 | rjd.lowerAngle = +0.9 - 0.5 # Yes, the most esoteric numbers here, angles legs have freedom to travel within 229 | rjd.upperAngle = +0.9 230 | else: 231 | rjd.lowerAngle = -0.9 232 | rjd.upperAngle = -0.9 + 0.5 233 | leg.joint = self.world.CreateJoint(rjd) 234 | self.legs.append(leg) 235 | 236 | self.drawlist = [self.lander] + self.legs 237 | 238 | return self.step(np.array([0,0]) if self.continuous else 0)[0] 239 | 240 | def _create_particle(self, mass, x, y, ttl): 241 | p = self.world.CreateDynamicBody( 242 | position = (x,y), 243 | angle=0.0, 244 | fixtures = fixtureDef( 245 | shape=circleShape(radius=2/SCALE, pos=(0,0)), 246 | density=mass, 247 | friction=0.1, 248 | categoryBits=0x0100, 249 | maskBits=0x001, # collide only with ground 250 | restitution=0.3) 251 | ) 252 | p.ttl = ttl 253 | self.particles.append(p) 254 | self._clean_particles(False) 255 | return p 256 | 257 | def _clean_particles(self, all): 258 | while self.particles and (all or self.particles[0].ttl<0): 259 | self.world.DestroyBody(self.particles.pop(0)) 260 | 261 | def step(self, action): 262 | if self.continuous: 263 | action = np.clip(action, -1, +1).astype(np.float32) 264 | else: 265 | assert self.action_space.contains(action), "%r (%s) invalid " % (action, type(action)) 266 | 267 | # Engines 268 | tip = (math.sin(self.lander.angle), math.cos(self.lander.angle)) 269 | side = (-tip[1], tip[0]); 270 | dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)] 271 | 272 | m_power = 0.0 273 | if (self.continuous and action[0] > 0.0) or (not self.continuous and action==2): 274 | # Main engine 275 | if self.continuous: 276 | m_power = (np.clip(action[0], 0.0,1.0) + 1.0)*0.5 # 0.5..1.0 277 | assert m_power>=0.5 and m_power <= 1.0 278 | else: 279 | m_power = 1.0 280 | ox = tip[0]*(4/SCALE + 2*dispersion[0]) + side[0]*dispersion[1] # 4 is move a bit downwards, +-2 for randomness 281 | oy = -tip[1]*(4/SCALE + 2*dispersion[0]) - side[1]*dispersion[1] 282 | impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy) 283 | p = self._create_particle(3.5, impulse_pos[0], impulse_pos[1], m_power) # particles are just a decoration, 3.5 is here to make particle speed adequate 284 | p.ApplyLinearImpulse( ( ox*MAIN_ENGINE_POWER*m_power, oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) 285 | self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER*m_power, -oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) 286 | 287 | s_power = 0.0 288 | if (self.continuous and np.abs(action[1]) > 0.5) or (not self.continuous and action in [1,3]): 289 | # Orientation engines 290 | if self.continuous: 291 | direction = np.sign(action[1]) 292 | s_power = np.clip(np.abs(action[1]), 0.5,1.0) 293 | assert s_power>=0.5 and s_power <= 1.0 294 | else: 295 | direction = action-2 296 | s_power = 1.0 297 | ox = tip[0]*dispersion[0] + side[0]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) 298 | oy = -tip[1]*dispersion[0] - side[1]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) 299 | impulse_pos = (self.lander.position[0] + ox - tip[0]*17/SCALE, self.lander.position[1] + oy + tip[1]*SIDE_ENGINE_HEIGHT/SCALE) 300 | p = 
self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power) 301 | p.ApplyLinearImpulse( ( ox*SIDE_ENGINE_POWER*s_power, oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) 302 | self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER*s_power, -oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) 303 | 304 | self.world.Step(1.0/FPS, 6*30, 2*30) 305 | 306 | pos = self.lander.position 307 | vel = self.lander.linearVelocity 308 | helipad_x = (self.helipad_x1 + self.helipad_x2) / 2 309 | state = [ 310 | (pos.x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2), 311 | (pos.y - (self.helipad_y+LEG_DOWN/SCALE)) / (VIEWPORT_H/SCALE/2), 312 | vel.x*(VIEWPORT_W/SCALE/2)/FPS, 313 | vel.y*(VIEWPORT_H/SCALE/2)/FPS, 314 | self.lander.angle, 315 | 20.0*self.lander.angularVelocity/FPS, 316 | 1.0 if self.legs[0].ground_contact else 0.0, 317 | 1.0 if self.legs[1].ground_contact else 0.0, 318 | (helipad_x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2) 319 | ] 320 | if self.terrain_in_ob: 321 | for x, y in zip(self.terrain_x, self.terrain_y): 322 | state.append((x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2)) 323 | state.append((y - (self.helipad_y+LEG_DOWN/SCALE)) / (VIEWPORT_H/SCALE/2)) 324 | assert len(state)==N_OBS_DIM + 22 * self.terrain_in_ob 325 | self.curr_step += 1 326 | 327 | reward = 0 328 | dx = (pos.x - helipad_x) / (VIEWPORT_W/SCALE/2) 329 | if self.goal_seeking: 330 | shaping = \ 331 | - 100*np.sqrt(dx*dx + state[1]*state[1]) \ 332 | - 100*np.sqrt(state[2]*state[2] + state[3]*state[3]) \ 333 | - 100*abs(state[4]) + 10*state[6] + 10*state[7] # And ten points for legs contact, the idea is if you 334 | # lose contact again after landing, you get negative reward 335 | 336 | # Remove distance to goal from reward shaping. CHANGED 337 | else: 338 | shaping = \ 339 | - 100*np.sqrt(state[2]*state[2] + state[3]*state[3]) \ 340 | - 100*abs(state[4]) + 10*state[6] + 10*state[7] # And ten points for legs contact, the idea is if you 341 | # lose contact again after landing, you get negative reward 342 | if self.prev_shaping is not None: 343 | reward = shaping - self.prev_shaping 344 | self.prev_shaping = shaping 345 | 346 | reward -= m_power*FUEL_COST_MAIN 347 | reward -= s_power*FUEL_COST_SIDE 348 | 349 | oob = abs(state[0]) >= 1.0 350 | timeout = self.curr_step >= MAX_NUM_STEPS 351 | not_awake = not self.lander.awake 352 | 353 | at_site = pos.x >= self.helipad_x1 and pos.x <= self.helipad_x2 354 | if self.goal_seeking: 355 | self.at_site[:-1] = self.at_site[1:] 356 | self.at_site[-1] = pos.x >= self.helipad_x1 and pos.x <= self.helipad_x2 and state[1] <= GROUND_SLACK 357 | self.grounded[:-1] = self.grounded[1:] 358 | self.grounded[-1] = self.legs[0].ground_contact and self.legs[1].ground_contact 359 | landed = all(self.at_site) and any(self.grounded) 360 | else: 361 | landed = False 362 | 363 | 364 | done = self.game_over or oob or not_awake or timeout or landed 365 | success = False 366 | crash = False 367 | if done: 368 | if self.game_over or oob: 369 | reward -= 100 370 | crash = True 371 | self.lander.color1 = (1.0,0.0,0.0) 372 | elif ((not timeout and at_site) or landed) and self.goal_seeking: 373 | success = True 374 | reward += 100 375 | self.lander.color1 = (0.0,1.0,0.0) 376 | if self.human_test: 377 | self.render() 378 | import time 379 | time.sleep(1.0) 380 | 381 | info = {'success': success, 'crash': crash} 382 | return np.array(state, dtype=np.float32), reward, done, info 383 | 384 | def render(self, mode='human'): 385 | from gym.envs.classic_control import rendering 386 | if self.viewer is None: 387 
| self.viewer = rendering.Viewer(VIEWPORT_W, VIEWPORT_H) 388 | self.viewer.set_bounds(0, VIEWPORT_W/SCALE, 0, VIEWPORT_H/SCALE) 389 | 390 | for obj in self.particles: 391 | obj.ttl -= 0.15 392 | obj.color1 = (max(0.2,0.2+obj.ttl), max(0.2,0.5*obj.ttl), max(0.2,0.5*obj.ttl)) 393 | obj.color2 = (max(0.2,0.2+obj.ttl), max(0.2,0.5*obj.ttl), max(0.2,0.5*obj.ttl)) 394 | 395 | self._clean_particles(False) 396 | 397 | for p in self.sky_polys: 398 | self.viewer.draw_polygon(p, color=(0,0,0)) 399 | 400 | for obj in self.particles + self.drawlist: 401 | for f in obj.fixtures: 402 | trans = f.body.transform 403 | if type(f.shape) is circleShape: 404 | t = rendering.Transform(translation=trans*f.shape.pos) 405 | self.viewer.draw_circle(f.shape.radius, 20, color=obj.color1).add_attr(t) 406 | self.viewer.draw_circle(f.shape.radius, 20, color=obj.color2, filled=False, linewidth=2).add_attr(t) 407 | else: 408 | path = [trans*v for v in f.shape.vertices] 409 | self.viewer.draw_polygon(path, color=obj.color1) 410 | path.append(path[0]) 411 | self.viewer.draw_polyline(path, color=obj.color2, linewidth=2) 412 | 413 | for x in [self.helipad_x1, self.helipad_x2]: 414 | flagy1 = self.helipad_y 415 | flagy2 = flagy1 + 50/SCALE 416 | self.viewer.draw_polyline( [(x, flagy1), (x, flagy2)], color=(1,1,1) ) 417 | self.viewer.draw_polygon( [(x, flagy2), (x, flagy2-10/SCALE), (x+25/SCALE, flagy2-5/SCALE)], color=(0.8,0.8,0) ) 418 | 419 | return self.viewer.render(return_rgb_array = mode=='rgb_array') 420 | 421 | def close(self): 422 | if self.viewer is not None: 423 | self.viewer.close() 424 | self.viewer = None 425 | 426 | def state_dict(self): 427 | return {'rng': self.np_random.get_state()} 428 | 429 | def load_state_dict(self, state_dict): 430 | self.np_random.set_state(state_dict['rng']) 431 | 432 | 433 | class LunarLanderContinuous(LunarLander): 434 | continuous = True 435 | 436 | class LunarLanderNoGoal(LunarLander): 437 | goal_seeking = False 438 | 439 | class LunarLanderContinuousNoGoal(LunarLander): 440 | continuous = True 441 | goal_seeking = False 442 | 443 | def heuristic(env, s): 444 | # Heuristic for: 445 | # 1. Testing. 446 | # 2. Demonstration rollout. 
447 | angle_targ = s[0]*0.5 + s[2]*1.0 # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed) 448 | if angle_targ > 0.4: angle_targ = 0.4 # more than 0.4 radians (22 degrees) is bad 449 | if angle_targ < -0.4: angle_targ = -0.4 450 | hover_targ = 0.55*np.abs(s[0]) # target y should be proporional to horizontal offset 451 | 452 | # PID controller: s[4] angle, s[5] angularSpeed 453 | angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0 454 | #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo)) 455 | 456 | # PID controller: s[1] vertical coordinate s[3] vertical speed 457 | hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5 458 | #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo)) 459 | 460 | if s[6] or s[7]: # legs have contact 461 | angle_todo = 0 462 | hover_todo = -(s[3])*0.5 # override to reduce fall speed, that's all we need after contact 463 | 464 | if env.continuous: 465 | a = np.array( [hover_todo*20 - 1, -angle_todo*20] ) 466 | a = np.clip(a, -1, +1) 467 | else: 468 | a = 0 469 | if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2 470 | elif angle_todo < -0.05: a = 3 471 | elif angle_todo > +0.05: a = 1 472 | return a 473 | 474 | def demo_heuristic_lander(env, seed=None, render=False): 475 | env.seed(seed) 476 | total_reward = 0 477 | steps = 0 478 | s = env.reset() 479 | while True: 480 | a = heuristic(env, s) 481 | s, r, done, info = env.step(a) 482 | total_reward += r 483 | 484 | if render: 485 | still_open = env.render() 486 | if still_open == False: break 487 | 488 | if steps % 20 == 0 or done: 489 | print("observations:", " ".join(["{:+0.2f}".format(x) for x in s])) 490 | print("step {} total_reward {:+0.2f}".format(steps, total_reward)) 491 | steps += 1 492 | if done: break 493 | return total_reward 494 | 495 | 496 | if __name__ == '__main__': 497 | demo_heuristic_lander(LunarLander(), render=True) 498 | --------------------------------------------------------------------------------
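# ---------------------------------------------------------------------------
# Editorial sketch, not a file from this repository: a toy, self-contained
# illustration of the softplus-parameterized Lagrange multiplier update used
# in ppo/constrained_residual_ppo.py. The loop data is synthetic and opt_l /
# avg_episode_reward are local stand-ins; only the update rule mirrors the
# repository code.
import torch
import torch.nn.functional as F

log_lambda_ = torch.nn.Parameter(torch.tensor([0.0]))
opt_l = torch.optim.Adam([log_lambda_], lr=0.1)
reward_threshold = -0.05

for avg_episode_reward in [-1.0, -0.5, -0.1, 0.0, 0.2]:
    lambda_ = F.softplus(log_lambda_)  # softplus keeps the multiplier positive
    # Same sign convention as the repo's loss['lambda']: gradient descent on
    # this dual term increases lambda_ while reward < reward_threshold and
    # decreases it once the constraint is satisfied.
    dual_loss = lambda_ * (avg_episode_reward - reward_threshold)
    opt_l.zero_grad()
    dual_loss.backward()
    opt_l.step()
    print(f"reward={avg_episode_reward:+.2f}  "
          f"lambda={float(F.softplus(log_lambda_)):.3f}")
# ---------------------------------------------------------------------------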