├── ppo ├── train.sh ├── models │ ├── pretrained_drone │ │ ├── ckpts │ │ │ └── 100007936.pt │ │ └── config.gin │ ├── pretrained_lunar │ │ ├── ckpts │ │ │ └── 100007936.pt │ │ └── config.gin │ ├── drone_reacher_agent │ │ └── ckpts │ │ │ └── 010485760.pt │ ├── lunar_lander_agent │ │ └── ckpts │ │ │ └── 001000000.pt │ ├── behavioral_cloning_agents_drone │ │ ├── bc0 │ │ │ └── ckpts │ │ │ │ └── 002728350.pt │ │ ├── bc1 │ │ │ └── ckpts │ │ │ │ └── 002556300.pt │ │ ├── bc2 │ │ │ └── ckpts │ │ │ │ └── 002116950.pt │ │ ├── bc3 │ │ │ └── ckpts │ │ │ │ └── 001719600.pt │ │ ├── bc4 │ │ │ └── ckpts │ │ │ │ └── 002935500.pt │ │ ├── bc5 │ │ │ └── ckpts │ │ │ │ └── 002101950.pt │ │ ├── bc6 │ │ │ └── ckpts │ │ │ │ └── 001462650.pt │ │ ├── bc7 │ │ │ └── ckpts │ │ │ │ └── 002159100.pt │ │ ├── bc8 │ │ │ └── ckpts │ │ │ │ └── 002733300.pt │ │ ├── bc9 │ │ │ └── ckpts │ │ │ │ └── 002580450.pt │ │ ├── bc10 │ │ │ └── ckpts │ │ │ │ └── 002958900.pt │ │ ├── bc11 │ │ │ └── ckpts │ │ │ │ └── 001742550.pt │ │ ├── bc12 │ │ │ └── ckpts │ │ │ │ └── 002901450.pt │ │ ├── bc13 │ │ │ └── ckpts │ │ │ │ └── 003014400.pt │ │ └── bc14 │ │ │ └── ckpts │ │ │ └── 001760700.pt │ └── behavioral_cloning_agents_lunar │ │ ├── bc0 │ │ └── ckpts │ │ │ └── 001747200.pt │ │ ├── bc1 │ │ └── ckpts │ │ │ └── 001272700.pt │ │ ├── bc2 │ │ └── ckpts │ │ │ └── 001376400.pt │ │ ├── bc3 │ │ └── ckpts │ │ │ └── 001579400.pt │ │ ├── bc4 │ │ └── ckpts │ │ │ └── 001349200.pt │ │ ├── bc5 │ │ └── ckpts │ │ │ └── 000993600.pt │ │ ├── bc6 │ │ └── ckpts │ │ │ └── 001220200.pt │ │ └── bc7 │ │ └── ckpts │ │ └── 001122700.pt ├── __init__.py ├── joystick_control.py ├── actor.py ├── configs │ ├── ppo_drone_noisy.gin │ ├── ppo_drone_laggy.gin │ ├── ppo_drone.gin │ ├── ppo_lunar_noisy.gin │ ├── ppo_lunar_laggy.gin │ └── ppo_lunar.gin └── constrained_residual_ppo.py ├── drone_sim ├── train.sh ├── train.py ├── watch.py ├── __init__.py ├── ppo.gin ├── actor.py ├── joystick_agent.py ├── sim.py ├── env.py └── rendering.py ├── imitation_learning ├── train.sh ├── __init__.py ├── drone_ob_norm.py ├── bc_lunar.gin ├── bc_drone.gin ├── ob_norm.py ├── watch.py ├── data_collection_lunar.py ├── data_collection_drone.py └── trainer.py ├── lunar_lander ├── train.sh ├── train.py ├── watch.py ├── td3.gin ├── __init__.py ├── actor.py ├── joystick_agent.py ├── fuel_wrapper.py ├── env_reacher.py └── env_lander.py ├── __init__.py ├── launch_docker.sh ├── Dockerfile ├── .gitignore ├── residual_wrapper.py ├── base_actors.py └── README.md /ppo/train.sh: -------------------------------------------------------------------------------- 1 | python -m dl.main $@ 2 | -------------------------------------------------------------------------------- /drone_sim/train.sh: -------------------------------------------------------------------------------- 1 | python -m dl.main logs ppo.gin 2 | -------------------------------------------------------------------------------- /imitation_learning/train.sh: -------------------------------------------------------------------------------- 1 | python -m dl.main $@ 2 | -------------------------------------------------------------------------------- /lunar_lander/train.sh: -------------------------------------------------------------------------------- 1 | python -m dl.main logs td3.gin 2 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from residual_shared_autonomy.residual_wrapper import ResidualWrapper 2 | from 
residual_shared_autonomy.base_actors import * 3 | -------------------------------------------------------------------------------- /ppo/models/pretrained_drone/ckpts/100007936.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/pretrained_drone/ckpts/100007936.pt -------------------------------------------------------------------------------- /ppo/models/pretrained_lunar/ckpts/100007936.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/pretrained_lunar/ckpts/100007936.pt -------------------------------------------------------------------------------- /ppo/models/drone_reacher_agent/ckpts/010485760.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/drone_reacher_agent/ckpts/010485760.pt -------------------------------------------------------------------------------- /ppo/models/lunar_lander_agent/ckpts/001000000.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/lunar_lander_agent/ckpts/001000000.pt -------------------------------------------------------------------------------- /ppo/__init__.py: -------------------------------------------------------------------------------- 1 | from residual_shared_autonomy.ppo.actor import * 2 | from residual_shared_autonomy.ppo.constrained_residual_ppo import ConstrainedResidualPPO 3 | -------------------------------------------------------------------------------- /launch_docker.sh: -------------------------------------------------------------------------------- 1 | sudo x-docker run --gpus all --net=host -it --rm --privileged -v `pwd`:/root/pkgs/residual_shared_autonomy residual_shared_autonomy:latest bash 2 | -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc0/ckpts/002728350.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc0/ckpts/002728350.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc1/ckpts/002556300.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc1/ckpts/002556300.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc2/ckpts/002116950.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc2/ckpts/002116950.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc3/ckpts/001719600.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc3/ckpts/001719600.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc4/ckpts/002935500.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc4/ckpts/002935500.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc5/ckpts/002101950.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc5/ckpts/002101950.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc6/ckpts/001462650.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc6/ckpts/001462650.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc7/ckpts/002159100.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc7/ckpts/002159100.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc8/ckpts/002733300.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc8/ckpts/002733300.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc9/ckpts/002580450.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc9/ckpts/002580450.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_lunar/bc0/ckpts/001747200.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_lunar/bc0/ckpts/001747200.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_lunar/bc1/ckpts/001272700.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_lunar/bc1/ckpts/001272700.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_lunar/bc2/ckpts/001376400.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_lunar/bc2/ckpts/001376400.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_lunar/bc3/ckpts/001579400.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_lunar/bc3/ckpts/001579400.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_lunar/bc4/ckpts/001349200.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_lunar/bc4/ckpts/001349200.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_lunar/bc5/ckpts/000993600.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_lunar/bc5/ckpts/000993600.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_lunar/bc6/ckpts/001220200.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_lunar/bc6/ckpts/001220200.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_lunar/bc7/ckpts/001122700.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_lunar/bc7/ckpts/001122700.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc10/ckpts/002958900.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc10/ckpts/002958900.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc11/ckpts/001742550.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc11/ckpts/001742550.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc12/ckpts/002901450.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc12/ckpts/002901450.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc13/ckpts/003014400.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc13/ckpts/003014400.pt -------------------------------------------------------------------------------- /ppo/models/behavioral_cloning_agents_drone/bc14/ckpts/001760700.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbschaff/rsa/HEAD/ppo/models/behavioral_cloning_agents_drone/bc14/ckpts/001760700.pt -------------------------------------------------------------------------------- /imitation_learning/__init__.py: -------------------------------------------------------------------------------- 1 | from residual_shared_autonomy.imitation_learning.trainer import * 2 | from residual_shared_autonomy.imitation_learning.ob_norm import * 3 | from residual_shared_autonomy.imitation_learning.drone_ob_norm import * 4 | -------------------------------------------------------------------------------- /drone_sim/train.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import dl 3 | 4 | 5 | if __name__=='__main__': 6 | parser = argparse.ArgumentParser(description='Train Agent.') 7 | parser.add_argument('--expdir', type=str, help='expdir', required=True) 8 | parser.add_argument('--gin_config', type=str, help='gin config', required=True) 9 | parser.add_argument('-b', '--gin_bindings', nargs='+', help='gin bindings to overwrite config') 10 | args = parser.parse_args() 11 | dl.load_config(args.gin_config, args.gin_bindings) 12 | dl.train(args.expdir) 13 | -------------------------------------------------------------------------------- /lunar_lander/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import dl 3 | 4 | 5 | if __name__=='__main__': 6 | parser = argparse.ArgumentParser(description='Train Agent.') 7 | parser.add_argument('--expdir', type=str, help='expdir', required=True) 8 | parser.add_argument('--gin_config', type=str, help='gin config', required=True) 9 | parser.add_argument('-b', '--gin_bindings', nargs='+', help='gin bindings to overwrite config') 10 | args = parser.parse_args() 11 | dl.load_config(args.gin_config, args.gin_bindings) 12 | dl.train(args.expdir) 13 | -------------------------------------------------------------------------------- /imitation_learning/drone_ob_norm.py: -------------------------------------------------------------------------------- 1 | """Observation norm used by behavioral cloning agents and the shared autonomy agent.""" 2 | import numpy as np 3 | import gin 4 | 5 | 6 | @gin.configurable 7 | def drone_bc_mean(): 8 | return np.array([0.0043, -0.0065, 0.0965, 0.7479, -0.0983, -2.4260, 9 | -0.0219, -0.0484, 0.3360, -0.0124, -0.0320, 0.0437, 10 | 0.0000, 0.0000, 0.0570], 11 | dtype=np.float32) 12 | 13 | 14 | @gin.configurable 15 | def drone_bc_std(): 16 | return np.array([0.2092, 0.2249, 0.0903, 4.4870, 4.4999, 4.6344, 0.4893, 17 | 0.2624, 2.6024, 0.2528, 0.2541, 0.5348, 0.3333, 0.3333, 18 | 0.0455], dtype=np.float32) 19 | -------------------------------------------------------------------------------- /lunar_lander/watch.py: -------------------------------------------------------------------------------- 1 | """Render trained agents.""" 2 | import dl 3 | import argparse 4 | from dl.rl import rl_record, misc, PPO 5 | import os 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser(description='Train Script.') 9 | parser.add_argument('logdir', type=str, help='logdir') 10 | args = parser.parse_args() 11 | 12 | dl.load_config(os.path.join(args.logdir, 'config.gin')) 13 | 14 | t = PPO(args.logdir, nenv=1) 15 | t.load() 16 | t.pi.eval() 17 | misc.set_env_to_eval_mode(t.env) 18 | os.makedirs(os.path.join(t.logdir, 'video'), exist_ok=True) 19 | outfile = os.path.join(t.logdir, 'video', 20 | t.ckptr.format.format(t.t) + '.mp4') 21 | rl_record(t.env, t.pi, 10, outfile, t.device) 22 | t.close() 23 | -------------------------------------------------------------------------------- /drone_sim/watch.py: -------------------------------------------------------------------------------- 1 | """Render trained agents.""" 2 | import dl 3 | import argparse 4 | from dl.rl import rl_record, misc, PPO, VecFrameStack 5 | import residual_shared_autonomy.drone_sim 6 | import os 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser(description='Train Script.') 10 | parser.add_argument('logdir', type=str, help='logdir') 11 | args = 
parser.parse_args() 12 | 13 | dl.load_config(os.path.join(args.logdir, 'config.gin')) 14 | 15 | t = PPO(args.logdir, nenv=1) 16 | t.load() 17 | t.pi.eval() 18 | env = t.env 19 | misc.set_env_to_eval_mode(env) 20 | os.makedirs(os.path.join(t.logdir, 'video'), exist_ok=True) 21 | outfile = os.path.join(t.logdir, 'video', 22 | t.ckptr.format.format(t.t) + '.mp4') 23 | rl_record(env, t.pi, 10, outfile, t.device) 24 | t.close() 25 | -------------------------------------------------------------------------------- /imitation_learning/bc_lunar.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.imitation_learning 2 | 3 | 4 | train.algorithm = @BCTrainer 5 | train.maxt = 100 6 | train.seed = 0 7 | train.eval = True 8 | train.eval_period = 1 9 | train.save_period = 1 10 | train.maxseconds = None 11 | 12 | optim.Adam.lr = 0.001 13 | optim.Adam.betas = (0.9, 0.999) 14 | optim.Adam.eps = 1e-8 15 | 16 | BCTrainer.opt = @optim.Adam 17 | BCTrainer.model = @BCNet() 18 | BCTrainer.datafile = './data/chip.pt' # Change this to your datafile 19 | BCTrainer.batch_size = 32 20 | BCTrainer.num_workers = 1 21 | BCTrainer.gpu = True 22 | 23 | DemonstrationData.mean = @bc_mean() 24 | DemonstrationData.std = @bc_std() 25 | 26 | BCNet.ob_shape = 9 27 | BCNet.action_shape = 2 28 | BCNet.nunits = 128 29 | 30 | DiagGaussian.constant_log_std = False 31 | 32 | Checkpointer.ckpt_period = 2 33 | -------------------------------------------------------------------------------- /imitation_learning/bc_drone.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.imitation_learning 2 | 3 | 4 | train.algorithm = @BCTrainer 5 | train.maxt = 150 6 | train.seed = 0 7 | train.eval = True 8 | train.eval_period = 1 9 | train.save_period = 1 10 | train.maxseconds = None 11 | 12 | optim.Adam.lr = 0.001 13 | optim.Adam.betas = (0.9, 0.999) 14 | optim.Adam.eps = 1e-8 15 | 16 | BCTrainer.opt = @optim.Adam 17 | BCTrainer.model = @BCNet() 18 | BCTrainer.datafile = './drone_data/chip.pt' # Change this to your data file 19 | BCTrainer.batch_size = 32 20 | BCTrainer.num_workers = 1 21 | BCTrainer.gpu = True 22 | 23 | DemonstrationData.mean = @drone_bc_mean() 24 | DemonstrationData.std = @drone_bc_std() 25 | 26 | BCNet.ob_shape = 15 27 | BCNet.action_shape = 4 28 | BCNet.nunits = 128 29 | 30 | DiagGaussian.constant_log_std = False 31 | 32 | Checkpointer.ckpt_period = 2 33 | -------------------------------------------------------------------------------- /drone_sim/__init__.py: -------------------------------------------------------------------------------- 1 | from residual_shared_autonomy.drone_sim.sim import Drone 2 | from residual_shared_autonomy.drone_sim.rendering import DroneRenderer, DroneFollower 3 | from residual_shared_autonomy.drone_sim.actor import drone_ppo_policy_fn 4 | try: 5 | from residual_shared_autonomy.drone_sim.joystick_agent import DroneJoystickActor 6 | except: 7 | pass 8 | 9 | from gym.envs.registration import register 10 | 11 | register( 12 | id='DroneReacherHuman-v0', 13 | entry_point='residual_shared_autonomy.drone_sim.env:DroneReacherHuman', 14 | max_episode_steps=1000, 15 | reward_threshold=200 16 | ) 17 | 18 | register( 19 | id='DroneReacherBot-v0', 20 | entry_point='residual_shared_autonomy.drone_sim.env:DroneReacherBot', 21 | max_episode_steps=1000, 22 | reward_threshold=200 23 | ) 24 | 25 | register( 26 | id='DroneStabilizer-v0', 27 | 
entry_point='residual_shared_autonomy.drone_sim.env:DroneStabilizer', 28 | max_episode_steps=1000, 29 | reward_threshold=200 30 | ) 31 | -------------------------------------------------------------------------------- /imitation_learning/ob_norm.py: -------------------------------------------------------------------------------- 1 | """Observation norm used by behavioral cloning agents and the shared autonomy agent.""" 2 | import numpy as np 3 | import gin 4 | 5 | 6 | @gin.configurable 7 | def bc_mean(): 8 | return np.array([0.0024, 0.7034, 0.0347, -0.2975, -0.0375, -0.0126, 9 | 0.0528, 0.0550, -0.0248, -1.0000, -0.0947, -0.7991, 10 | -0.0927, -0.6008, -0.0884, -0.3996, -0.0877, -0.1998, 11 | -0.0865, 0.0000, -0.0876, 0.1998, -0.0892, 0.3996, 12 | -0.0901, 0.6008, -0.0882, 0.7991, -0.0827, 1.0000, 13 | -0.0851], dtype=np.float32) 14 | 15 | 16 | @gin.configurable 17 | def bc_std(): 18 | return np.array([0.3812, 0.4952, 0.6652, 0.3537, 0.5616, 0.4115, 0.2236, 19 | 0.2279, 0.5270, 0.0000, 0.1382, 0.0000, 0.1300, 0.0000, 20 | 0.1248, 0.0000, 0.1140, 0.0000, 0.1124, 0.0000, 0.1058, 21 | 0.0000, 0.1059, 0.0000, 0.1081, 0.0000, 0.1262, 0.0000, 22 | 0.1441, 0.0000, 0.1496], dtype=np.float32) 23 | -------------------------------------------------------------------------------- /drone_sim/ppo.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.drone_sim 2 | import residual_shared_autonomy.imitation_learning 3 | 4 | train.algorithm = @PPO 5 | train.maxt = 20000000 6 | train.seed = 0 7 | train.eval = True 8 | train.eval_period = 1000000 9 | train.save_period = 1000000 10 | train.maxseconds = None 11 | 12 | optim.Adam.lr = 0.001 13 | optim.Adam.betas = (0.9, 0.999) 14 | optim.Adam.eps = 1e-5 15 | 16 | PPO.env_fn = @make_env 17 | PPO.policy_fn = @drone_ppo_policy_fn 18 | PPO.nenv = 64 19 | PPO.eval_num_episodes = 100 20 | PPO.record_num_episodes = 0 21 | PPO.rollout_length = 1024 22 | PPO.batch_size = 512 23 | PPO.gamma = 0.99 24 | PPO.lambda_ = 0.95 25 | PPO.norm_advantages = True 26 | PPO.optimizer = @optim.Adam 27 | PPO.clip_param = 0.2 28 | PPO.epochs_per_rollout = 4 29 | PPO.max_grad_norm = 0.5 30 | PPO.ent_coef = 0.01 31 | PPO.vf_coef = 0.5 32 | PPO.gpu = True 33 | 34 | Checkpointer.ckpt_period = 1000000 35 | 36 | make_env.env_id = "DroneReacherBot-v0" 37 | make_env.norm_observations = True 38 | 39 | DiagGaussian.constant_log_std = False 40 | 41 | VecObsNormWrapper.steps = 10000 42 | VecObsNormWrapper.mean = @drone_bc_mean() 43 | VecObsNormWrapper.std = @drone_bc_std() 44 | VecObsNormWrapper.eps = 1e-2 45 | VecObsNormWrapper.log = True 46 | VecObsNormWrapper.log_prob = 0.01 47 | -------------------------------------------------------------------------------- /lunar_lander/td3.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.lunar_lander 2 | import residual_shared_autonomy.imitation_learning 3 | 4 | train.algorithm = @TD3 5 | train.maxt = 1000000 6 | train.seed = 0 7 | train.eval = True 8 | train.eval_period = 100000 9 | train.save_period = 100000 10 | train.maxseconds = None 11 | 12 | TD3.env_fn = @make_env 13 | TD3.policy_fn = @lunar_lander_policy_fn 14 | TD3.qf_fn = @lunar_lander_qf_fn 15 | TD3.nenv = 1 16 | TD3.eval_num_episodes = 20 17 | TD3.gpu = True 18 | TD3.record_num_episodes = 5 19 | TD3.buffer_size = 100000 20 | TD3.frame_stack = 1 21 | TD3.learning_starts = 10000 22 | TD3.update_period = 1 23 | TD3.env_fn = @make_env 24 | TD3.optimizer = 
@optim.Adam 25 | TD3.batch_size = 256 26 | TD3.lr = 3e-4 27 | TD3.gamma = 0.99 28 | TD3.exploration_noise = 0.1 29 | TD3.policy_noise = 0.2 30 | TD3.policy_noise_clip = 0.5 31 | TD3.policy_update_period = 2 32 | TD3.target_smoothing_coef = 0.005 33 | TD3.reward_scale = 1 34 | TD3.log_period = 100 35 | 36 | Checkpointer.ckpt_period = 100000 37 | 38 | optim.Adam.betas = (0.9, 0.999) 39 | 40 | make_env.env_id = "LunarLanderRandomContinuous-v2" 41 | make_env.norm_observations = True 42 | 43 | VecObsNormWrapper.steps = 10000 44 | VecObsNormWrapper.mean = @bc_mean() 45 | VecObsNormWrapper.std = @bc_std() 46 | VecObsNormWrapper.eps = 1e-2 47 | VecObsNormWrapper.log = True 48 | VecObsNormWrapper.log_prob = 0.01 49 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cudagl:10.1-devel-ubuntu18.04 2 | 3 | RUN apt-get update && apt-get upgrade -y 4 | 5 | # Install packages 6 | RUN apt-get install -y wget git vim libsm6 libxext6 libxrender-dev ffmpeg python-opengl 7 | 8 | # install anaconda 9 | RUN wget https://repo.anaconda.com/archive/Anaconda3-2019.10-Linux-x86_64.sh 10 | RUN bash Anaconda3-2019.10-Linux-x86_64.sh -b 11 | RUN rm Anaconda3-2019.10-Linux-x86_64.sh 12 | ENV PATH /root/anaconda3/bin:$PATH 13 | RUN conda update conda 14 | RUN yes | conda update anaconda 15 | RUN yes | conda update --all 16 | RUN conda init 17 | 18 | # Install packages 19 | RUN conda install -y -c pytorch pytorch torchvision 20 | RUN conda install -y tensorflow-gpu==1.14.0 21 | RUN pip install gin-config 22 | RUN pip install gym[atari] 23 | RUN pip install gym[box2d] 24 | 25 | WORKDIR /root 26 | RUN git clone https://github.com/openai/baselines.git 27 | WORKDIR /root/baselines 28 | RUN git checkout adba88b218edbb3f3bd8044dc963f8ca0901d6c6 29 | RUN pip install . 
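# Note: openai/baselines (above) and cbschaff/dl (below) are pinned to fixed commits rather than tracking their default branches.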
30 | 31 | # Add a directory for python packages to be mounted 32 | ENV PYTHONPATH /root/pkgs:$PYTHONPATH 33 | 34 | RUN apt-get install -y freeglut3-dev 35 | RUN conda install -y PyOpenGL 36 | RUN pip install pygame PyOpenGL_accelerate 37 | 38 | WORKDIR /root/pkgs 39 | RUN git clone https://github.com/cbschaff/dl.git 40 | WORKDIR /root/pkgs/dl 41 | RUN git checkout 650db8abc90053305be95e73ce28da624e9092dc 42 | 43 | WORKDIR /root 44 | # Bash entrypoint 45 | ENTRYPOINT /bin/bash 46 | -------------------------------------------------------------------------------- /imitation_learning/watch.py: -------------------------------------------------------------------------------- 1 | """Render trained behavioral cloning agents.""" 2 | import dl 3 | from dl.rl import make_env, VecObsNormWrapper 4 | from residual_shared_autonomy.imitation_learning import bc_mean, bc_std 5 | from residual_shared_autonomy.imitation_learning import drone_bc_mean, drone_bc_std 6 | from residual_shared_autonomy.imitation_learning import BCTrainer 7 | import residual_shared_autonomy.lunar_lander 8 | import residual_shared_autonomy.drone_sim 9 | import argparse 10 | import os 11 | import torch 12 | 13 | if __name__ == '__main__': 14 | parser = argparse.ArgumentParser(description='Train Script.') 15 | parser.add_argument('logdir', type=str, help='logdir') 16 | parser.add_argument('--drone', action='store_true', help='render a drone agent') 17 | args = parser.parse_args() 18 | 19 | dl.load_config(os.path.join(args.logdir, 'config.gin')) 20 | 21 | trainer = BCTrainer(args.logdir) 22 | trainer.load() 23 | 24 | id = "DroneReacherBot-v0" if args.drone else "LunarLanderRandomContinuous-v2" 25 | env = make_env(env_id=id, 26 | nenv=1, norm_observations=False) 27 | if args.drone: 28 | env = VecObsNormWrapper(env, mean=drone_bc_mean(), 29 | std=drone_bc_std(), log=False) 30 | else: 31 | env = VecObsNormWrapper(env, mean=bc_mean(), std=bc_std(), log=False) 32 | 33 | for _ in range(5): 34 | ob = env.reset() 35 | env.render() 36 | done = False 37 | while not done: 38 | dist = trainer.model(torch.from_numpy(ob).to(trainer.device)) 39 | ob, _, done, _ = env.step(dist.sample().cpu().numpy()) 40 | env.render() 41 | -------------------------------------------------------------------------------- /ppo/joystick_control.py: -------------------------------------------------------------------------------- 1 | """Control trained co-pilot agents with a joystick.""" 2 | import dl 3 | import argparse 4 | from residual_shared_autonomy.ppo import ConstrainedResidualPPO 5 | from residual_shared_autonomy.lunar_lander import LunarLanderJoystickActor 6 | from residual_shared_autonomy.drone_sim import DroneJoystickActor 7 | import os 8 | 9 | if __name__ == '__main__': 10 | parser = argparse.ArgumentParser(description='Train Script.') 11 | parser.add_argument('logdir', type=str, help='logdir') 12 | parser.add_argument('--drone', action='store_true', help='control drone env') 13 | parser.add_argument('--reacher', action='store_true', help='control lunar reacher env') 14 | args = parser.parse_args() 15 | 16 | if args.drone: 17 | dl.load_config(os.path.join(args.logdir, 'config.gin'), 18 | ['make_env.env_id="DroneReacherBot-v0"']) 19 | trainer = ConstrainedResidualPPO(args.logdir, nenv=1, 20 | base_actor_cls=DroneJoystickActor) 21 | elif args.reacher: 22 | dl.load_config(os.path.join(args.logdir, 'config.gin'), 23 | ['make_env.env_id="LunarLanderReacher-v2"']) 24 | trainer = ConstrainedResidualPPO(args.logdir, nenv=1, 25 | base_actor_cls=LunarLanderJoystickActor) 26 | else: 27 | 
dl.load_config(os.path.join(args.logdir, 'config.gin'), 28 | ['make_env.env_id="LunarLanderRandomContinuous-v2"']) 29 | trainer = ConstrainedResidualPPO(args.logdir, nenv=1, 30 | base_actor_cls=LunarLanderJoystickActor) 31 | trainer.load() 32 | trainer.evaluate() 33 | trainer.close() 34 | -------------------------------------------------------------------------------- /drone_sim/actor.py: -------------------------------------------------------------------------------- 1 | """Defines networks for PPO experiments.""" 2 | from dl.rl.modules import ActorCriticBase, Policy 3 | from dl.modules import DiagGaussian 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import gin 8 | 9 | 10 | class FeedForwardActorCriticBase(ActorCriticBase): 11 | """Policy and Value networks.""" 12 | 13 | def __init__(self, *args, nunits=128, **kwargs): 14 | """Init.""" 15 | self.nunits = nunits 16 | super().__init__(*args, **kwargs) 17 | 18 | def build(self): 19 | """Build.""" 20 | inshape = self.observation_space.shape[0] 21 | self.fc1 = nn.Linear(inshape, self.nunits) 22 | self.fc2 = nn.Linear(self.nunits, self.nunits) 23 | self.dist = DiagGaussian(self.nunits, self.action_space.shape[0]) 24 | for p in self.dist.fc_mean.parameters(): 25 | nn.init.constant_(p, 0.) 26 | 27 | self.vf_fc1 = nn.Linear(inshape, self.nunits) 28 | self.vf_fc2 = nn.Linear(self.nunits, self.nunits) 29 | self.vf_out = nn.Linear(self.nunits, 1) 30 | 31 | def forward(self, x): 32 | """Forward.""" 33 | net = x 34 | net = F.relu(self.fc1(net)) 35 | net = F.relu(self.fc2(net)) 36 | pi = self.dist(net) 37 | 38 | net = x 39 | net = F.relu(self.vf_fc1(net)) 40 | net = F.relu(self.vf_fc2(net)) 41 | vf = self.vf_out(net) 42 | 43 | return pi, vf 44 | 45 | 46 | @gin.configurable 47 | def drone_ppo_policy_fn(env, nunits=128): 48 | """Create a policy network.""" 49 | return Policy(FeedForwardActorCriticBase(env.observation_space, 50 | env.action_space, 51 | nunits=nunits)) 52 | -------------------------------------------------------------------------------- /lunar_lander/__init__.py: -------------------------------------------------------------------------------- 1 | from residual_shared_autonomy.lunar_lander.actor import lunar_lander_policy_fn, lunar_lander_qf_fn 2 | from residual_shared_autonomy.lunar_lander.fuel_wrapper import LunarLanderFuelWrapper 3 | try: 4 | from residual_shared_autonomy.lunar_lander.joystick_agent import LunarLanderJoystickActor 5 | from residual_shared_autonomy.lunar_lander.keyboard_agent import LunarLanderKeyboardActor 6 | except: 7 | pass 8 | 9 | from gym.envs.registration import register 10 | 11 | register( 12 | id='LunarLanderRandom-v2', 13 | entry_point='residual_shared_autonomy.lunar_lander.env_lander:LunarLander', 14 | max_episode_steps=1000, 15 | reward_threshold=200 16 | ) 17 | 18 | register( 19 | id='LunarLanderRandomContinuous-v2', 20 | entry_point='residual_shared_autonomy.lunar_lander.env_lander:LunarLanderContinuous', 21 | max_episode_steps=1000, 22 | reward_threshold=200 23 | ) 24 | 25 | register( 26 | id='LunarLanderRandomNoGoal-v2', 27 | entry_point='residual_shared_autonomy.lunar_lander.env_lander:LunarLanderNoGoal', 28 | max_episode_steps=1000, 29 | reward_threshold=200 30 | ) 31 | 32 | register( 33 | id='LunarLanderRandomContinuousNoGoal-v2', 34 | entry_point='residual_shared_autonomy.lunar_lander.env_lander:LunarLanderContinuousNoGoal', 35 | max_episode_steps=1000, 36 | reward_threshold=200 37 | ) 38 | 39 | register( 40 | id='LunarLanderRandomConstrained-v2', 41 | 
entry_point='residual_shared_autonomy.lunar_lander.env_lander:LunarLanderConstrained', 42 | max_episode_steps=1000, 43 | reward_threshold=200 44 | ) 45 | 46 | register( 47 | id='LunarLanderReacher-v2', 48 | entry_point='residual_shared_autonomy.lunar_lander.env_reacher:LunarLanderReacher', 49 | max_episode_steps=1000, 50 | reward_threshold=200 51 | ) 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # training logs 2 | logs/ 3 | ppo/logs* 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Environments 89 | .env 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | /site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | -------------------------------------------------------------------------------- /imitation_learning/data_collection_lunar.py: -------------------------------------------------------------------------------- 1 | """Collect human data from LunarLander.""" 2 | import gym 3 | from residual_shared_autonomy.lunar_lander import LunarLanderJoystickActor 4 | from dl.rl import ensure_vec_env 5 | import torch 6 | import time 7 | import argparse 8 | import os 9 | 10 | if __name__ == '__main__': 11 | parser = argparse.ArgumentParser(description='Data collection.') 12 | parser.add_argument('datafile', type=str, help='datafile') 13 | parser.add_argument('neps', type=int, help='# of episodes') 14 | parser.add_argument('--overwrite', action='store_true', 15 | help='overwrite data') 16 | args = parser.parse_args() 17 | 18 | env = gym.make("LunarLanderRandomContinuous-v2") 19 | env = ensure_vec_env(env) 20 | 21 | if os.path.exists(args.datafile): 22 | if not args.overwrite: 23 | raise ValueError('Data file {} already exists. 
Add --overwrite to ' 24 | ' overwrite the file.'.format(args.datafile)) 25 | 26 | actor = LunarLanderJoystickActor(env) 27 | 28 | transitions = [] 29 | for i in range(args.neps): 30 | print(i) 31 | ob = env.reset() 32 | env.render() 33 | done = False 34 | time.sleep(1.) 35 | 36 | while not done: 37 | action = actor(ob) 38 | ob_next, reward, done, _ = env.step(action) 39 | transitions.append([ob, action.copy(), reward, done]) 40 | ob = ob_next 41 | time.sleep(0.5) 42 | 43 | obs, actions, rewards, dones = zip(*transitions) 44 | data = {'obs': torch.Tensor(obs).squeeze(), 45 | 'actions': torch.Tensor(actions).squeeze(), 46 | 'rewards': torch.Tensor(rewards).squeeze(), 47 | 'dones': torch.Tensor(dones).squeeze()} 48 | 49 | torch.save(data, args.datafile) 50 | -------------------------------------------------------------------------------- /lunar_lander/actor.py: -------------------------------------------------------------------------------- 1 | """Defines networks for Lunar Lander experiments.""" 2 | from dl.rl.modules import PolicyBase, ContinuousQFunctionBase 3 | from dl.rl.modules import QFunction, UnnormActionPolicy 4 | from dl.modules import TanhDelta 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import gin 9 | 10 | 11 | class FeedForwardPolicyBase(PolicyBase): 12 | """Policy network.""" 13 | 14 | def build(self): 15 | """Build.""" 16 | self.fc1 = nn.Linear(9, 256) 17 | self.fc2 = nn.Linear(256, 256) 18 | self.dist = TanhDelta(256, self.action_space.shape[0]) 19 | 20 | def forward(self, x): 21 | """Forward.""" 22 | x = x[:, :9] 23 | x = F.relu(self.fc1(x)) 24 | x = F.relu(self.fc2(x)) 25 | return self.dist(x) 26 | 27 | 28 | class AppendActionFeedForwardQFBase(ContinuousQFunctionBase): 29 | """Q network.""" 30 | 31 | def build(self): 32 | """Build.""" 33 | nin = 9 + self.action_space.shape[0] 34 | self.fc1 = nn.Linear(nin, 256) 35 | self.fc2 = nn.Linear(256, 256) 36 | self.qvalue = nn.Linear(256, 1) 37 | 38 | def forward(self, x, a): 39 | """Forward.""" 40 | x = x[:, :9] 41 | x = F.relu(self.fc1(torch.cat([x, a], dim=1))) 42 | x = F.relu(self.fc2(x)) 43 | return self.qvalue(x) 44 | 45 | 46 | @gin.configurable 47 | def lunar_lander_policy_fn(env): 48 | """Create a policy network.""" 49 | return UnnormActionPolicy(FeedForwardPolicyBase(env.observation_space, 50 | env.action_space)) 51 | 52 | 53 | @gin.configurable 54 | def lunar_lander_qf_fn(env): 55 | """Create a qfunction network.""" 56 | return QFunction(AppendActionFeedForwardQFBase(env.observation_space, 57 | env.action_space)) 58 | -------------------------------------------------------------------------------- /ppo/actor.py: -------------------------------------------------------------------------------- 1 | """Defines networks for Residual PPO experiments.""" 2 | from dl.rl.modules import ActorCriticBase, Policy 3 | from dl.modules import DiagGaussian 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import gin 8 | 9 | 10 | class FeedForwardActorCriticBase(ActorCriticBase): 11 | """Policy and Value networks.""" 12 | 13 | def __init__(self, *args, nunits=128, **kwargs): 14 | """Init.""" 15 | self.nunits = nunits 16 | super().__init__(*args, **kwargs) 17 | 18 | def build(self): 19 | """Build.""" 20 | inshape = (self.observation_space.spaces[0].shape[0] 21 | + self.observation_space.spaces[1].shape[0]) 22 | self.fc1 = nn.Linear(inshape, self.nunits) 23 | self.fc2 = nn.Linear(self.nunits, self.nunits) 24 | self.dist = DiagGaussian(self.nunits, 
self.action_space.shape[0]) 25 | for p in self.dist.fc_mean.parameters(): 26 | nn.init.constant_(p, 0.) 27 | 28 | self.vf_fc1 = nn.Linear(inshape, self.nunits) 29 | self.vf_fc2 = nn.Linear(self.nunits, self.nunits) 30 | self.vf_out = nn.Linear(self.nunits, 1) 31 | 32 | def forward(self, x): 33 | """Forward.""" 34 | ob = torch.cat(x, axis=1) 35 | net = ob 36 | net = F.relu(self.fc1(net)) 37 | net = F.relu(self.fc2(net)) 38 | pi = self.dist(net) 39 | 40 | net = ob 41 | net = F.relu(self.vf_fc1(net)) 42 | net = F.relu(self.vf_fc2(net)) 43 | vf = self.vf_out(net) 44 | 45 | return pi, vf 46 | 47 | 48 | @gin.configurable 49 | def ppo_policy_fn(env, nunits=128): 50 | """Create a policy network.""" 51 | return Policy(FeedForwardActorCriticBase(env.observation_space, 52 | env.action_space, 53 | nunits=nunits)) 54 | -------------------------------------------------------------------------------- /imitation_learning/data_collection_drone.py: -------------------------------------------------------------------------------- 1 | """Collect human data from DroneReacher.""" 2 | import gym 3 | import residual_shared_autonomy.drone_sim 4 | from residual_shared_autonomy.drone_sim.joystick_agent import DroneJoystickActor 5 | from dl.rl import ensure_vec_env 6 | import torch 7 | import time 8 | import argparse 9 | import os 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser(description='Data collection.') 13 | parser.add_argument('datafile', type=str, help='datafile') 14 | parser.add_argument('neps', type=int, help='# of episodes') 15 | parser.add_argument('--overwrite', action='store_true', 16 | help='overwrite data') 17 | args = parser.parse_args() 18 | 19 | env = gym.make("DroneReacherBot-v0") 20 | env = ensure_vec_env(env) 21 | 22 | if os.path.exists(args.datafile): 23 | if not args.overwrite: 24 | raise ValueError('Data file {} already exists. Add --overwrite to ' 25 | ' overwrite the file.'.format(args.datafile)) 26 | 27 | actor = DroneJoystickActor(env) 28 | 29 | transitions = [] 30 | for i in range(args.neps): 31 | print(i) 32 | ob = env.reset() 33 | actor.reset() 34 | env.render() 35 | done = False 36 | time.sleep(1.) 
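# Roll out one joystick-controlled episode, storing each (observation, action, reward, done)
# transition. The stacked tensors are saved with torch.save at the end of the script and can
# then be pointed to by BCTrainer.datafile (e.g. in bc_drone.gin) for behavioral cloning.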
37 | 38 | while not done: 39 | action = actor(ob) 40 | ob_next, reward, done, _ = env.step(action) 41 | transitions.append([ob, action.copy(), reward, done]) 42 | ob = ob_next 43 | time.sleep(0.5) 44 | 45 | obs, actions, rewards, dones = zip(*transitions) 46 | data = {'obs': torch.Tensor(obs).squeeze(), 47 | 'actions': torch.Tensor(actions).squeeze(), 48 | 'rewards': torch.Tensor(rewards).squeeze(), 49 | 'dones': torch.Tensor(dones).squeeze()} 50 | 51 | torch.save(data, args.datafile) 52 | -------------------------------------------------------------------------------- /ppo/configs/ppo_drone_noisy.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.ppo 2 | import residual_shared_autonomy.drone_sim 3 | import residual_shared_autonomy.imitation_learning 4 | 5 | train.algorithm = @ConstrainedResidualPPO 6 | train.maxt = 100000000 7 | train.seed = 0 8 | train.eval = True 9 | train.eval_period = 1000000 10 | train.save_period = 1000000 11 | train.maxseconds = None 12 | 13 | optim.Adam.lr = 0.001 14 | optim.Adam.betas = (0.9, 0.999) 15 | optim.Adam.eps = 1e-5 16 | 17 | ConstrainedResidualPPO.policy_training_start = 500000 18 | ConstrainedResidualPPO.lambda_training_start = 5000000 19 | ConstrainedResidualPPO.lambda_lr = 0.001 20 | ConstrainedResidualPPO.lambda_init = 10 21 | ConstrainedResidualPPO.lr_decay_rate = 0.31622776601 22 | ConstrainedResidualPPO.lr_decay_freq = 20000000 23 | ConstrainedResidualPPO.l2_reg = False 24 | ConstrainedResidualPPO.reward_threshold = -0.875 25 | ConstrainedResidualPPO.env_fn = @make_env 26 | ConstrainedResidualPPO.policy_fn = @ppo_policy_fn 27 | ConstrainedResidualPPO.nenv = 64 28 | ConstrainedResidualPPO.eval_num_episodes = 100 29 | ConstrainedResidualPPO.record_num_episodes = 0 30 | ConstrainedResidualPPO.rollout_length = 1024 31 | ConstrainedResidualPPO.batch_size = 512 32 | ConstrainedResidualPPO.gamma = 0.99 33 | ConstrainedResidualPPO.lambda_ = 0.95 34 | ConstrainedResidualPPO.norm_advantages = True 35 | ConstrainedResidualPPO.optimizer = @optim.Adam 36 | ConstrainedResidualPPO.clip_param = 0.2 37 | ConstrainedResidualPPO.epochs_per_rollout = 4 38 | ConstrainedResidualPPO.max_grad_norm = 0.5 39 | ConstrainedResidualPPO.ent_coef = 0.01 40 | ConstrainedResidualPPO.vf_coef = 0.5 41 | ConstrainedResidualPPO.gpu = True 42 | ConstrainedResidualPPO.base_actor_cls = @NoisyActor 43 | 44 | NoisyActor.actor_cls = @DroneReacherActor 45 | NoisyActor.eps = 0.5 46 | DroneReacherActor.device = 'cuda:0' 47 | DroneReacherActor.logdir = './models/drone_reacher_agent' 48 | 49 | 50 | Checkpointer.ckpt_period = 100000 51 | 52 | make_env.env_id = "DroneStabilizer-v0" 53 | make_env.norm_observations = True 54 | 55 | VecObsNormWrapper.steps = 10000 56 | VecObsNormWrapper.mean = @drone_bc_mean() 57 | VecObsNormWrapper.std = @drone_bc_std() 58 | VecObsNormWrapper.eps = 1e-2 59 | VecObsNormWrapper.log = True 60 | VecObsNormWrapper.log_prob = 0.01 61 | 62 | # Model Params 63 | ppo_policy_fn.nunits = 128 64 | DiagGaussian.constant_log_std = False 65 | -------------------------------------------------------------------------------- /ppo/configs/ppo_drone_laggy.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.ppo 2 | import residual_shared_autonomy.drone_sim 3 | import residual_shared_autonomy.imitation_learning 4 | 5 | train.algorithm = @ConstrainedResidualPPO 6 | train.maxt = 100000000 7 | train.seed = 0 8 | train.eval = True 9 | 
train.eval_period = 1000000 10 | train.save_period = 1000000 11 | train.maxseconds = None 12 | 13 | optim.Adam.lr = 0.001 14 | optim.Adam.betas = (0.9, 0.999) 15 | optim.Adam.eps = 1e-5 16 | 17 | ConstrainedResidualPPO.policy_training_start = 500000 18 | ConstrainedResidualPPO.lambda_training_start = 5000000 19 | ConstrainedResidualPPO.lambda_lr = 0.001 20 | ConstrainedResidualPPO.lambda_init = 10 21 | ConstrainedResidualPPO.lr_decay_rate = 0.31622776601 22 | ConstrainedResidualPPO.lr_decay_freq = 20000000 23 | ConstrainedResidualPPO.l2_reg = False 24 | ConstrainedResidualPPO.reward_threshold = -0.875 25 | ConstrainedResidualPPO.env_fn = @make_env 26 | ConstrainedResidualPPO.policy_fn = @ppo_policy_fn 27 | ConstrainedResidualPPO.nenv = 64 28 | ConstrainedResidualPPO.eval_num_episodes = 100 29 | ConstrainedResidualPPO.record_num_episodes = 0 30 | ConstrainedResidualPPO.rollout_length = 1024 31 | ConstrainedResidualPPO.batch_size = 512 32 | ConstrainedResidualPPO.gamma = 0.99 33 | ConstrainedResidualPPO.lambda_ = 0.95 34 | ConstrainedResidualPPO.norm_advantages = True 35 | ConstrainedResidualPPO.optimizer = @optim.Adam 36 | ConstrainedResidualPPO.clip_param = 0.2 37 | ConstrainedResidualPPO.epochs_per_rollout = 4 38 | ConstrainedResidualPPO.max_grad_norm = 0.5 39 | ConstrainedResidualPPO.ent_coef = 0.01 40 | ConstrainedResidualPPO.vf_coef = 0.5 41 | ConstrainedResidualPPO.gpu = True 42 | ConstrainedResidualPPO.base_actor_cls = @LaggyActor 43 | 44 | LaggyActor.actor_cls = @DroneReacherActor 45 | LaggyActor.repeat_prob = 0.8 46 | DroneReacherActor.device = 'cuda:0' 47 | DroneReacherActor.logdir = './models/drone_reacher_agent' 48 | 49 | 50 | Checkpointer.ckpt_period = 100000 51 | 52 | make_env.env_id = "DroneStabilizer-v0" 53 | make_env.norm_observations = True 54 | 55 | VecObsNormWrapper.steps = 10000 56 | VecObsNormWrapper.mean = @drone_bc_mean() 57 | VecObsNormWrapper.std = @drone_bc_std() 58 | VecObsNormWrapper.eps = 1e-2 59 | VecObsNormWrapper.log = True 60 | VecObsNormWrapper.log_prob = 0.01 61 | 62 | # Model Params 63 | ppo_policy_fn.nunits = 128 64 | DiagGaussian.constant_log_std = False 65 | -------------------------------------------------------------------------------- /ppo/configs/ppo_drone.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.ppo 2 | import residual_shared_autonomy.drone_sim 3 | import residual_shared_autonomy.imitation_learning 4 | 5 | train.algorithm = @ConstrainedResidualPPO 6 | train.maxt = 100000000 7 | train.seed = 0 8 | train.eval = True 9 | train.eval_period = 1000000 10 | train.save_period = 1000000 11 | train.maxseconds = None 12 | 13 | optim.Adam.lr = 0.001 14 | optim.Adam.betas = (0.9, 0.999) 15 | optim.Adam.eps = 1e-5 16 | 17 | ConstrainedResidualPPO.policy_training_start = 500000 18 | ConstrainedResidualPPO.lambda_training_start = 5000000 19 | ConstrainedResidualPPO.lambda_lr = 0.001 20 | ConstrainedResidualPPO.lambda_init = 10 21 | ConstrainedResidualPPO.lr_decay_rate = 0.31622776601 22 | ConstrainedResidualPPO.lr_decay_freq = 20000000 23 | ConstrainedResidualPPO.l2_reg = False 24 | ConstrainedResidualPPO.reward_threshold = -0.875 25 | ConstrainedResidualPPO.env_fn = @make_env 26 | ConstrainedResidualPPO.policy_fn = @ppo_policy_fn 27 | ConstrainedResidualPPO.nenv = 64 28 | ConstrainedResidualPPO.eval_num_episodes = 100 29 | ConstrainedResidualPPO.record_num_episodes = 0 30 | ConstrainedResidualPPO.rollout_length = 1024 31 | ConstrainedResidualPPO.batch_size = 512 32 | 
ConstrainedResidualPPO.gamma = 0.99 33 | ConstrainedResidualPPO.lambda_ = 0.95 34 | ConstrainedResidualPPO.norm_advantages = True 35 | ConstrainedResidualPPO.optimizer = @optim.Adam 36 | ConstrainedResidualPPO.clip_param = 0.2 37 | ConstrainedResidualPPO.epochs_per_rollout = 4 38 | ConstrainedResidualPPO.max_grad_norm = 0.5 39 | ConstrainedResidualPPO.ent_coef = 0.01 40 | ConstrainedResidualPPO.vf_coef = 0.5 41 | ConstrainedResidualPPO.gpu = True 42 | ConstrainedResidualPPO.base_actor_cls = @BCMultiActor 43 | 44 | BCMultiActor.logdir = "./models/behavioral_cloning_agents_drone" 45 | BCMultiActor.device = "cuda:0" 46 | BCMultiActor.switch_prob = 0.001 47 | 48 | BCNet.ob_shape = 15 49 | BCNet.action_shape = 4 50 | BCNet.nunits = 128 51 | 52 | 53 | Checkpointer.ckpt_period = 100000 54 | 55 | make_env.env_id = "DroneStabilizer-v0" 56 | make_env.norm_observations = True 57 | 58 | VecObsNormWrapper.steps = 10000 59 | VecObsNormWrapper.mean = @drone_bc_mean() 60 | VecObsNormWrapper.std = @drone_bc_std() 61 | VecObsNormWrapper.eps = 1e-2 62 | VecObsNormWrapper.log = True 63 | VecObsNormWrapper.log_prob = 0.01 64 | 65 | # Model Params 66 | ppo_policy_fn.nunits = 128 67 | DiagGaussian.constant_log_std = False 68 | -------------------------------------------------------------------------------- /ppo/configs/ppo_lunar_noisy.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.ppo 2 | import residual_shared_autonomy.imitation_learning 3 | import residual_shared_autonomy.lunar_lander 4 | 5 | train.algorithm = @ConstrainedResidualPPO 6 | train.maxt = 100000000 7 | train.seed = 0 8 | train.eval = True 9 | train.eval_period = 1000000 10 | train.save_period = 1000000 11 | train.maxseconds = None 12 | 13 | optim.Adam.lr = 0.001 14 | optim.Adam.betas = (0.9, 0.999) 15 | optim.Adam.eps = 1e-5 16 | 17 | ConstrainedResidualPPO.policy_training_start = 100000 18 | ConstrainedResidualPPO.lambda_training_start = 2000000 19 | ConstrainedResidualPPO.lambda_lr = 0.001 20 | ConstrainedResidualPPO.lambda_init = 10 21 | ConstrainedResidualPPO.lr_decay_rate = 0.31622776601 22 | ConstrainedResidualPPO.lr_decay_freq = 20000000 23 | ConstrainedResidualPPO.l2_reg = False 24 | ConstrainedResidualPPO.reward_threshold = -50.0 25 | ConstrainedResidualPPO.env_fn = @make_env 26 | ConstrainedResidualPPO.policy_fn = @ppo_policy_fn 27 | ConstrainedResidualPPO.nenv = 32 28 | ConstrainedResidualPPO.eval_num_episodes = 100 29 | ConstrainedResidualPPO.record_num_episodes = 0 30 | ConstrainedResidualPPO.rollout_length = 1024 31 | ConstrainedResidualPPO.batch_size = 256 32 | ConstrainedResidualPPO.gamma = 0.99 33 | ConstrainedResidualPPO.lambda_ = 0.95 34 | ConstrainedResidualPPO.norm_advantages = True 35 | ConstrainedResidualPPO.optimizer = @optim.Adam 36 | ConstrainedResidualPPO.clip_param = 0.2 37 | ConstrainedResidualPPO.epochs_per_rollout = 4 38 | ConstrainedResidualPPO.max_grad_norm = 0.5 39 | ConstrainedResidualPPO.ent_coef = 0.01 40 | ConstrainedResidualPPO.vf_coef = 0.5 41 | ConstrainedResidualPPO.gpu = True 42 | ConstrainedResidualPPO.base_actor_cls = @NoisyActor 43 | ConstrainedResidualPPO.wrapper_fn = @lunar_lander_fuel_wrapper 44 | 45 | NoisyActor.actor_cls = @LunarLanderActor 46 | NoisyActor.eps = 0.5 47 | LunarLanderActor.logdir = "./models/lunar_lander_agent" 48 | LunarLanderActor.device = "cuda:0" 49 | 50 | Checkpointer.ckpt_period = 100000 51 | 52 | make_env.env_id = "LunarLanderRandomContinuousNoGoal-v2" 53 | make_env.norm_observations = True 54 | 55 | 
VecObsNormWrapper.steps = 10000 56 | VecObsNormWrapper.mean = @bc_mean() 57 | VecObsNormWrapper.std = @bc_std() 58 | VecObsNormWrapper.eps = 1e-2 59 | VecObsNormWrapper.log = True 60 | VecObsNormWrapper.log_prob = 0.01 61 | 62 | # Model Params 63 | ppo_policy_fn.nunits = 128 64 | DiagGaussian.constant_log_std = False 65 | -------------------------------------------------------------------------------- /ppo/configs/ppo_lunar_laggy.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.ppo 2 | import residual_shared_autonomy.imitation_learning 3 | import residual_shared_autonomy.lunar_lander 4 | 5 | train.algorithm = @ConstrainedResidualPPO 6 | train.maxt = 100000000 7 | train.seed = 0 8 | train.eval = True 9 | train.eval_period = 1000000 10 | train.save_period = 1000000 11 | train.maxseconds = None 12 | 13 | optim.Adam.lr = 0.001 14 | optim.Adam.betas = (0.9, 0.999) 15 | optim.Adam.eps = 1e-5 16 | 17 | ConstrainedResidualPPO.policy_training_start = 100000 18 | ConstrainedResidualPPO.lambda_training_start = 2000000 19 | ConstrainedResidualPPO.lambda_lr = 0.001 20 | ConstrainedResidualPPO.lambda_init = 10 21 | ConstrainedResidualPPO.lr_decay_rate = 0.31622776601 22 | ConstrainedResidualPPO.lr_decay_freq = 20000000 23 | ConstrainedResidualPPO.l2_reg = False 24 | ConstrainedResidualPPO.reward_threshold = -40.0 25 | ConstrainedResidualPPO.env_fn = @make_env 26 | ConstrainedResidualPPO.policy_fn = @ppo_policy_fn 27 | ConstrainedResidualPPO.nenv = 32 28 | ConstrainedResidualPPO.eval_num_episodes = 100 29 | ConstrainedResidualPPO.record_num_episodes = 0 30 | ConstrainedResidualPPO.rollout_length = 1024 31 | ConstrainedResidualPPO.batch_size = 256 32 | ConstrainedResidualPPO.gamma = 0.99 33 | ConstrainedResidualPPO.lambda_ = 0.95 34 | ConstrainedResidualPPO.norm_advantages = True 35 | ConstrainedResidualPPO.optimizer = @optim.Adam 36 | ConstrainedResidualPPO.clip_param = 0.2 37 | ConstrainedResidualPPO.epochs_per_rollout = 4 38 | ConstrainedResidualPPO.max_grad_norm = 0.5 39 | ConstrainedResidualPPO.ent_coef = 0.01 40 | ConstrainedResidualPPO.vf_coef = 0.5 41 | ConstrainedResidualPPO.gpu = True 42 | ConstrainedResidualPPO.base_actor_cls = @LaggyActor 43 | ConstrainedResidualPPO.wrapper_fn = @lunar_lander_fuel_wrapper 44 | 45 | LaggyActor.actor_cls = @LunarLanderActor 46 | LaggyActor.repeat_prob = 0.8 47 | LunarLanderActor.logdir = "./models/lunar_lander_agent" 48 | LunarLanderActor.device = "cuda:0" 49 | 50 | Checkpointer.ckpt_period = 100000 51 | 52 | make_env.env_id = "LunarLanderRandomContinuousNoGoal-v2" 53 | make_env.norm_observations = True 54 | 55 | VecObsNormWrapper.steps = 10000 56 | VecObsNormWrapper.mean = @bc_mean() 57 | VecObsNormWrapper.std = @bc_std() 58 | VecObsNormWrapper.eps = 1e-2 59 | VecObsNormWrapper.log = True 60 | VecObsNormWrapper.log_prob = 0.01 61 | 62 | # Model Params 63 | ppo_policy_fn.nunits = 128 64 | DiagGaussian.constant_log_std = False 65 | -------------------------------------------------------------------------------- /ppo/configs/ppo_lunar.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.ppo 2 | import residual_shared_autonomy.imitation_learning 3 | import residual_shared_autonomy.lunar_lander 4 | 5 | train.algorithm = @ConstrainedResidualPPO 6 | train.maxt = 100000000 7 | train.seed = 0 8 | train.eval = True 9 | train.eval_period = 1000000 10 | train.save_period = 1000000 11 | train.maxseconds = None 12 | 13 | optim.Adam.lr 
= 0.001 14 | optim.Adam.betas = (0.9, 0.999) 15 | optim.Adam.eps = 1e-5 16 | 17 | ConstrainedResidualPPO.policy_training_start = 100000 18 | ConstrainedResidualPPO.lambda_training_start = 2000000 19 | ConstrainedResidualPPO.lambda_lr = 0.003 20 | ConstrainedResidualPPO.lambda_init = 20 21 | ConstrainedResidualPPO.lr_decay_rate = 0.31622776601 22 | ConstrainedResidualPPO.lr_decay_freq = 20000000 23 | ConstrainedResidualPPO.l2_reg = False 24 | ConstrainedResidualPPO.reward_threshold = -155 25 | ConstrainedResidualPPO.env_fn = @make_env 26 | ConstrainedResidualPPO.policy_fn = @ppo_policy_fn 27 | ConstrainedResidualPPO.nenv = 32 28 | ConstrainedResidualPPO.eval_num_episodes = 100 29 | ConstrainedResidualPPO.record_num_episodes = 0 30 | ConstrainedResidualPPO.rollout_length = 1024 31 | ConstrainedResidualPPO.batch_size = 256 32 | ConstrainedResidualPPO.gamma = 0.99 33 | ConstrainedResidualPPO.lambda_ = 0.95 34 | ConstrainedResidualPPO.norm_advantages = True 35 | ConstrainedResidualPPO.optimizer = @optim.Adam 36 | ConstrainedResidualPPO.clip_param = 0.2 37 | ConstrainedResidualPPO.epochs_per_rollout = 4 38 | ConstrainedResidualPPO.max_grad_norm = 0.5 39 | ConstrainedResidualPPO.ent_coef = 0.01 40 | ConstrainedResidualPPO.vf_coef = 0.5 41 | ConstrainedResidualPPO.gpu = True 42 | ConstrainedResidualPPO.base_actor_cls = @BCMultiActor 43 | ConstrainedResidualPPO.wrapper_fn = @lunar_lander_fuel_wrapper 44 | 45 | BCMultiActor.logdir = "./models/behavioral_cloning_agents_lunar" 46 | BCMultiActor.device = "cuda:0" 47 | BCMultiActor.switch_prob = 0.001 48 | 49 | BCNet.ob_shape = 9 50 | BCNet.action_shape = 2 51 | BCNet.nunits = 128 52 | 53 | 54 | Checkpointer.ckpt_period = 100000 55 | 56 | make_env.env_id = "LunarLanderRandomContinuousNoGoal-v2" 57 | make_env.norm_observations = True 58 | 59 | VecObsNormWrapper.steps = 10000 60 | VecObsNormWrapper.mean = @bc_mean() 61 | VecObsNormWrapper.std = @bc_std() 62 | VecObsNormWrapper.eps = 1e-2 63 | VecObsNormWrapper.log = True 64 | VecObsNormWrapper.log_prob = 0.01 65 | 66 | # Model Params 67 | ppo_policy_fn.nunits = 128 68 | DiagGaussian.constant_log_std = False 69 | -------------------------------------------------------------------------------- /drone_sim/joystick_agent.py: -------------------------------------------------------------------------------- 1 | """Drone Joystick Agent.""" 2 | import pygame 3 | import numpy as np 4 | import time 5 | 6 | 7 | ##################################### 8 | # Change these to match your joystick 9 | THRUST_AXIS = 1 10 | ROLL_AXIS = 3 11 | PITCH_AXIS = 4 12 | YAW_AXIS = 0 13 | ##################################### 14 | 15 | 16 | class DroneJoystickActor(object): 17 | """Joystick Controller for Drone tasks. 18 | 19 | The left stick controls upward force and yaw. 20 | The right stick controls roll and pitch. 21 | """ 22 | 23 | def __init__(self, env, fps=50): 24 | """Init.""" 25 | if env.num_envs > 1: 26 | raise ValueError("Only one env can be controlled with the joystick") 27 | self.env = env 28 | self.human_agent_action = np.zeros((1, 4), dtype=np.float32) 29 | pygame.joystick.init() 30 | joysticks = [pygame.joystick.Joystick(x) 31 | for x in range(pygame.joystick.get_count())] 32 | if len(joysticks) != 1: 33 | raise ValueError("There must be exactly 1 joystick connected. "
34 | f"Found {len(joysticks)}") 35 | self.joy = joysticks[0] 36 | self.joy.init() 37 | pygame.init() 38 | self.axes = [THRUST_AXIS, ROLL_AXIS, PITCH_AXIS, YAW_AXIS] 39 | self.t = None 40 | self.fps = fps 41 | 42 | def _get_human_action(self): 43 | for event in pygame.event.get(): 44 | if event.type == pygame.JOYAXISMOTION: 45 | for i, ax in enumerate(self.axes): 46 | if event.axis == ax: 47 | self.human_agent_action[0, i] = event.value 48 | break 49 | return self.human_agent_action 50 | 51 | def __call__(self, ob): 52 | """Act.""" 53 | if self.t and (time.time() - self.t) < 1. / self.fps: 54 | st = 1. / self.fps - (time.time() - self.t) 55 | if st > 0.: 56 | time.sleep(st) 57 | self.t = time.time() 58 | self.env.render() 59 | action = self._get_human_action() 60 | return action 61 | 62 | def reset(self): 63 | self.human_agent_action[:] = 0. 64 | 65 | 66 | if __name__ == '__main__': 67 | import gym 68 | import residual_shared_autonomy.drone_sim 69 | from dl.rl import ensure_vec_env 70 | 71 | env = gym.make("DroneReacherHuman-v0") 72 | env = ensure_vec_env(env) 73 | 74 | actor = DroneJoystickActor(env) 75 | 76 | for _ in range(10): 77 | ob = env.reset() 78 | actor.reset() 79 | env.render() 80 | done = False 81 | reward = 0.0 82 | 83 | while not done: 84 | ob, r, done, _ = env.step(actor(ob)) 85 | reward += r 86 | print(reward) 87 | -------------------------------------------------------------------------------- /lunar_lander/joystick_agent.py: -------------------------------------------------------------------------------- 1 | """LunarLander Joystick Agent.""" 2 | import pygame 3 | import numpy as np 4 | import time 5 | 6 | ##################################### 7 | # Change these to match your joystick 8 | UP_AXIS = 4 9 | SIDE_AXIS = 3 10 | ##################################### 11 | 12 | 13 | class LunarLanderJoystickActor(object): 14 | """Joystick Controller for Lunar Lander.""" 15 | 16 | def __init__(self, env, fps=50): 17 | """Init.""" 18 | if env.num_envs > 1: 19 | raise ValueError("Only one env can be controlled with the joystick.") 20 | self.env = env 21 | self.human_agent_action = np.array([[0., 0.]], dtype=np.float32) # noop 22 | pygame.joystick.init() 23 | joysticks = [pygame.joystick.Joystick(x) 24 | for x in range(pygame.joystick.get_count())] 25 | if len(joysticks) != 1: 26 | raise ValueError("There must be exactly 1 joystick connected." 27 | f"Found {len(joysticks)}") 28 | self.joy = joysticks[0] 29 | self.joy.init() 30 | pygame.init() 31 | self.t = None 32 | self.fps = fps 33 | 34 | def _get_human_action(self): 35 | for event in pygame.event.get(): 36 | if event.type == pygame.JOYAXISMOTION: 37 | if event.axis == SIDE_AXIS: 38 | self.human_agent_action[0, 1] = event.value 39 | elif event.axis == UP_AXIS: 40 | self.human_agent_action[0, 0] = -1.0 * event.value 41 | if abs(self.human_agent_action[0, 0]) < 0.1: 42 | self.human_agent_action[0, 0] = 0.0 43 | return self.human_agent_action 44 | 45 | def __call__(self, ob): 46 | """Act.""" 47 | self.env.render() 48 | action = self._get_human_action() 49 | if self.t and (time.time() - self.t) < 1. / self.fps: 50 | st = 1. / self.fps - (time.time() - self.t) 51 | if st > 0.: 52 | time.sleep(st) 53 | self.t = time.time() 54 | return action 55 | 56 | def reset(self): 57 | self.human_agent_action[:] = 0. 
58 | 59 | 60 | if __name__ == '__main__': 61 | import gym 62 | import residual_shared_autonomy.lunar_lander 63 | from dl.rl import ensure_vec_env 64 | import argparse 65 | 66 | parser = argparse.ArgumentParser(description='play') 67 | parser.add_argument('--reacher', action='store_true', help='play lunar reacher') 68 | args = parser.parse_args() 69 | 70 | 71 | if args.reacher: 72 | env = gym.make("LunarLanderReacher-v2") 73 | else: 74 | env = gym.make("LunarLanderRandomContinuous-v2") 75 | env = ensure_vec_env(env) 76 | 77 | actor = LunarLanderJoystickActor(env) 78 | 79 | for _ in range(10): 80 | ob = env.reset() 81 | env.render() 82 | done = False 83 | reward = 0.0 84 | 85 | while not done: 86 | ob, r, done, _ = env.step(actor(ob)) 87 | reward += r 88 | print(reward) 89 | -------------------------------------------------------------------------------- /lunar_lander/fuel_wrapper.py: -------------------------------------------------------------------------------- 1 | from baselines.common.vec_env import VecEnvWrapper 2 | from residual_shared_autonomy import ResidualWrapper 3 | from residual_shared_autonomy.lunar_lander import env_lander 4 | import numpy as np 5 | import gin 6 | 7 | 8 | class LunarLanderFuelWrapper(VecEnvWrapper): 9 | """Wrap the ResidualWrapper to modify fuel cost for the co-pilot. 10 | 11 | Don't make the co-pilot pay for fuel that the pilot wants to use. 12 | """ 13 | 14 | def __init__(self, venv): 15 | assert isinstance(venv, ResidualWrapper) 16 | super().__init__(venv) 17 | 18 | def step(self, actions): 19 | pilot_actions = self.venv._action 20 | applied_actions = self.venv._add_actions(np.asarray(actions), 21 | pilot_actions) 22 | m_pow_pilot, s_pow_pilot = self._get_power_from_action(pilot_actions) 23 | m_pow, s_pow = self._get_power_from_action(applied_actions) 24 | 25 | obs, rews, dones, infos = self.venv.step(actions) 26 | 27 | fuel_diff = self._fuel_cost(m_pow - m_pow_pilot, s_pow - s_pow_pilot) 28 | 29 | # print(fuel_diff.shape, rews.shape) 30 | # print(actions, pilot_actions) 31 | # print(fuel_diff, m_pow, m_pow_pilot, s_pow, s_pow_pilot) 32 | # print(lunar_lander_gym.FUEL_COST_MAIN) 33 | 34 | # Don't incentivise co-pilot for reducing the fuel costs of pilot. 35 | # The co pilot is only incentivised to not add extra fuel costs. 36 | rews = rews + np.minimum(0.0, fuel_diff) 37 | return obs, rews, dones, infos 38 | 39 | def _get_power_from_action(self, ac): 40 | main, side = ac[:, 0], ac[:, 1] 41 | main_on = main > 0. 
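        # The thresholds and scaling below mirror the engine model of Gym's
        # continuous LunarLander: the main engine fires only when main > 0,
        # with power scaled into [0.5, 1.0], and a side engine fires only when
        # |side| > 0.5, with power clipped into [0.5, 1.0].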
42 | side_on = np.abs(side) > 0.5 43 | m_power = main_on * (np.clip(main, 0.0, 1.0) + 1.0) * 0.5 44 | s_power = side_on * np.clip(np.abs(side), 0.5, 1.0) 45 | return m_power, s_power 46 | 47 | def _fuel_cost(self, main_pow, side_pow): 48 | return (env_lander.FUEL_COST_MAIN * main_pow 49 | + env_lander.FUEL_COST_SIDE * side_pow) 50 | 51 | def reset(self): 52 | """Reset.""" 53 | return self.venv.reset() 54 | 55 | def step_wait(self): 56 | """Step wait.""" 57 | return self.venv.step_wait() 58 | 59 | 60 | @gin.configurable 61 | def lunar_lander_fuel_wrapper(env): 62 | return LunarLanderFuelWrapper(env) 63 | 64 | 65 | if __name__ == '__main__': 66 | import unittest 67 | import gym 68 | from dl.rl import ensure_vec_env, VecFrameStack 69 | 70 | class RandomActor(object): 71 | """Output random actions.""" 72 | 73 | def __init__(self, action_space): 74 | """Init.""" 75 | self.action_space = action_space 76 | 77 | def __call__(self, ob): 78 | """Act.""" 79 | batch_size = ob.shape[0] 80 | return np.asarray([self.action_space.sample() 81 | for _ in range(batch_size)]) 82 | 83 | class Test(unittest.TestCase): 84 | """Tests.""" 85 | 86 | def test_vec_env_wrapper(self): 87 | """Test.""" 88 | env = gym.make("LunarLanderContinuous-v2") 89 | env = ensure_vec_env(env) 90 | actor = RandomActor(env.action_space) 91 | env = ResidualWrapper(env, actor) 92 | env = LunarLanderFuelWrapper(env) 93 | env = VecFrameStack(env, 4) 94 | ob, ac = env.reset() 95 | assert ac.shape == (1, 4*env.action_space.shape[0]) 96 | assert ob.shape == (1, *env.observation_space.spaces[0].shape) 97 | assert ac.shape == (1, *env.observation_space.spaces[1].shape) 98 | 99 | for _ in range(10): 100 | residual_ac = [env.action_space.sample()] 101 | (ob, ac_next), _, _, infos = env.step(residual_ac) 102 | rac = np.minimum(np.maximum(residual_ac[0] + ac[0][-2:], -1), 1) 103 | assert np.allclose(infos[0]['action'], rac) 104 | ac = ac_next 105 | 106 | unittest.main() 107 | -------------------------------------------------------------------------------- /ppo/models/pretrained_drone/config.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.drone_sim 2 | import residual_shared_autonomy.imitation_learning 3 | import residual_shared_autonomy.ppo 4 | 5 | # Parameters for Adam: 6 | # ============================================================================== 7 | Adam.amsgrad = False 8 | Adam.betas = (0.9, 0.999) 9 | Adam.eps = 1e-05 10 | Adam.lr = 0.001 11 | Adam.weight_decay = 0 12 | 13 | # Parameters for BCMultiActor: 14 | # ============================================================================== 15 | BCMultiActor.device = 'cuda:0' 16 | BCMultiActor.logdir = \ 17 | '/home-nfs/cbschaff/projects/residual_shared_autonomy/ppo/bc_agents_drone' 18 | BCMultiActor.switch_prob = 0.001 19 | 20 | # Parameters for BCNet: 21 | # ============================================================================== 22 | BCNet.action_shape = 4 23 | BCNet.nunits = 128 24 | BCNet.ob_shape = 15 25 | 26 | # Parameters for Checkpointer: 27 | # ============================================================================== 28 | Checkpointer.ckpt_period = 100000 29 | Checkpointer.format = '{:09d}' 30 | 31 | # Parameters for ConstrainedResidualPPO: 32 | # ============================================================================== 33 | ConstrainedResidualPPO.base_actor_cls = @BCMultiActor 34 | ConstrainedResidualPPO.batch_size = 512 35 | ConstrainedResidualPPO.clip_param = 0.2 36 | 
ConstrainedResidualPPO.ent_coef = 0.01 37 | ConstrainedResidualPPO.env_fn = @make_env 38 | ConstrainedResidualPPO.epochs_per_rollout = 4 39 | ConstrainedResidualPPO.eval_num_episodes = 100 40 | ConstrainedResidualPPO.gamma = 0.99 41 | ConstrainedResidualPPO.gpu = True 42 | ConstrainedResidualPPO.l2_reg = False 43 | ConstrainedResidualPPO.lambda_ = 0.95 44 | ConstrainedResidualPPO.lambda_init = 10.0 45 | ConstrainedResidualPPO.lambda_lr = 0.001 46 | ConstrainedResidualPPO.lambda_training_start = 5000000 47 | ConstrainedResidualPPO.lr_decay_freq = 20000000 48 | ConstrainedResidualPPO.lr_decay_rate = 0.31622776601 49 | ConstrainedResidualPPO.max_grad_norm = 0.5 50 | ConstrainedResidualPPO.nenv = 64 51 | ConstrainedResidualPPO.norm_advantages = True 52 | ConstrainedResidualPPO.optimizer = @optim.Adam 53 | ConstrainedResidualPPO.policy_fn = @ppo_policy_fn 54 | ConstrainedResidualPPO.policy_training_start = 500000 55 | ConstrainedResidualPPO.record_num_episodes = 5 56 | ConstrainedResidualPPO.reward_threshold = -0.875 57 | ConstrainedResidualPPO.rollout_length = 1024 58 | ConstrainedResidualPPO.vf_coef = 0.5 59 | ConstrainedResidualPPO.wrapper_fn = None 60 | 61 | # Parameters for DiagGaussian: 62 | # ============================================================================== 63 | DiagGaussian.constant_log_std = False 64 | DiagGaussian.log_std_max = 2 65 | DiagGaussian.log_std_min = -20 66 | 67 | # Parameters for drone_bc_mean: 68 | # ============================================================================== 69 | # None. 70 | 71 | # Parameters for drone_bc_std: 72 | # ============================================================================== 73 | # None. 74 | 75 | # Parameters for make_env: 76 | # ============================================================================== 77 | make_env.env_id = 'DroneStabilizer-v0' 78 | make_env.norm_observations = True 79 | make_env.seed = 0 80 | 81 | # Parameters for Policy: 82 | # ============================================================================== 83 | # None. 
84 | 85 | # Parameters for ppo_policy_fn: 86 | # ============================================================================== 87 | ppo_policy_fn.nunits = 128 88 | 89 | # Parameters for train: 90 | # ============================================================================== 91 | train.algorithm = @ConstrainedResidualPPO 92 | train.eval = True 93 | train.eval_period = 1000000 94 | train.maxseconds = None 95 | train.maxt = 100000000 96 | train.save_period = 1000000 97 | train.seed = 0 98 | 99 | # Parameters for VecObsNormWrapper: 100 | # ============================================================================== 101 | VecObsNormWrapper.eps = 0.01 102 | VecObsNormWrapper.log = True 103 | VecObsNormWrapper.log_prob = 0.01 104 | VecObsNormWrapper.mean = @drone_bc_mean() 105 | VecObsNormWrapper.std = @drone_bc_std() 106 | VecObsNormWrapper.steps = 10000 107 | -------------------------------------------------------------------------------- /ppo/models/pretrained_lunar/config.gin: -------------------------------------------------------------------------------- 1 | import residual_shared_autonomy.lunar_lander 2 | import residual_shared_autonomy.imitation_learning 3 | import residual_shared_autonomy.ppo 4 | 5 | # Parameters for Adam: 6 | # ============================================================================== 7 | Adam.amsgrad = False 8 | Adam.betas = (0.9, 0.999) 9 | Adam.eps = 1e-05 10 | Adam.lr = 0.001 11 | Adam.weight_decay = 0 12 | 13 | # Parameters for bc_mean: 14 | # ============================================================================== 15 | # None. 16 | 17 | # Parameters for bc_std: 18 | # ============================================================================== 19 | # None. 20 | 21 | # Parameters for BCMultiActor: 22 | # ============================================================================== 23 | BCMultiActor.device = 'cuda:0' 24 | BCMultiActor.logdir = \ 25 | '/home-nfs/cbschaff/projects/residual_shared_autonomy/ppo/bc_agents_no_terrain' 26 | BCMultiActor.switch_prob = 0.001 27 | 28 | # Parameters for BCNet: 29 | # ============================================================================== 30 | BCNet.action_shape = 2 31 | BCNet.nunits = 128 32 | BCNet.ob_shape = 9 33 | 34 | # Parameters for Checkpointer: 35 | # ============================================================================== 36 | Checkpointer.ckpt_period = 100000 37 | Checkpointer.format = '{:09d}' 38 | 39 | # Parameters for ConstrainedResidualPPO: 40 | # ============================================================================== 41 | ConstrainedResidualPPO.base_actor_cls = @BCMultiActor 42 | ConstrainedResidualPPO.batch_size = 256 43 | ConstrainedResidualPPO.clip_param = 0.2 44 | ConstrainedResidualPPO.ent_coef = 0.01 45 | ConstrainedResidualPPO.env_fn = @make_env 46 | ConstrainedResidualPPO.epochs_per_rollout = 4 47 | ConstrainedResidualPPO.eval_num_episodes = 100 48 | ConstrainedResidualPPO.gamma = 0.99 49 | ConstrainedResidualPPO.gpu = True 50 | ConstrainedResidualPPO.l2_reg = False 51 | ConstrainedResidualPPO.lambda_ = 0.95 52 | ConstrainedResidualPPO.lambda_init = 20.0 53 | ConstrainedResidualPPO.lambda_lr = 0.003 54 | ConstrainedResidualPPO.lambda_training_start = 2000000 55 | ConstrainedResidualPPO.lr_decay_freq = 20000000 56 | ConstrainedResidualPPO.lr_decay_rate = 0.31622776601 57 | ConstrainedResidualPPO.max_grad_norm = 0.5 58 | ConstrainedResidualPPO.nenv = 32 59 | ConstrainedResidualPPO.norm_advantages = True 60 | ConstrainedResidualPPO.optimizer = @optim.Adam 61 | 
ConstrainedResidualPPO.policy_fn = @ppo_policy_fn 62 | ConstrainedResidualPPO.policy_training_start = 100000 63 | ConstrainedResidualPPO.record_num_episodes = 5 64 | ConstrainedResidualPPO.reward_threshold = -155.0 65 | ConstrainedResidualPPO.vf_coef = 0.5 66 | ConstrainedResidualPPO.wrapper_fn = @lunar_lander_fuel_wrapper 67 | 68 | # Parameters for DiagGaussian: 69 | # ============================================================================== 70 | DiagGaussian.constant_log_std = False 71 | DiagGaussian.log_std_max = 2 72 | DiagGaussian.log_std_min = -20 73 | 74 | # Parameters for lunar_lander_fuel_wrapper: 75 | # ============================================================================== 76 | # None. 77 | 78 | # Parameters for make_env: 79 | # ============================================================================== 80 | make_env.env_id = 'LunarLanderRandomContinuousNoGoal-v2' 81 | make_env.norm_observations = True 82 | make_env.seed = 0 83 | 84 | # Parameters for Policy: 85 | # ============================================================================== 86 | # None. 87 | 88 | # Parameters for ppo_policy_fn: 89 | # ============================================================================== 90 | ppo_policy_fn.nunits = 128 91 | 92 | # Parameters for train: 93 | # ============================================================================== 94 | train.algorithm = @ConstrainedResidualPPO 95 | train.eval = True 96 | train.eval_period = 1000000 97 | train.maxseconds = None 98 | train.maxt = 100000000 99 | train.save_period = 1000000 100 | train.seed = 0 101 | 102 | # Parameters for VecObsNormWrapper: 103 | # ============================================================================== 104 | VecObsNormWrapper.eps = 0.01 105 | VecObsNormWrapper.log = True 106 | VecObsNormWrapper.log_prob = 0.01 107 | VecObsNormWrapper.mean = @bc_mean() 108 | VecObsNormWrapper.std = @bc_std() 109 | VecObsNormWrapper.steps = 10000 110 | -------------------------------------------------------------------------------- /imitation_learning/trainer.py: -------------------------------------------------------------------------------- 1 | """Defines trainer and network for MNIST.""" 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import gin 6 | import time 7 | from dl import logger, Checkpointer, nest 8 | from dl.util import StatefulSampler 9 | from dl.modules import DiagGaussian 10 | from torch.utils.data import DataLoader 11 | import os 12 | 13 | 14 | @gin.configurable 15 | class BCNet(nn.Module): 16 | """Imitation network.""" 17 | 18 | def __init__(self, ob_shape, action_shape, nunits): 19 | """Init.""" 20 | super().__init__() 21 | self.nunits = nunits 22 | self.fc1 = nn.Linear(ob_shape, self.nunits) 23 | self.fc2 = nn.Linear(self.nunits, self.nunits) 24 | self.dist = DiagGaussian(self.nunits, action_shape) 25 | self.ob_shape = ob_shape 26 | 27 | def forward(self, ob): 28 | """Forward.""" 29 | if ob.shape[-1] > self.ob_shape: 30 | # HACK. While iterating on the project, I changed the 31 | # observation space of the environments. To use human 32 | # data collected before that, I manually crop out 33 | # newer observation data. 
34 | ob = ob[:, :self.ob_shape] 35 | net = F.relu(self.fc1(ob)) 36 | net = F.relu(self.fc2(net)) 37 | return self.dist(net) 38 | 39 | 40 | @gin.configurable(blacklist=['datafile']) 41 | class DemonstrationData(object): 42 | def __init__(self, datafile, mean=None, std=None): 43 | data = torch.load(datafile) 44 | self.obs = data['obs'] 45 | self.actions = data['actions'] 46 | self.n = self.obs.shape[0] 47 | if mean is None or std is None: 48 | self.mean = self.obs.mean(dim=0) 49 | self.std = self.obs.std(dim=0) 50 | else: 51 | self.mean = torch.from_numpy(mean) 52 | self.std = torch.from_numpy(std) 53 | self.obs = (self.obs - self.mean) / (self.std + 1e-5) 54 | 55 | def __len__(self): 56 | return self.n 57 | 58 | def __getitem__(self, idx): 59 | return self.obs[idx], self.actions[idx] 60 | 61 | 62 | @gin.configurable(blacklist=['logdir']) 63 | class BCTrainer(object): 64 | """Behavioral cloning.""" 65 | 66 | def __init__(self, logdir, model, opt, datafile, batch_size, num_workers, 67 | gpu=True): 68 | self.logdir = logdir 69 | self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts')) 70 | self.data = DemonstrationData(datafile) 71 | self.sampler = StatefulSampler(self.data, shuffle=True) 72 | self.dtrain = DataLoader(self.data, sampler=self.sampler, 73 | batch_size=batch_size, 74 | num_workers=num_workers) 75 | self._diter = None 76 | self.t = 0 77 | self.epochs = 0 78 | self.batch_size = batch_size 79 | 80 | self.device = torch.device('cuda:0' if gpu and torch.cuda.is_available() 81 | else 'cpu') 82 | self.model = model 83 | self.model.to(self.device) 84 | self.opt = opt(self.model.parameters()) 85 | 86 | def step(self): 87 | # Get batch. 88 | if self._diter is None: 89 | self._diter = self.dtrain.__iter__() 90 | try: 91 | batch = self._diter.__next__() 92 | except StopIteration: 93 | self.epochs += 1 94 | self._diter = None 95 | return self.epochs 96 | batch = nest.map_structure(lambda x: x.to(self.device), batch) 97 | 98 | # compute loss 99 | ob, ac = batch 100 | self.model.train() 101 | loss = -self.model(ob).log_prob(ac).mean() 102 | 103 | logger.add_scalar('train/loss', loss.detach().cpu().numpy(), 104 | self.t, time.time()) 105 | 106 | # update model 107 | self.opt.zero_grad() 108 | loss.backward() 109 | self.opt.step() 110 | 111 | # increment step 112 | self.t += min(len(self.data) - (self.t % len(self.data)), 113 | self.batch_size) 114 | return self.epochs 115 | 116 | def evaluate(self): 117 | """Evaluate model.""" 118 | self.model.eval() 119 | 120 | nll = 0. 
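        # Total negative log-likelihood of the demonstration actions under the
        # current policy, accumulated over the full training set; the
        # per-sample average is logged below as well.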
121 | with torch.no_grad(): 122 | for batch in self.dtrain: 123 | ob, ac = nest.map_structure(lambda x: x.to(self.device), batch) 124 | nll += -self.model(ob).log_prob(ac).sum() 125 | avg_nll = nll / len(self.data) 126 | 127 | logger.add_scalar('train/NLL', nll, self.epochs, time.time()) 128 | logger.add_scalar('train/AVG_NLL', avg_nll, self.epochs, 129 | time.time()) 130 | 131 | def save(self): 132 | state_dict = {} 133 | state_dict['model'] = self.model.state_dict() 134 | state_dict['opt'] = self.opt.state_dict() 135 | state_dict['sampler'] = self.sampler.state_dict(self._diter) 136 | state_dict['t'] = self.t 137 | state_dict['epochs'] = self.epochs 138 | self.ckptr.save(state_dict, self.t) 139 | 140 | def load(self, t=None): 141 | state_dict = self.ckptr.load() 142 | if state_dict is None: 143 | self.t = 0 144 | self.epochs = 0 145 | return self.epochs 146 | self.model.load_state_dict(state_dict['model']) 147 | self.opt.load_state_dict(state_dict['opt']) 148 | self.sampler.load_state_dict(state_dict['sampler']) 149 | self.t = state_dict['t'] 150 | self.epochs = state_dict['epochs'] 151 | if self._diter is not None: 152 | self._diter.__del__() 153 | self._diter = None 154 | 155 | def close(self): 156 | """Close data iterator.""" 157 | if self._diter is not None: 158 | self._diter.__del__() 159 | self._diter = None 160 | -------------------------------------------------------------------------------- /residual_wrapper.py: -------------------------------------------------------------------------------- 1 | """Residual Policy environment wrapper.""" 2 | from gym.spaces import Tuple, Box 3 | from baselines.common.vec_env import VecEnvWrapper 4 | import numpy as np 5 | 6 | 7 | class ResidualWrapper(VecEnvWrapper): 8 | """Wrapper for residual policy learning. 9 | 10 | https://arxiv.org/abs/1812.06298 11 | 12 | Requires a callable which returns an action. The chosen action is added to 13 | the observation. 
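
    At each step the wrapper queries act_fn for a base action a, adds the
    learned residual r, and applies clip(a + r, low, high) to the wrapped
    environment. The observation passed to the learner is the tuple
    (ob, 2 * (a - low) / (high - low) - 1), i.e. the base action rescaled to
    [-1, 1].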
14 | """ 15 | 16 | def __init__(self, venv, act_fn): 17 | """Init.""" 18 | super().__init__(venv) 19 | self.act_fn = act_fn 20 | self._ob = None 21 | if not isinstance(self.action_space, Box): 22 | raise ValueError("ResidualWrapper can only be used with continuous " 23 | "action spaces.") 24 | self.observation_space = Tuple([self.observation_space, 25 | self.action_space]) 26 | bound = self.action_space.high - self.action_space.low 27 | self.action_space = Box(-bound, bound) 28 | self._action = None 29 | 30 | def reset(self): 31 | """Reset.""" 32 | ob = self.venv.reset() 33 | self._action = np.asarray(self.act_fn(ob)) 34 | if hasattr(self.act_fn, 'reset'): 35 | # reset joystick to zero action 36 | self.act_fn.reset() 37 | return (ob, self._norm_action(self._action)) 38 | 39 | def step(self, action): 40 | """Step.""" 41 | action = self._add_actions(np.asarray(action), self._action) 42 | ob, rs, dones, infos = self.venv.step(action) 43 | for i, info in enumerate(infos): 44 | info['action'] = action[i] 45 | info['assistant_action'] = np.asarray(action)[i] 46 | info['player_action'] = self._action[i] 47 | self._action = self.act_fn(ob) 48 | return (ob, self._norm_action(self._action)), rs, dones, infos 49 | 50 | def _clip_action(self, ac): 51 | return np.maximum(np.minimum(ac, self.venv.action_space.high), 52 | self.venv.action_space.low) 53 | 54 | def _add_actions(self, ac1, ac2): 55 | return self._clip_action(ac1 + ac2) 56 | 57 | def _norm_action(self, ac): 58 | high = self.venv.action_space.high 59 | low = self.venv.action_space.low 60 | return 2 * (ac - low) / (high - low) - 1.0 61 | 62 | def step_wait(self): 63 | """Step wait.""" 64 | return self.venv.step_wait() 65 | 66 | 67 | if __name__ == '__main__': 68 | import unittest 69 | import gym 70 | from dl.rl import ensure_vec_env, VecFrameStack 71 | 72 | class ZeroActor(object): 73 | """Output zeros.""" 74 | 75 | def __init__(self, action_space): 76 | """Init.""" 77 | self.action_space = action_space 78 | 79 | def __call__(self, ob): 80 | """Act.""" 81 | batch_size = ob.shape[0] 82 | return np.zeros_like([self.action_space.sample() 83 | for _ in range(batch_size)]) 84 | 85 | class RandomActor(object): 86 | """Output random actions.""" 87 | 88 | def __init__(self, action_space): 89 | """Init.""" 90 | self.action_space = action_space 91 | 92 | def __call__(self, ob): 93 | """Act.""" 94 | batch_size = ob.shape[0] 95 | return np.asarray([self.action_space.sample() 96 | for _ in range(batch_size)]) 97 | 98 | class Test(unittest.TestCase): 99 | """Tests.""" 100 | 101 | def test_zero_actor(self): 102 | """Test.""" 103 | env = gym.make("LunarLanderContinuous-v2") 104 | env = ensure_vec_env(env) 105 | actor = ZeroActor(env.action_space) 106 | env = ResidualWrapper(env, actor) 107 | ob, ac = env.reset() 108 | assert np.allclose(ac, 0) 109 | assert ac.shape == (1, *env.action_space.shape) 110 | assert ob.shape == (1, *env.observation_space.spaces[0].shape) 111 | assert ac.shape == (1, *env.observation_space.spaces[1].shape) 112 | assert isinstance(env.observation_space, Tuple) 113 | 114 | residual_ac = [env.action_space.sample()] 115 | (ob, ac), _, _, infos = env.step(residual_ac) 116 | rac = np.minimum(np.maximum(residual_ac[0], -1), 1) 117 | assert np.allclose(infos[0]['action'], rac) 118 | assert np.allclose(ac, 0) 119 | assert ac.shape == (1, *env.action_space.shape) 120 | assert ob.shape == (1, *env.observation_space.spaces[0].shape) 121 | assert ac.shape == (1, *env.observation_space.spaces[1].shape) 122 | 123 | def test_random_actor(self): 124 
| """Test.""" 125 | env = gym.make("LunarLanderContinuous-v2") 126 | env = ensure_vec_env(env) 127 | actor = RandomActor(env.action_space) 128 | env = ResidualWrapper(env, actor) 129 | ob, ac = env.reset() 130 | assert ac.shape == (1, *env.action_space.shape) 131 | assert ob.shape == (1, *env.observation_space.spaces[0].shape) 132 | assert ac.shape == (1, *env.observation_space.spaces[1].shape) 133 | assert isinstance(env.observation_space, Tuple) 134 | 135 | for _ in range(10): 136 | residual_ac = [env.action_space.sample()] 137 | (ob, ac_next), _, _, infos = env.step(residual_ac) 138 | rac = np.minimum(np.maximum(residual_ac[0] + ac[0], -1), 1) 139 | assert np.allclose(infos[0]['action'], rac) 140 | ac = ac_next 141 | 142 | def test_vec_env_wrapper(self): 143 | """Test.""" 144 | env = gym.make("LunarLanderContinuous-v2") 145 | env = ensure_vec_env(env) 146 | actor = RandomActor(env.action_space) 147 | env = ResidualWrapper(env, actor) 148 | env = VecFrameStack(env, 4) 149 | ob, ac = env.reset() 150 | assert ac.shape == (1, 4*env.action_space.shape[0]) 151 | assert ob.shape == (1, *env.observation_space.spaces[0].shape) 152 | assert ac.shape == (1, *env.observation_space.spaces[1].shape) 153 | assert isinstance(env.observation_space, Tuple) 154 | 155 | for _ in range(10): 156 | residual_ac = [env.action_space.sample()] 157 | (ob, ac_next), _, _, infos = env.step(residual_ac) 158 | rac = np.minimum(np.maximum(residual_ac[0] + ac[0][-2:], -1), 1) 159 | assert np.allclose(infos[0]['action'], rac) 160 | ac = ac_next 161 | 162 | unittest.main() 163 | -------------------------------------------------------------------------------- /base_actors.py: -------------------------------------------------------------------------------- 1 | """Base actors on which residuals are learned.""" 2 | import numpy as np 3 | import torch 4 | from residual_shared_autonomy.imitation_learning import BCNet 5 | from residual_shared_autonomy.lunar_lander import lunar_lander_policy_fn 6 | from residual_shared_autonomy.drone_sim import drone_ppo_policy_fn 7 | from dl import Checkpointer 8 | import gin 9 | import os 10 | 11 | 12 | @gin.configurable 13 | class ZeroActor(object): 14 | """Output random actions.""" 15 | 16 | def __init__(self, env): 17 | """Init.""" 18 | self.action_space = env.action_space 19 | self.batch_size = env.num_envs 20 | 21 | def __call__(self, ob): 22 | """Act.""" 23 | return np.zeros([self.batch_size] + list(self.action_space.shape), 24 | dtype=np.float32) 25 | 26 | 27 | @gin.configurable 28 | class RandomActor(object): 29 | """Output random actions.""" 30 | 31 | def __init__(self, env): 32 | """Init.""" 33 | self.action_space = env.action_space 34 | self.batch_size = env.num_envs 35 | 36 | def __call__(self, ob): 37 | """Act.""" 38 | return np.asarray([self.action_space.sample() 39 | for _ in range(self.batch_size)]) 40 | 41 | 42 | @gin.configurable 43 | class PolicyActor(object): 44 | """policy actor""" 45 | 46 | def __init__(self, pi, device): 47 | self.pi = pi 48 | self.device = device 49 | 50 | def __call__(self, ob): 51 | """Act.""" 52 | if isinstance(ob, np.ndarray): 53 | ob = torch.from_numpy(ob).to(self.device) 54 | return self.pi(ob).action.cpu().numpy() 55 | 56 | 57 | @gin.configurable 58 | class LunarLanderActor(object): 59 | """Lunar Lander actor.""" 60 | 61 | def __init__(self, env, logdir, device): 62 | self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts')) 63 | if not torch.cuda.is_available(): 64 | device = 'cpu' 65 | self.pi = lunar_lander_policy_fn(env) 66 | self.pi.to(device) 
67 | self.pi.load_state_dict(self.ckptr.load()['pi']) 68 | self.pi.eval() 69 | self.device = device 70 | 71 | def __call__(self, ob): 72 | """Act.""" 73 | with torch.no_grad(): 74 | if isinstance(ob, np.ndarray): 75 | ob = torch.from_numpy(ob).to(self.device) 76 | return self.pi(ob).action.cpu().numpy() 77 | 78 | 79 | @gin.configurable 80 | class DroneReacherActor(object): 81 | """DroneReacher actor.""" 82 | 83 | def __init__(self, env, logdir, device): 84 | self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts')) 85 | if not torch.cuda.is_available(): 86 | device = 'cpu' 87 | self.pi = drone_ppo_policy_fn(env) 88 | self.pi.to(device) 89 | self.pi.load_state_dict(self.ckptr.load()['pi']) 90 | self.pi.eval() 91 | self.device = device 92 | 93 | def __call__(self, ob): 94 | """Act.""" 95 | with torch.no_grad(): 96 | if isinstance(ob, np.ndarray): 97 | ob = torch.from_numpy(ob).to(self.device) 98 | return self.pi(ob).action.cpu().numpy() 99 | 100 | 101 | @gin.configurable 102 | class LaggyActor(object): 103 | """Laggy actor""" 104 | 105 | def __init__(self, env, actor_cls, repeat_prob): 106 | self.actor = actor_cls(env) 107 | self.repeat_prob = repeat_prob 108 | self.action = None 109 | 110 | def __call__(self, ob): 111 | """Act.""" 112 | if self.action is None or np.random.rand() > self.repeat_prob: 113 | self.action = self.actor(ob) 114 | return self.action 115 | 116 | 117 | @gin.configurable 118 | class NoisyActor(object): 119 | """Noisy actor""" 120 | 121 | def __init__(self, env, actor_cls, eps): 122 | self.actor = actor_cls(env) 123 | self.eps = eps 124 | 125 | def __call__(self, ob): 126 | """Act.""" 127 | action = self.actor(ob) 128 | if np.random.rand() < self.eps: 129 | action = np.random.uniform(-1, 1, action.shape).astype(action.dtype) 130 | return action 131 | 132 | 133 | @gin.configurable 134 | class BCActor(object): 135 | """Actor trained with Behavioral cloning""" 136 | 137 | def __init__(self, env, logdir, device): 138 | self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts')) 139 | if not torch.cuda.is_available(): 140 | device = 'cpu' 141 | self.device = device 142 | self.net = BCNet() 143 | self.net.to(device) 144 | self.net.load_state_dict(self.ckptr.load()['model']) 145 | 146 | def __call__(self, ob): 147 | """Act.""" 148 | with torch.no_grad(): 149 | dist = self.net(torch.from_numpy(ob).to(self.device)) 150 | ac = dist.sample().cpu().numpy() 151 | return np.clip(ac, -1., 1.) 
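

# How these pilot models are composed in the training configs (an equivalent,
# non-gin sketch; `env` is a vectorized environment and the eps value is only
# illustrative -- see ppo/configs for the values actually used):
#
#     expert_fn = lambda e: LunarLanderActor(
#         e, logdir="./models/lunar_lander_agent", device="cuda:0")
#     laggy_pilot = LaggyActor(env, actor_cls=expert_fn, repeat_prob=0.8)
#     noisy_pilot = NoisyActor(env, actor_cls=expert_fn, eps=0.1)
#     bc_pilots = BCMultiActor(env, logdir="./models/behavioral_cloning_agents_lunar",
#                              device="cuda:0", switch_prob=0.001)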
152 | 153 | 154 | @gin.configurable 155 | class BCMultiActor(object): 156 | """Use multiple actors trained with Behavioral cloning""" 157 | 158 | def __init__(self, env, logdir, device, switch_prob=0.001): 159 | dirs = [x for x in os.listdir(logdir) if os.path.isdir( 160 | os.path.join(logdir, x, 'ckpts'))] 161 | 162 | self.ckptrs = [Checkpointer(os.path.join(logdir, x, 'ckpts')) 163 | for x in dirs] 164 | if not torch.cuda.is_available(): 165 | device = 'cpu' 166 | self.device = device 167 | self.nets = [BCNet() for _ in dirs] 168 | for net, ckptr in zip(self.nets, self.ckptrs): 169 | net.to(device) 170 | net.load_state_dict(ckptr.load()['model']) 171 | self.current_actor = np.random.choice(self.nets) 172 | self.switch_prob = switch_prob 173 | 174 | def __call__(self, ob): 175 | """Act.""" 176 | if np.random.rand() < self.switch_prob: 177 | self.current_actor = np.random.choice(self.nets) 178 | with torch.no_grad(): 179 | if isinstance(ob, np.ndarray): 180 | ob = torch.from_numpy(ob) 181 | dist = self.current_actor(ob.to(self.device)) 182 | ac = dist.sample().cpu().numpy() 183 | return np.clip(ac, -1., 1.) 184 | 185 | 186 | if __name__ == '__main__': 187 | import gym 188 | import residual_shared_autonomy.envs 189 | from dl.rl import ensure_vec_env 190 | import time 191 | 192 | env = gym.make("LunarLanderRandomContinuous-v2") 193 | env = ensure_vec_env(env) 194 | 195 | # actor = OrnsteinUhlenbeckActor(env, 0.5) 196 | actor = RandomActor(env) 197 | 198 | for _ in range(10): 199 | ob = env.reset() 200 | env.render() 201 | done = False 202 | reward = 0.0 203 | time.sleep(1.) 204 | 205 | while not done: 206 | ob, r, done, _ = env.step(actor(ob)) 207 | env.render() 208 | reward += r 209 | print(reward) 210 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Residual Policy Learning for Shared Autonomy 2 | 3 | This repository accompanies the paper [Residual Policy Learning for Shared Autonomy](https://arxiv.org/abs/2004.05097). 4 | 5 | This repository contains code to train our shared autonomy agent with several human models, including agents trained to 6 | imitate humans and corrupted versions of optimal policies. We have provided all of the human models used in our paper 7 | as well as code for you to make your own human models. Trained shared autonomy agents are also provided. 8 | 9 | You can interact with the models through a joystick and the pygame interface. 10 | 11 | The code is based on my [deep reinforcement learning library](https://github.com/cbschaff/dl) which builds on PyTorch, 12 | OpenAI Gym, gin-config, and Tensorboard. 13 | 14 | ## Installation 15 | 16 | 1. Install [docker](https://docs.docker.com/get-docker/). 17 | 2. Install [x-docker](https://github.com/afdaniele/x-docker), a wrapper around docker for running GUI applications inside a container. 18 | 3. In the top level directory, build the docker image by running: 19 | ```./build_docker.sh``` 20 | 4. Launch the docker container by running: 21 | ```./launch_docker.sh``` 22 | This will start a container and mount the code at ```/root/pkgs/residual_shared_autonomy```. 23 | 24 | All commands, unless otherwise stated, should be run from inside a docker container. 25 | 26 | ## Repository structure 27 | The code is organized into the following directories: 28 | 29 | * ```ppo/```: The code for training and interacting with shared autonomy agents. 
30 | * ```lunar_lander/```: The code for LunarLander, LunarReacher, and training RL agents for these environments. 31 | * ```drone_sim/```: The code for DroneReacher and training RL agents for that environment. 32 | * ```imitation_learning/```: The code for collecting human data and training behavioral cloning agents. 33 | 34 | ## Playing the games with and without our assistants 35 | 36 | You can interact with the environments using a joystick controller (Xbox, etc.). 37 | 38 | NOTE: The controls are hard-coded for our particular joystick (Xbox One controller). 39 | You may need to edit ```lunar_lander/joystick_agent.py``` and ```drone_sim/joystick_agent.py``` 40 | to match your joystick. 41 | 42 | NOTE: Docker seems to only recognize a joystick if it is connected to your computer before you start the container. 43 | 44 | ### Lunar Lander and Lunar Reacher 45 | 46 | In these environments, you control the thrusters of a spaceship. The controls are as follows: 47 | * Tilting the RIGHT joystick VERTICALLY will control the main thruster. 48 | * Tilting the RIGHT joystick HORIZONTALLY will control the side thrusters. 49 | 50 | To play the game, run: 51 | 52 | ``` 53 | cd /root/pkgs/residual_shared_autonomy/lunar_lander 54 | python joystick_agent.py 55 | ``` 56 | 57 | To play the game with our pretrained assistant, run: 58 | 59 | ``` 60 | cd /root/pkgs/residual_shared_autonomy/ppo 61 | python joystick_control.py models/pretrained_lunar 62 | ``` 63 | To play LunarReacher, add a ```--reacher``` flag to the python commands above. 64 | 65 | ### Drone Reacher 66 | 67 | In this environment, you will control a simulated drone to reach a red floating target. You will control thrust and 68 | angular acceleration. The controls are as follows: 69 | * Tilt the LEFT joystick VERTICALLY to control thrust (the middle of the range outputs 1 g of thrust) 70 | * Tilt the LEFT joystick HORIZONTALLY to control yaw 71 | * Tilt the RIGHT joystick VERTICALLY to control pitch 72 | * Tilt the RIGHT joystick HORIZONTALLY to control roll 73 | 74 | To play the game, run: 75 | 76 | ``` 77 | cd /root/pkgs/residual_shared_autonomy/drone_sim 78 | python joystick_agent.py 79 | ``` 80 | 81 | To play the game with our pretrained assistant, run: 82 | 83 | ``` 84 | cd /root/pkgs/residual_shared_autonomy/ppo 85 | python joystick_control.py models/pretrained_drone --drone 86 | ``` 87 | 88 | ## Training shared autonomy agents 89 | 90 | Shared autonomy agents can be trained using the following commands: 91 | 92 | ``` 93 | cd /root/pkgs/residual_shared_autonomy/ppo 94 | ./train.sh logs configs/ppo_lunar.gin 95 | ``` 96 | 97 | Replace ```configs/ppo_lunar.gin``` with ```configs/ppo_drone.gin``` to train the agent for the DroneReacher environment. 98 | 99 | Experiments can be visualized by running: 100 | 101 | ```tensorboard --logdir /path/to/log/directory``` 102 | 103 | To change hyperparameters, edit the ".gin" files. 104 | 105 | To play with your assistants, follow the instructions above. 106 | 107 | To train assistants with laggy or noisy models, call ```train.sh``` with the 'laggy' and 'noisy' gin files. 108 | 109 | ## Creating human models 110 | 111 | We provide the human models used in our paper, but you can create your own as well. 112 | 113 | ### Imitating human behavior 114 | 115 | To create your own models, you will need to collect data and then train an agent to imitate that data. 
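For reference, the trainer loads the demonstration file with ```torch.load``` and expects a dictionary with ```obs``` and ```actions``` tensors of equal length (see ```DemonstrationData``` in ```imitation_learning/trainer.py```). As a rough sketch, a compatible file for Lunar Lander (9-dimensional observations, 2-dimensional actions; the sizes and filename below are only illustrative) would look like:

```
import torch

demos = {
    "obs": torch.zeros(1000, 9),      # recorded observations
    "actions": torch.zeros(1000, 2),  # recorded joystick actions
}
torch.save(demos, "lunar_demos.pt")
```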
116 | 117 | To collect data, run: 118 | 119 | ``` 120 | cd /root/pkgs/residual_shared_autonomy/imitation_learning 121 | python data_collection_lunar.py path/to/save/data num_episodes 122 | ``` 123 | 124 | Replace 'lunar' with 'drone' to collect data for the drone simulator. 125 | 126 | To train a model with the collected data, first edit ```bc_lunar.gin``` by changing 127 | BCTrainer.datafile to your data. Then run: 128 | 129 | ``` 130 | cd /root/pkgs/residual_shared_autonomy/imitation_learning 131 | ./train.sh /path/to/log/directory bc_lunar.gin 132 | ``` 133 | 134 | To view logs, run: ```tensorboard --logdir /path/to/logdir``` 135 | 136 | To observe your trained agent, run: ```python watch.py /path/to/logdir```. Add a ```--drone``` flag for the drone simulator. 137 | 138 | To use your models in training a shared autonomy agent, place all of their log directories in a common directory 139 | (or add them to the correct "behavioral_cloning_agents" directory in ```ppo/models```). Then change ```ppo/configs/ppo_lunar.gin``` or 140 | ```ppo/configs/ppo_drone.gin``` so that ```BCMultiActor.logdir = "path/to/your/directory"```. 141 | 142 | ### Modelling humans as corrupted experts 143 | 144 | We can also model humans as corrupted experts. In our paper, we explore laggy and noisy experts, where the expert is 145 | a policy trained using RL. We have included pretrained policies, but you can train your own using the following instructions. 146 | 147 | For Lunar Lander, run: 148 | 149 | ``` 150 | cd /root/pkgs/residual_shared_autonomy/lunar_lander 151 | ./train.sh 152 | ``` 153 | 154 | For Drone Reacher, run: 155 | 156 | ``` 157 | cd /root/pkgs/residual_shared_autonomy/drone_sim 158 | ./train.sh 159 | ``` 160 | 161 | To train shared autonomy agents with these models, edit the '.gin' files in ```ppo/configs``` to point to your model 162 | instead of the pretrained one by setting ```LunarLanderActor.logdir = "/path/to/your/log/directory"``` or 163 | ```DroneReacherActor.logdir = "/path/to/your/log/directory"```. 164 | -------------------------------------------------------------------------------- /drone_sim/sim.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Drone(object): 5 | """Drone state.""" 6 | 7 | def __init__(self, p_n, p_e, h, u, v, w, phi, theta, psi, p, q, r, 8 | m_center, r_center, m_rotor, l): 9 | """State and dynamics of the drone. 10 | 11 | Notation and equations from "Quadrotor Dynamics and Control Rev 0.1" 12 | by Randal W. Beard. 13 | https://scholarsarchive.byu.edu/cgi/viewcontent.cgi?article=2324&context=facpub 14 | 15 | The drone is assumed to consist of a spherical dense center of mass 16 | m_center and radius r_center and 4 point mass rotors with mass m_rotor 17 | located a distance of l from the center. 
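
        With this geometry, the moments of inertia computed below are those of
        a solid sphere plus point-mass rotors:
            jx = jy = (2/5) * m_center * r_center**2 + 2 * l**2 * m_rotor
            jz = (2/5) * m_center * r_center**2 + 4 * l**2 * m_rotor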
18 | 19 | position in inertial frame 20 | -------------------------- 21 | p_n: position north measured along i_i 22 | p_e: position east measured along j_i 23 | h: height measured along k_i 24 | 25 | velocity in body frame 26 | ---------------------- 27 | u: velocity measured along i_b 28 | v: velocity measured along j_b 29 | w: velocity measured along k_b 30 | 31 | orientation in vehicle, vehicle_1, and vehicle_2 frames 32 | ------------------------------------------------------- 33 | phi: roll angle in vehicle_2 frame 34 | theta: pitch angle in vehicle_1 frame 35 | psi: yaw angle in vehicle frame 36 | 37 | angular rate in body frame 38 | -------------------------- 39 | p: roll rate measured along i_b 40 | q: pitch rate measured along j_b 41 | r: yaw rate measured along k_b 42 | 43 | drone parameters 44 | ---------------- 45 | m_center: mass of the center of the drone 46 | r_center: radius of the center of the drone 47 | m_rotor: mass of each rotor 48 | l: distance of rotors from center 49 | """ 50 | self.pos_i = np.array([p_n, p_e, h]) 51 | self.vel_b = np.array([u, v, w]) 52 | self.ori_v = np.array([phi, theta, psi]) 53 | self.angvel_b = np.array([p, q, r]) 54 | 55 | self.m = m_center + 4 * m_rotor 56 | self.m_center = m_center 57 | self.m_rotor = m_rotor 58 | self.r_center = r_center 59 | self.lengh = l 60 | 61 | # compute moment of inertia 62 | self.jx = 2 * m_center * (r_center ** 2) / 5 + 2 * (l ** 2) * m_rotor 63 | self.jy = 2 * m_center * (r_center ** 2) / 5 + 2 * (l ** 2) * m_rotor 64 | self.jz = 2 * m_center * (r_center ** 2) / 5 + 4 * (l ** 2) * m_rotor 65 | 66 | # compute useful constants 67 | self._jc = np.array([(self.jy - self.jz) / self.jx, 68 | (self.jz - self.jx) / self.jy, 69 | (self.jx - self.jy) / self.jz]) 70 | self._jinv = np.array([1/self.jx, 1/self.jy, 1/self.jz]) 71 | self._g = 9.80665 72 | 73 | def state(self): 74 | """Get the state of the drone. 75 | 76 | State consists of position, velocity, orientation, and angular velocity. 77 | """ 78 | return np.concatenate([self.pos_i, self.vel_b, self.ori_v, 79 | self.angvel_b]) 80 | 81 | def step(self, torques, f, time_step): 82 | """Apply forces and torques for time_step amount of time. 83 | 84 | torques: a numpy array of roll, pitch, yaw torques in the body frame. 85 | f: a numpy array of the x, y, z forces in the body frame, 86 | excluding gravity. 87 | time_step: the step size for approximating new state. 
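
        The state is advanced with a single explicit Euler step: body-frame
        acceleration (Coriolis terms, gravity, and f / m), Euler-angle rates,
        and body-frame angular acceleration (gyroscopic terms plus torque
        divided by inertia) are each evaluated once and integrated as
        x <- x + time_step * dx/dt.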
88 | """ 89 | c = np.cos(self.ori_v) 90 | s = np.sin(self.ori_v) 91 | t = np.tan(self.ori_v) 92 | p, q, r = self.angvel_b 93 | u, v, w = self.ori_v 94 | g = self._g 95 | 96 | # compute velocity in inertial frame 97 | rot_b_i = np.array([ 98 | [c[1]*c[2], s[0]*s[1]*c[2] - c[0]*s[2], c[0]*s[1]*c[2] + s[0]*s[2]], 99 | [c[1]*s[2], s[0]*s[1]*s[2] + c[0]*c[2], c[0]*s[1]*s[2] - s[0]*c[2]], 100 | [-s[1], s[0]*c[1], c[0]*c[1]] 101 | ]) 102 | vel_i = rot_b_i.dot(self.vel_b) 103 | 104 | # compute acceleration in body frame 105 | acc_b = (np.array([r*v - q*w, p*w - r*u, q*u - p*v]) 106 | + np.array([-g*s[1], g*c[1]*s[0], g*c[1]*c[0]]) 107 | + f / self.m) 108 | 109 | # compute angular velocity in vehicle frames 110 | rot_b_v = np.array([ 111 | [1., s[0]*t[1], c[0]*t[1]], 112 | [0., c[0], -s[0]], 113 | [0., s[0]/c[1], c[0]/c[1]] 114 | ]) 115 | angvel_v = rot_b_v.dot(self.angvel_b) 116 | 117 | # compute angular acceleration in body frame 118 | angacc_b = self._jc * np.array([q*r, p*r, p*q]) + self._jinv * torques 119 | 120 | # update state 121 | self.pos_i += time_step * vel_i 122 | self.vel_b += time_step * acc_b 123 | self.ori_v += time_step * angvel_v 124 | self.angvel_b += time_step * angacc_b 125 | self.ori_v = ((self.ori_v + np.pi) % (2 * np.pi)) - np.pi 126 | return self.state() 127 | 128 | def rotate_i_to_b(self, v): 129 | c = np.cos(self.ori_v) 130 | s = np.sin(self.ori_v) 131 | rot_b_i = np.array([ 132 | [c[1]*c[2], s[0]*s[1]*c[2] - c[0]*s[2], c[0]*s[1]*c[2] + s[0]*s[2]], 133 | [c[1]*s[2], s[0]*s[1]*s[2] + c[0]*c[2], c[0]*s[1]*s[2] - s[0]*c[2]], 134 | [-s[1], s[0]*c[1], c[0]*c[1]] 135 | ]) 136 | return rot_b_i.T.dot(v) 137 | 138 | 139 | 140 | if __name__ == '__main__': 141 | # Test the simulator by applying random torques for roll and pitch 142 | # with a constant upward force, then turning off all motors for a bit, 143 | # then continuing. 144 | d = Drone(0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 10., 1., 1., 2.) 145 | print(d.state()) 146 | torques = np.zeros(3) 147 | forces = np.array([0., 0., -200.]) 148 | states = [] 149 | for _ in range(100): 150 | torques = np.random.uniform(-1., 1., (3,)) 151 | s = d.step(torques, forces, 1./50.) 152 | print(s) 153 | states.append(s) 154 | h_off = states[-1][2] 155 | while states[-1][2] - 1e-5 < h_off: 156 | s = d.step(0*torques, 0*forces, 1./50.) 157 | print(s) 158 | states.append(s) 159 | for _ in range(100): 160 | torques = np.random.uniform(-1., 1., (3,)) 161 | s = d.step(torques, forces, 1./50.) 162 | print(s) 163 | states.append(s) 164 | 165 | from matplotlib import pyplot as plt 166 | states = np.array(states) 167 | x = np.arange(states.shape[0]) / 50. 
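    # Time axis in seconds: the simulation above was stepped at 50 Hz.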
168 | names = ['x', 'y', 'h', 'u', 'v', 'w'] 169 | names2 = ['phi', 'theta', 'psi', 'p', 'q', 'r'] 170 | f, (ax1, ax2) = plt.subplots(1, 2) 171 | for i, n in enumerate(names): 172 | ax1.plot(x, states[:, i], label=n) 173 | ax1.legend() 174 | for i, n in enumerate(names2): 175 | ax2.plot(x, states[:, 6 + i], label=n) 176 | ax2.legend() 177 | plt.show() 178 | -------------------------------------------------------------------------------- /drone_sim/env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | from gym.utils import seeding, EzPickle 4 | from residual_shared_autonomy.drone_sim import Drone 5 | from residual_shared_autonomy.drone_sim import DroneRenderer, DroneFollower 6 | 7 | import numpy as np 8 | 9 | 10 | FPS = 50 11 | N_OBS_DIM = 12 12 | MAX_NUM_STEPS = 1500 13 | SIMS_PER_STEP = 1 14 | DRONE_RADIUS = 0.5 15 | DRONE_LENGTH = 2.0 16 | DRONE_MASS_CENTER = 6.0 17 | DRONE_MASS_ROTOR = 1.0 18 | CAMERA_OFFSET = (-20, 0, -10) 19 | CAMERA_ALPHA_POS = 0.5 20 | CAMERA_ALPHA_ROT = 0.9 21 | MAX_ACC = (20, 1.0) # upward acceleration, angular acceleration 22 | BOUNDS = (40, 40, -100.) 23 | TARGET_RADIUS = 3. 24 | ACTION_NOISE = 0.05 25 | 26 | 27 | class DroneEnv(gym.Env, EzPickle): 28 | """Reacher task in simple drone simulator.""" 29 | 30 | goal_in_state = True 31 | goal_in_reward = True 32 | 33 | metadata = { 34 | 'render.modes': ['human', 'rgb_array'], 35 | } 36 | 37 | def __init__(self): 38 | """Init.""" 39 | EzPickle.__init__(self) 40 | self.seed() 41 | 42 | self.renderer = DroneRenderer() 43 | self.camera = DroneFollower(CAMERA_OFFSET, CAMERA_ALPHA_POS, 44 | CAMERA_ALPHA_ROT) 45 | self.observation_space = spaces.Box(-1., +1, 46 | (N_OBS_DIM + 3 47 | * self.goal_in_state,), 48 | dtype=np.float32) 49 | self.action_space = spaces.Box(-1, +1, (4,), dtype=np.float32) 50 | 51 | self.reset() 52 | 53 | def _out_of_bounds(self, s): 54 | bx, by, bz = BOUNDS 55 | if np.abs(s[0]) >= bx: 56 | return True 57 | if np.abs(s[1]) >= by: 58 | return True 59 | if s[2] < bz or s[2] >= 0.0: 60 | return True 61 | return False 62 | 63 | def _at_target(self, s): 64 | d = np.sqrt(np.sum((s[:3] - self.target) ** 2)) 65 | return d < TARGET_RADIUS 66 | 67 | def _ac_to_force_torque(self, ac): 68 | range = MAX_ACC[0] - self.drone._g 69 | # center upward acceleration at -self.drone.g 70 | force = np.array([ 71 | 0.0, 72 | 0.0, 73 | -self.drone.m * (range * -ac[0] + self.drone._g) 74 | ]) 75 | torque = np.array([ 76 | MAX_ACC[1] * ac[1] * self.drone.jx, 77 | MAX_ACC[1] * ac[2] * self.drone.jy, 78 | MAX_ACC[1] * ac[3] * self.drone.jz, 79 | ]) 80 | return force, torque 81 | 82 | def _norm_state(self, state): 83 | bx, by, bz = BOUNDS 84 | state[0] /= bx 85 | state[1] /= by 86 | state[2] /= bz 87 | if self.goal_in_state: 88 | state[-3] /= bx 89 | state[-2] /= by 90 | state[-1] /= bz 91 | return state.astype(np.float32) 92 | 93 | def seed(self, seed=None): 94 | """Set random seed.""" 95 | self.np_random, seed = seeding.np_random(seed) 96 | return [seed] 97 | 98 | def reset(self): 99 | """Reset environment.""" 100 | self.drone = Drone(0., 0., -1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 101 | DRONE_MASS_CENTER, DRONE_RADIUS, DRONE_MASS_ROTOR, 102 | DRONE_LENGTH) 103 | self.camera.reset(self.drone.pos_i, self.drone.ori_v) 104 | self.t = 0 105 | self.shaping = None 106 | state = self.drone.state() 107 | self.target = np.random.uniform(-40., 0., (3,)) 108 | self.target[:2] += 20. 109 | self.target[2] /= 4. 
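        # After the rescaling above and the shift below, the target's x and y
        # are uniform in [-20, 20] m and its altitude (negative z is up) is
        # uniform between TARGET_RADIUS and 10 + TARGET_RADIUS above the
        # ground.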
110 | self.target[2] -= TARGET_RADIUS 111 | if self.goal_in_state: 112 | target_v = self.target - state[:3] 113 | # target_v = self.drone.rotate_i_to_b(target_v) 114 | state = np.concatenate([state, target_v]) 115 | return self._norm_state(state) 116 | 117 | def step(self, action): 118 | """Step environment.""" 119 | action = action + np.random.normal(scale=ACTION_NOISE, 120 | size=action.shape) 121 | action = np.clip(action, -1.0, 1.0) 122 | force, torque = self._ac_to_force_torque(action) 123 | for _ in range(SIMS_PER_STEP): 124 | state = self.drone.step(torque, force, 1.0/FPS) 125 | self.camera.step(self.drone.pos_i, self.drone.ori_v) 126 | self.t += 1 127 | reward = 0.0 128 | if self.goal_in_state: 129 | target_v = self.target - state[:3] 130 | # target_v = self.drone.rotate_i_to_b(target_v) 131 | state = np.concatenate([state, target_v]) 132 | dist_to_targ = np.sqrt(np.sum(state[-3:] ** 2)) 133 | 134 | # shape reward if goal is to reach target, penalize if goal 135 | # is to stabilize 136 | # shaping for low velocity and staying upright and distance to target. 137 | shaping = (-np.sqrt((state ** 2)[3:6].sum()) # linear velocities 138 | - np.sqrt((state ** 2)[9:12].sum()) # angular rates 139 | - np.sqrt((state ** 2)[6:8].sum())) # roll and pitch 140 | if self.goal_in_reward: 141 | reward -= 0.01 * dist_to_targ 142 | 143 | if self.shaping is None: 144 | self.shaping = shaping 145 | else: 146 | reward += shaping - self.shaping 147 | self.shaping = shaping 148 | 149 | done = False 150 | success = False 151 | crash = False 152 | timeout = False 153 | if self._out_of_bounds(state): 154 | done = True 155 | crash = True 156 | if self.goal_in_reward: 157 | reward -= 100. 158 | else: 159 | reward -= 10. 160 | elif self._at_target(state) and self.goal_in_reward: 161 | done = True 162 | success = True 163 | reward += 10. 
164 | elif self.t >= MAX_NUM_STEPS: 165 | done = True 166 | timeout = True 167 | 168 | info = {'success': success, 'crash': crash, 'timeout': timeout} 169 | return self._norm_state(state), reward, done, info 170 | 171 | def render(self, mode='human'): 172 | """Render.""" 173 | if self.goal_in_reward: 174 | self.renderer.set_target(self.target, TARGET_RADIUS) 175 | self.renderer.set_camera(self.camera.camera_pos, 176 | self.camera.camera_angle) 177 | self.renderer.draw_world(self.drone.pos_i, self.drone.ori_v, 178 | DRONE_RADIUS, DRONE_LENGTH, BOUNDS[:2]) 179 | if mode == 'rgb_array': 180 | return self.renderer.get_pixels() 181 | 182 | def close(self): 183 | import pygame 184 | pygame.display.quit() 185 | self.renderer.initialized = False 186 | 187 | def state_dict(self): 188 | """Get env state.""" 189 | return {'rng': self.np_random.get_state()} 190 | 191 | def load_state_dict(self, state_dict): 192 | """Load env state.""" 193 | self.np_random.set_state(state_dict['rng']) 194 | 195 | 196 | class DroneReacherBot(DroneEnv): 197 | """Reach the target.""" 198 | 199 | goal_in_state = True 200 | goal_in_reward = True 201 | 202 | 203 | class DroneReacherHuman(DroneEnv): 204 | """Reach the target.""" 205 | 206 | goal_in_state = False 207 | goal_in_reward = True 208 | 209 | 210 | class DroneStabilizer(DroneEnv): 211 | """Don't Crash.""" 212 | 213 | goal_in_state = True 214 | goal_in_reward = False 215 | 216 | 217 | if __name__ == '__main__': 218 | import time 219 | env = DroneStabilizer() 220 | done = False 221 | env.render() 222 | while not done: 223 | ac = env.action_space.sample() 224 | ac[0] = -1.0 # np.random.uniform(0.5, 1.0) 225 | s, r, done, _ = env.step(ac) 226 | env.render() 227 | time.sleep(0.02) 228 | -------------------------------------------------------------------------------- /drone_sim/rendering.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | from pygame.locals import * 3 | import OpenGL 4 | from OpenGL.GL import * 5 | from OpenGL.GLU import * 6 | import numpy as np 7 | from scipy.spatial.transform import Rotation as R 8 | import time 9 | import sys 10 | 11 | 12 | class DroneRenderer(object): 13 | def __init__(self): 14 | self.q = gluNewQuadric() 15 | self.camera_xyz = [0., 0., 0.] 16 | self.camera_rpy = [0., 0., 0.] 
17 | self.color1 = [0.7, 0.7, 0.7] 18 | self.color2 = [0.4, 0.4, 0.4] 19 | self.ground_color1 = [0.2, 1.0, 0.2] 20 | self.ground_color2 = [1.0, 0.2, 0.2] 21 | self.target = None 22 | self.initialized = False 23 | 24 | def _init_window(self): 25 | pygame.init() 26 | display = (800, 600) 27 | pygame.display.set_mode(display, DOUBLEBUF|OPENGL) 28 | glEnable(GL_DEPTH_TEST); 29 | glMatrixMode(GL_PROJECTION) 30 | gluPerspective(45.0, display[0]/display[1], 0.1, 100.0) 31 | glMatrixMode(GL_MODELVIEW) 32 | self.initialized = True 33 | 34 | def set_camera(self, xyz, rpy): 35 | self.camera_xyz = xyz 36 | self.camera_rpy = rpy 37 | 38 | def set_target(self, xyz, radius): 39 | self.target = (xyz, radius) 40 | 41 | def camera_transform(self): 42 | x, y, z = self.camera_xyz 43 | ar, ap, ay = self.camera_rpy 44 | glRotatef(-ar,1.0,0.0,0.0) 45 | glRotatef(-ap,0.0,1.0,0.0) 46 | glRotatef(-ay,0.,0.0,1.0) 47 | glTranslatef(-x, -y, -z) 48 | 49 | def draw_target(self, xyz, radius): 50 | glLoadIdentity() 51 | self.camera_transform() 52 | glTranslatef(*xyz) 53 | glColor3fv((1.0, 0., 0.)) 54 | gluSphere(self.q, radius, 32, 32) 55 | 56 | def _apply_rpy(self, rpy): 57 | r, p, y = rpy 58 | axes = np.eye(3) 59 | axes[:2] = R.from_rotvec(y * axes[2]).apply(axes[:2]) 60 | axes[:1] = R.from_rotvec(p * axes[1]).apply(axes[:1]) 61 | glRotatef(180. / np.pi * r, axes[0,0], axes[0,1], axes[0,2]) 62 | glRotatef(180. / np.pi * p, axes[1,0], axes[1,1], axes[1,2]) 63 | glRotatef(180. / np.pi * y, axes[2,0], axes[2,1], axes[2,2]) 64 | 65 | def draw_body(self, pos, orientation, radius): 66 | glLoadIdentity() 67 | self.camera_transform() 68 | glColor3fv(self.color1) 69 | glTranslatef(pos[0], pos[1], pos[2]) 70 | # self._apply_rpy(orientation) 71 | # glTranslatef(0., 0., -radius) 72 | # gluCylinder(self.q, radius, radius, 2*radius, 32, 32) 73 | gluSphere(self.q, radius, 32, 32) 74 | 75 | def draw_arm(self, arm, pos, orientation, length, rod_radius, 76 | rotor_radius): 77 | glLoadIdentity() 78 | if arm == 0: 79 | glColor3fv(self.color2) 80 | else: 81 | glColor3fv(self.color1) 82 | self.camera_transform() 83 | glTranslatef(pos[0], pos[1], pos[2]) 84 | self._apply_rpy(orientation) 85 | glRotatef(90*arm,0.0,0.0,1.0) 86 | glRotatef(90,0.0,1.0,0.0) 87 | gluCylinder(self.q, rod_radius, rod_radius, length, 32, 32) 88 | 89 | glLoadIdentity() 90 | self.camera_transform() 91 | glTranslatef(pos[0], pos[1], pos[2]) 92 | self._apply_rpy(orientation) 93 | glRotatef(90*arm,0.0,0.0,1.0) 94 | glTranslatef(length, 0., -3*rod_radius) 95 | gluCylinder(self.q, rotor_radius, 0., 4*rod_radius, 32, 32) 96 | 97 | def draw_ground(self, boundx=100, boundy=100): 98 | glLoadIdentity() 99 | self.camera_transform() 100 | glBegin(GL_LINES) 101 | for x in range(-100, 100): 102 | if np.abs(x) >= boundx: 103 | glColor3fv(self.ground_color2) 104 | glVertex3fv((x, -100., 0.)) 105 | glVertex3fv((x, 100., 0.)) 106 | else: 107 | glColor3fv(self.ground_color2) 108 | glVertex3fv((x, -100., 0.)) 109 | glVertex3fv((x, -boundx, 0.)) 110 | glVertex3fv((x, boundx, 0.)) 111 | glVertex3fv((x, 100., 0.)) 112 | glColor3fv(self.ground_color1) 113 | glVertex3fv((x, -boundx, 0.)) 114 | glVertex3fv((x, boundx, 0.)) 115 | for y in range(-100, 100): 116 | if np.abs(y) >= boundy: 117 | glColor3fv(self.ground_color2) 118 | glVertex3fv((-100., y, 0.)) 119 | glVertex3fv((100., y, 0.)) 120 | else: 121 | glColor3fv(self.ground_color2) 122 | glVertex3fv((-100., y, 0.)) 123 | glVertex3fv((-boundx, y, 0.)) 124 | glVertex3fv((boundx, y, 0.)) 125 | glVertex3fv((100., y, 0.)) 126 | 
glColor3fv(self.ground_color1) 127 | glVertex3fv((-boundx, y, 0.)) 128 | glVertex3fv((boundx, y, 0.)) 129 | glEnd() 130 | 131 | def draw_world(self, pos, orientation, radius, length, bounds=(100, 100)): 132 | if not self.initialized: 133 | self._init_window() 134 | glClear(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT) 135 | 136 | rotor_radius = (length - radius) / 2 137 | rod_radius = radius / 10. 138 | 139 | self.draw_ground(*bounds) 140 | if self.target: 141 | self.draw_target(*self.target) 142 | for i in range(4): 143 | self.draw_arm(i, pos, orientation, length, rod_radius, rotor_radius) 144 | self.draw_body(pos, orientation, radius) 145 | pygame.display.flip() 146 | 147 | def get_pixels(self): 148 | data = glReadPixels(0, 0, 800, 600, GL_RGB, GL_UNSIGNED_BYTE) 149 | return np.frombuffer(data, dtype=np.uint8).reshape(600, 800, 3)[::-1] 150 | 151 | 152 | class DroneFollower(object): 153 | def __init__(self, offset, alpha_pos, alpha_rot): 154 | self.offset = np.array(offset) 155 | self.alpha_pos = alpha_pos 156 | self.alpha_rot = alpha_rot 157 | 158 | self.camera_pos = np.zeros(3) 159 | self.camera_angle = np.zeros(3) 160 | 161 | def _get_angle_diff(self, a1, a2): 162 | d = a2 - a1 163 | for i, dd in enumerate(d): 164 | if dd < -180.: 165 | d[i] += 360. 166 | elif dd >= 180.: 167 | d[i] -= 360. 168 | return d 169 | 170 | def _get_rotation_from_vecs(self, v1, v2): 171 | v1 = v1 / np.sqrt(np.sum(v1 ** 2)) 172 | v2 = v2 / np.sqrt(np.sum(v2 ** 2)) 173 | rot_axis = np.cross(v1, v2) 174 | if np.allclose(rot_axis, 0.): 175 | return R.from_euler('xyz', [0., 0., 0.]) 176 | rot_axis /= np.sqrt((rot_axis ** 2).sum()) 177 | angle = np.arccos(v1.dot(v2)) 178 | return R.from_rotvec(angle * rot_axis) 179 | 180 | def _look_at_drone(self, camera_pos, drone_pos, drone_ori): 181 | dir = drone_pos - camera_pos 182 | dir /= np.sqrt(np.sum(dir ** 2)) 183 | camera_start = np.array([0., 0., -1.]) 184 | rot = self._get_rotation_from_vecs(camera_start, dir) 185 | 186 | # orient camera up 187 | camera_up = rot.apply(np.array([0., 1., 0.])) 188 | z_body = self._apply_rpy(np.array([0., 0., -1.]), drone_ori) 189 | 190 | # project z_body onto dir 191 | proj = z_body.dot(dir) * dir 192 | 193 | # find vector orthogonal to dir in the plane of (z_body, dir) 194 | dir_ortho = z_body - proj 195 | dir_ortho /= np.sqrt(np.sum(dir_ortho ** 2)) 196 | 197 | # find angle between dir_ortho and rotated camera_up 198 | # and the direction to rotate it 199 | angle = np.arccos(camera_up.dot(dir_ortho)) 200 | sign = np.sign(np.cross(dir_ortho, camera_up).dot(dir)) 201 | 202 | # combine rotations for looking at drone and orienting camera 203 | rot_up = rot.from_rotvec(- sign * angle * dir) 204 | rot_tot = rot_up * rot 205 | return rot_tot.as_euler('xyz', degrees=True) 206 | 207 | def _apply_rpy(self, v, rpy): 208 | xyz = np.eye(3) 209 | yaw = R.from_rotvec(rpy[2] * xyz[2]) 210 | xyz[:2] = yaw.apply(xyz[:2]) 211 | pitch = R.from_rotvec(rpy[1] * xyz[1]) 212 | xyz[:1] = pitch.apply(xyz[:1]) 213 | roll = R.from_rotvec(rpy[0] * xyz[0]) 214 | v = yaw.apply(v) 215 | v = pitch.apply(v) 216 | v = roll.apply(v) 217 | return v 218 | 219 | def _set_camera_pose(self, drone_pos, drone_ori, alpha_pos, alpha_ori): 220 | drone_pos = np.array(drone_pos) 221 | drone_ori = np.array(drone_ori) 222 | cp = drone_pos + self._apply_rpy(self.offset, drone_ori) 223 | self.camera_pos = alpha_pos * self.camera_pos + (1.0 - alpha_pos) * cp 224 | ca = self._look_at_drone(self.camera_pos, drone_pos, drone_ori) 225 | angle_diff = self._get_angle_diff(self.camera_angle, ca) 
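# Low-pass filter the orientation: step toward the look-at angles by
# (1 - alpha_ori) of the shortest signed angular difference, then the loop
# below wraps each component back into [-180, 180). camera_pos was blended
# the same way with alpha_pos a few lines above.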
226 | self.camera_angle += (1.0 - alpha_ori) * angle_diff 227 | for i, c in enumerate(self.camera_angle): 228 | if c < -180.: 229 | self.camera_angle[i] += 360. 230 | elif c >= 180.: 231 | self.camera_angle[i] -= 360. 232 | 233 | def reset(self, drone_pos, drone_ori): 234 | self._set_camera_pose(drone_pos, drone_ori, 0.0, 0.0) 235 | return self.camera_pos, self.camera_angle 236 | 237 | def step(self, drone_pos, drone_ori): 238 | self._set_camera_pose(drone_pos, drone_ori, self.alpha_pos, 239 | self.alpha_rot) 240 | return self.camera_pos, self.camera_angle 241 | 242 | 243 | def main(): 244 | from residual_shared_autonomy.drone_sim import Drone 245 | np.random.seed(0) 246 | 247 | d = Drone(0., 0., 0., 0., 0., 0., 0., 248 | 0., 0., 0., 0., 0., 6, 0.5, 1, 2) 249 | torques = np.zeros(3) 250 | forces = np.array([0., 0., -200.]) 251 | states = [] 252 | r = DroneRenderer() 253 | f = DroneFollower([-10., 0., -10.], 0.9, 0.9) 254 | r.set_target([50., 0., 0.], 1.) 255 | cp, ca = f.reset(d.pos_i, d.ori_v) 256 | 257 | for _ in range(100): 258 | r.set_camera(cp, ca) 259 | r.draw_world(d.pos_i, d.ori_v, d.r_center, d.lengh) 260 | torques = 50 * np.random.uniform(-1., 1., (3,)) 261 | s = d.step(torques, forces, 1./50.) 262 | states.append(s) 263 | cp, ca = f.step(d.pos_i, d.ori_v) 264 | time.sleep(0.02) 265 | h_off = states[-1][2] 266 | while states[-1][2] - 1e-5 < h_off: 267 | r.set_camera(cp, ca) 268 | r.draw_world(d.pos_i, d.ori_v, d.r_center, d.lengh) 269 | s = d.step(0*torques, 0*forces, 1./50.) 270 | states.append(s) 271 | cp, ca = f.step(d.pos_i, d.ori_v) 272 | time.sleep(0.02) 273 | for _ in range(100): 274 | r.set_camera(cp, ca) 275 | r.draw_world(d.pos_i, d.ori_v, d.r_center, d.lengh) 276 | torques = 50 * np.random.uniform(-1., 1., (3,)) 277 | s = d.step(torques, forces, 1./50.) 278 | states.append(s) 279 | cp, ca = f.step(d.pos_i, d.ori_v) 280 | time.sleep(0.02) 281 | 282 | import matplotlib 283 | matplotlib.use('TkAgg') 284 | from matplotlib import pyplot as plt 285 | states = np.array(states) 286 | x = np.arange(states.shape[0]) / 50. 287 | names = ['x', 'y', 'h', 'u', 'v', 'w'] 288 | names2 = ['phi', 'theta', 'psi', 'p', 'q', 'r'] 289 | f, (ax1, ax2) = plt.subplots(1, 2) 290 | for i, n in enumerate(names): 291 | ax1.plot(x, states[:, i], label=n) 292 | ax1.legend() 293 | for i, n in enumerate(names2): 294 | ax2.plot(x, states[:, 6 + i], label=n) 295 | ax2.legend() 296 | plt.show() 297 | 298 | 299 | if __name__ == '__main__': 300 | main() 301 | -------------------------------------------------------------------------------- /ppo/constrained_residual_ppo.py: -------------------------------------------------------------------------------- 1 | """Residual Policy Learning with PPO. 
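
A base controller supplied via base_actor_cls acts through ResidualWrapper,
and PPO trains a residual correction on top of its actions. A
softplus-parameterized Lagrange multiplier (see ConstrainedResidualPPO.loss)
trades off keeping the residual small against keeping the average episode
reward above reward_threshold.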
2 | 3 | https://arxiv.org/abs/1812.06298 4 | """ 5 | from dl.rl.envs import VecEpisodeLogger 6 | from dl.rl.data_collection import RolloutDataManager 7 | from dl.rl.modules import Policy 8 | from dl.rl.util import rl_evaluate, rl_record, misc 9 | from dl import logger, Algorithm, Checkpointer 10 | from residual_shared_autonomy import ResidualWrapper 11 | import gin 12 | import numpy as np 13 | import os 14 | import torch 15 | import torch.nn as nn 16 | import torch.nn.functional as F 17 | import time 18 | 19 | 20 | class ResidualPPOActor(object): 21 | """Actor.""" 22 | 23 | def __init__(self, pi, policy_training_start): 24 | """Init.""" 25 | self.pi = pi 26 | self.policy_training_start = policy_training_start 27 | self.t = 0 28 | 29 | def __call__(self, ob, state_in=None, mask=None): 30 | """Produce decision from model.""" 31 | if self.t < self.policy_training_start: 32 | outs = self.pi(ob, state_in, mask, deterministic=True) 33 | if not torch.allclose(outs.action, torch.zeros_like(outs.action)): 34 | raise ValueError("Pi should be initialized to output zero " 35 | "actions so that an acurate value function " 36 | "can be learned for the base policy.") 37 | else: 38 | outs = self.pi(ob, state_in, mask) 39 | residual_norm = torch.mean(torch.sum(torch.abs(outs.action), dim=1)) 40 | logger.add_scalar('actor/l1_residual_norm', residual_norm, self.t, 41 | time.time()) 42 | self.t += outs.action.shape[0] 43 | data = {'action': outs.action, 44 | 'value': outs.value, 45 | 'logp': outs.dist.log_prob(outs.action)} 46 | if outs.state_out: 47 | data['state'] = outs.state_out 48 | return data 49 | 50 | def state_dict(self): 51 | """State dict.""" 52 | return {'t': self.t} 53 | 54 | def load_state_dict(self, state_dict): 55 | """Load state dict.""" 56 | self.t = state_dict['t'] 57 | 58 | 59 | @gin.configurable(blacklist=['logdir']) 60 | class ConstrainedResidualPPO(Algorithm): 61 | """Constrained Residual PPO algorithm.""" 62 | 63 | def __init__(self, 64 | logdir, 65 | env_fn, 66 | policy_fn, 67 | nenv=1, 68 | optimizer=torch.optim.Adam, 69 | lambda_lr=1e-4, 70 | lambda_init=100., 71 | lr_decay_rate=1./3.16227766017, 72 | lr_decay_freq=20000000, 73 | l2_reg=True, 74 | reward_threshold=-0.05, 75 | rollout_length=128, 76 | batch_size=32, 77 | gamma=0.99, 78 | lambda_=0.95, 79 | norm_advantages=False, 80 | epochs_per_rollout=10, 81 | max_grad_norm=None, 82 | ent_coef=0.01, 83 | vf_coef=0.5, 84 | clip_param=0.2, 85 | base_actor_cls=None, 86 | policy_training_start=10000, 87 | lambda_training_start=100000, 88 | eval_num_episodes=1, 89 | record_num_episodes=1, 90 | wrapper_fn=None, # additional wrappers for the env 91 | gpu=True): 92 | """Init.""" 93 | self.logdir = logdir 94 | self.ckptr = Checkpointer(os.path.join(logdir, 'ckpts')) 95 | self.env_fn = env_fn 96 | self.nenv = nenv 97 | self.eval_num_episodes = eval_num_episodes 98 | self.record_num_episodes = record_num_episodes 99 | self.epochs_per_rollout = epochs_per_rollout 100 | self.max_grad_norm = max_grad_norm 101 | self.ent_coef = ent_coef 102 | self.vf_coef = vf_coef 103 | self.clip_param = clip_param 104 | self.base_actor_cls = base_actor_cls 105 | self.policy_training_start = policy_training_start 106 | self.lambda_training_start = lambda_training_start 107 | self.lambda_lr = lambda_lr 108 | self.lr_decay_rate = lr_decay_rate 109 | self.lr_decay_freq = lr_decay_freq 110 | self.l2_reg = l2_reg 111 | self.reward_threshold = reward_threshold 112 | self.device = torch.device('cuda:0' if gpu and torch.cuda.is_available() 113 | else 'cpu') 114 | 115 | 
self.env = VecEpisodeLogger(env_fn(nenv=nenv)) 116 | self.env = ResidualWrapper(self.env, self.base_actor_cls(self.env)) 117 | if wrapper_fn: 118 | self.env = wrapper_fn(self.env) 119 | 120 | self.pi = policy_fn(self.env).to(self.device) 121 | self.opt = optimizer(self.pi.parameters()) 122 | self.pi_lr = self.opt.param_groups[0]['lr'] 123 | if lambda_init < 10: 124 | lambda_init = np.log(np.exp(lambda_init) - 1) 125 | self.log_lambda_ = nn.Parameter( 126 | torch.Tensor([lambda_init]).to(self.device)) 127 | self.opt_l = optimizer([self.log_lambda_], lr=lambda_lr) 128 | self._actor = ResidualPPOActor(self.pi, policy_training_start) 129 | self.data_manager = RolloutDataManager( 130 | self.env, 131 | self._actor, 132 | self.device, 133 | rollout_length=rollout_length, 134 | batch_size=batch_size, 135 | gamma=gamma, 136 | lambda_=lambda_, 137 | norm_advantages=norm_advantages) 138 | 139 | self.mse = nn.MSELoss(reduction='none') 140 | self.huber = nn.SmoothL1Loss() 141 | 142 | self.t = 0 143 | 144 | def loss(self, batch): 145 | """Compute loss.""" 146 | if self.data_manager.recurrent: 147 | outs = self.pi(batch['obs'], batch['state'], batch['mask']) 148 | else: 149 | outs = self.pi(batch['obs']) 150 | loss = {} 151 | 152 | # compute policy loss 153 | if self.t < self.policy_training_start: 154 | pi_loss = torch.Tensor([0.0]).to(self.device) 155 | else: 156 | logp = outs.dist.log_prob(batch['action']) 157 | assert logp.shape == batch['logp'].shape 158 | ratio = torch.exp(logp - batch['logp']) 159 | assert ratio.shape == batch['atarg'].shape 160 | ploss1 = ratio * batch['atarg'] 161 | ploss2 = torch.clamp(ratio, 1.0-self.clip_param, 162 | 1.0+self.clip_param) * batch['atarg'] 163 | pi_loss = -torch.min(ploss1, ploss2).mean() 164 | loss['pi'] = pi_loss 165 | 166 | # compute value loss 167 | vloss1 = 0.5 * self.mse(outs.value, batch['vtarg']) 168 | vpred_clipped = batch['vpred'] + ( 169 | outs.value - batch['vpred']).clamp(-self.clip_param, 170 | self.clip_param) 171 | vloss2 = 0.5 * self.mse(vpred_clipped, batch['vtarg']) 172 | vf_loss = torch.max(vloss1, vloss2).mean() 173 | loss['value'] = vf_loss 174 | 175 | # compute entropy loss 176 | if self.t < self.policy_training_start: 177 | ent_loss = torch.Tensor([0.0]).to(self.device) 178 | else: 179 | ent_loss = outs.dist.entropy().mean() 180 | loss['entropy'] = ent_loss 181 | 182 | # compute residual regularizer 183 | if self.t < self.policy_training_start: 184 | reg_loss = torch.Tensor([0.0]).to(self.device) 185 | else: 186 | if self.l2_reg: 187 | reg_loss = outs.dist.rsample().pow(2).sum(dim=-1).mean() 188 | else: # huber loss 189 | ac_norm = torch.norm(outs.dist.rsample(), dim=-1) 190 | reg_loss = self.huber(ac_norm, torch.zeros_like(ac_norm)) 191 | loss['reg'] = reg_loss 192 | 193 | ############################### 194 | # Constrained loss added here. 195 | ############################### 196 | 197 | # soft plus on lambda to constrain it to be positive. 
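# lambda_ is a Lagrange multiplier. Below, the policy loss is mixed as
# (reg_loss + lambda_ * pi_loss) / (1 + lambda_), so a large lambda_ favors
# the PPO surrogate (task reward) while a small lambda_ favors keeping the
# residual actions small. loss['lambda'] is the dual term: minimizing it with
# opt_l in step() raises lambda_ whenever the batch's average per-episode
# reward falls below reward_threshold and lowers it once the constraint is
# satisfied. neps counts episode boundaries from the rollout masks.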
198 | lambda_ = F.softplus(self.log_lambda_) 199 | logger.add_scalar('alg/lambda', lambda_, self.t, time.time()) 200 | logger.add_scalar('alg/lambda_', self.log_lambda_, self.t, time.time()) 201 | if self.t < max(self.policy_training_start, self.lambda_training_start): 202 | loss['lambda'] = torch.Tensor([0.0]).to(self.device) 203 | else: 204 | neps = (1.0 - batch['mask']).sum() 205 | loss['lambda'] = (lambda_ * (batch['reward'].sum() 206 | - self.reward_threshold * neps) 207 | / batch['reward'].size()[0]) 208 | if self.t >= self.policy_training_start: 209 | loss['pi'] = (reg_loss + lambda_ * loss['pi']) / (1. + lambda_) 210 | loss['total'] = (loss['pi'] + self.vf_coef * vf_loss 211 | - self.ent_coef * ent_loss) 212 | return loss 213 | 214 | def step(self): 215 | """Compute rollout, loss, and update model.""" 216 | self.pi.train() 217 | # adjust learning rate 218 | lr_frac = self.lr_decay_rate ** (self.t // self.lr_decay_freq) 219 | for g in self.opt.param_groups: 220 | g['lr'] = self.pi_lr * lr_frac 221 | for g in self.opt_l.param_groups: 222 | g['lr'] = self.lambda_lr * lr_frac 223 | 224 | self.data_manager.rollout() 225 | self.t += self.data_manager.rollout_length * self.nenv 226 | losses = {} 227 | for _ in range(self.epochs_per_rollout): 228 | for batch in self.data_manager.sampler(): 229 | loss = self.loss(batch) 230 | if losses == {}: 231 | losses = {k: [] for k in loss} 232 | for k, v in loss.items(): 233 | losses[k].append(v.detach().cpu().numpy()) 234 | if self.t >= max(self.policy_training_start, 235 | self.lambda_training_start): 236 | self.opt_l.zero_grad() 237 | loss['lambda'].backward(retain_graph=True) 238 | self.opt_l.step() 239 | self.opt.zero_grad() 240 | loss['total'].backward() 241 | if self.max_grad_norm: 242 | nn.utils.clip_grad_norm_(self.pi.parameters(), 243 | self.max_grad_norm) 244 | self.opt.step() 245 | for k, v in losses.items(): 246 | logger.add_scalar(f'loss/{k}', np.mean(v), self.t, time.time()) 247 | logger.add_scalar('alg/lr_pi', self.opt.param_groups[0]['lr'], self.t, 248 | time.time()) 249 | logger.add_scalar('alg/lr_lambda', self.opt_l.param_groups[0]['lr'], 250 | self.t, time.time()) 251 | return self.t 252 | 253 | def evaluate(self): 254 | """Evaluate model.""" 255 | self.pi.eval() 256 | misc.set_env_to_eval_mode(self.env) 257 | 258 | # Eval policy 259 | os.makedirs(os.path.join(self.logdir, 'eval'), exist_ok=True) 260 | outfile = os.path.join(self.logdir, 'eval', 261 | self.ckptr.format.format(self.t) + '.json') 262 | stats = rl_evaluate(self.env, self.pi, self.eval_num_episodes, 263 | outfile, self.device) 264 | logger.add_scalar('eval/mean_episode_reward', stats['mean_reward'], 265 | self.t, time.time()) 266 | logger.add_scalar('eval/mean_episode_length', stats['mean_length'], 267 | self.t, time.time()) 268 | 269 | # Record policy 270 | # os.makedirs(os.path.join(self.logdir, 'video'), exist_ok=True) 271 | # outfile = os.path.join(self.logdir, 'video', 272 | # self.ckptr.format.format(self.t) + '.mp4') 273 | # rl_record(self.env, self.pi, self.record_num_episodes, outfile, 274 | # self.device) 275 | 276 | self.pi.train() 277 | misc.set_env_to_train_mode(self.env) 278 | 279 | def save(self): 280 | """State dict.""" 281 | state_dict = { 282 | 'pi': self.pi.state_dict(), 283 | 'opt': self.opt.state_dict(), 284 | 'lambda_': self.log_lambda_, 285 | 'opt_l': self.opt_l.state_dict(), 286 | 'env': misc.env_state_dict(self.env), 287 | '_actor': self._actor.state_dict(), 288 | 't': self.t 289 | } 290 | self.ckptr.save(state_dict, self.t) 291 | 292 | def 
load(self, t=None): 293 | """Load state dict.""" 294 | state_dict = self.ckptr.load(t) 295 | if state_dict is None: 296 | self.t = 0 297 | return self.t 298 | self.pi.load_state_dict(state_dict['pi']) 299 | self.opt.load_state_dict(state_dict['opt']) 300 | self.opt_l.load_state_dict(state_dict['opt_l']) 301 | self.log_lambda_.data.copy_(state_dict['lambda_']) 302 | misc.env_load_state_dict(self.env, state_dict['env']) 303 | self._actor.load_state_dict(state_dict['_actor']) 304 | self.t = state_dict['t'] 305 | return self.t 306 | 307 | def close(self): 308 | """Close environment.""" 309 | try: 310 | self.env.close() 311 | except Exception: 312 | pass 313 | 314 | 315 | if __name__ == '__main__': 316 | import unittest 317 | import shutil 318 | from dl.rl.modules import ActorCriticBase 319 | from dl.rl import make_env 320 | from dl import train 321 | import residual_shared_autonomy.envs 322 | from dl.modules import DiagGaussian 323 | import torch.nn.functional as F 324 | from functools import partial 325 | 326 | class FeedForwardActorCriticBase(ActorCriticBase): 327 | """Policy and Value networks.""" 328 | 329 | def build(self): 330 | """Build.""" 331 | inshape = (self.observation_space.spaces[0].shape[0] 332 | + self.observation_space.spaces[1].shape[0]) 333 | self.fc1 = nn.Linear(inshape, 32) 334 | self.fc2 = nn.Linear(32, 32) 335 | self.fc3 = nn.Linear(32, 32) 336 | self.dist = DiagGaussian(32, self.action_space.shape[0]) 337 | for p in self.dist.fc_mean.parameters(): 338 | nn.init.constant_(p, 0.) 339 | 340 | self.vf_fc1 = nn.Linear(inshape, 32) 341 | self.vf_fc2 = nn.Linear(32, 32) 342 | self.vf_fc3 = nn.Linear(32, 32) 343 | self.vf_out = nn.Linear(32, 1) 344 | 345 | def forward(self, x): 346 | """Forward.""" 347 | ob = torch.cat(x, axis=1) 348 | net = ob 349 | net = F.relu(self.fc1(net)) 350 | net = F.relu(self.fc2(net)) 351 | net = F.relu(self.fc3(net)) 352 | pi = self.dist(net) 353 | 354 | net = ob 355 | net = F.relu(self.vf_fc1(net)) 356 | net = F.relu(self.vf_fc2(net)) 357 | net = F.relu(self.vf_fc3(net)) 358 | vf = self.vf_out(net) 359 | 360 | return pi, vf 361 | 362 | class RandomActor(object): 363 | """Output random actions.""" 364 | 365 | def __init__(self, env): 366 | """Init.""" 367 | self.action_space = env.action_space 368 | 369 | def __call__(self, ob): 370 | """Act.""" 371 | batch_size = ob.shape[0] 372 | return np.asarray([self.action_space.sample() 373 | for _ in range(batch_size)]) 374 | 375 | class TestResidualPPO(unittest.TestCase): 376 | """Test case.""" 377 | 378 | def test_feed_forward_ppo(self): 379 | """Test feed forward ppo.""" 380 | def env_fn(nenv): 381 | return make_env(env_id="LunarLanderRandomConstrained-v2", 382 | nenv=nenv) 383 | 384 | def policy_fn(env): 385 | return Policy(FeedForwardActorCriticBase(env.observation_space, 386 | env.action_space)) 387 | 388 | ppo = partial(ConstrainedResidualPPO, 389 | env_fn=env_fn, 390 | nenv=32, 391 | policy_fn=policy_fn, 392 | base_actor_cls=RandomActor, 393 | policy_training_start=500) 394 | train('test', ppo, maxt=1000, eval=True, eval_period=1000) 395 | alg = ppo('test') 396 | alg.load() 397 | shutil.rmtree('test') 398 | 399 | unittest.main() 400 | -------------------------------------------------------------------------------- /lunar_lander/env_reacher.py: -------------------------------------------------------------------------------- 1 | import sys, math 2 | import numpy as np 3 | 4 | import Box2D 5 | from Box2D.b2 import (edgeShape, circleShape, fixtureDef, polygonShape, revoluteJointDef, contactListener) 6 | 7 
| import gym 8 | from gym import spaces 9 | from gym.utils import seeding, EzPickle 10 | 11 | # Rocket trajectory optimization is a classic topic in Optimal Control. 12 | # 13 | # According to Pontryagin's maximum principle it's optimal to fire engine full throttle or 14 | # turn it off. That's the reason this environment is OK to have discreet actions (engine on or off). 15 | # 16 | # Landing pad is always at coordinates (0,0). Coordinates are the first two numbers in state vector. 17 | # Reward for moving from the top of the screen to landing pad and zero speed is about 100..140 points. 18 | # If lander moves away from landing pad it loses reward back. Episode finishes if the lander crashes or 19 | # comes to rest, receiving additional -100 or +100 points. Each leg ground contact is +10. Firing main 20 | # engine is -0.3 points each frame. Firing side engine is -0.03 points each frame. Solved is 200 points. 21 | # 22 | # Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land 23 | # on its first attempt. Please see source code for details. 24 | # 25 | # To see heuristic landing, run: 26 | # 27 | # python gym/envs/box2d/lunar_lander.py 28 | # 29 | # To play yourself, run: 30 | # 31 | # python examples/agents/keyboard_agent.py LunarLander-v2 32 | # 33 | # Created by Oleg Klimov. Licensed on the same terms as the rest of OpenAI Gym. 34 | 35 | FPS = 50 36 | SCALE = 30.0 # affects how fast-paced the game is, forces should be adjusted as well 37 | 38 | N_OBS_DIM = 9 39 | MAX_NUM_STEPS = 1000 40 | 41 | MAIN_ENGINE_POWER = 13.0 42 | SIDE_ENGINE_POWER = 0.6 43 | 44 | INITIAL_RANDOM = 1000.0 # Set 1500 to make game harder 45 | 46 | LANDER_POLY =[ 47 | (-14,+17), (-17,0), (-17,-10), 48 | (+17,-10), (+17,0), (+14,+17) 49 | ] 50 | LEG_AWAY = 20 51 | LEG_DOWN = 18 52 | LEG_W, LEG_H = 2, 8 53 | LEG_SPRING_TORQUE = 40 54 | TARGET_R = 25 55 | 56 | SIDE_ENGINE_HEIGHT = 14.0 57 | SIDE_ENGINE_AWAY = 12.0 58 | 59 | VIEWPORT_W = 600 60 | VIEWPORT_H = 400 61 | 62 | class ContactDetector(contactListener): 63 | def __init__(self, env): 64 | contactListener.__init__(self) 65 | self.env = env 66 | def BeginContact(self, contact): 67 | if self.env.lander==contact.fixtureA.body or self.env.lander==contact.fixtureB.body: 68 | self.env.game_over = True 69 | for i in range(2): 70 | if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]: 71 | self.env.legs[i].ground_contact = True 72 | def EndContact(self, contact): 73 | for i in range(2): 74 | if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]: 75 | self.env.legs[i].ground_contact = False 76 | 77 | class LunarLander(gym.Env, EzPickle): 78 | metadata = { 79 | 'render.modes': ['human', 'rgb_array'], 80 | 'video.frames_per_second' : FPS 81 | } 82 | 83 | continuous = False 84 | terrain_in_ob = True 85 | human_test = False 86 | 87 | def __init__(self): 88 | EzPickle.__init__(self) 89 | self.seed() 90 | self.viewer = None 91 | 92 | self.world = Box2D.b2World() 93 | self.moon = None 94 | self.lander = None 95 | self.particles = [] 96 | 97 | self.prev_reward = None 98 | self.curr_step = None 99 | 100 | # useful range is -1 .. +1, but spikes can be higher 101 | ob_dim = N_OBS_DIM + 22 if self.terrain_in_ob else N_OBS_DIM 102 | self.observation_space = spaces.Box(-np.inf, np.inf, shape=(ob_dim,), dtype=np.float32) 103 | 104 | if self.continuous: 105 | # Action is two floats [main engine, left-right engines]. 106 | # Main engine: -1..0 off, 0..+1 throttle from 50% to 100% power. 
Engine can't work with less than 50% power. 107 | # Left-right: -1.0..-0.5 fire left engine, +0.5..+1.0 fire right engine, -0.5..0.5 off 108 | self.action_space = spaces.Box(-1, +1, (2,), dtype=np.float32) 109 | else: 110 | # Nop, fire left engine, main engine, right engine 111 | self.action_space = spaces.Discrete(4) 112 | 113 | self.reset() 114 | 115 | def seed(self, seed=None): 116 | self.np_random, seed = seeding.np_random(seed) 117 | return [seed] 118 | 119 | def _destroy(self): 120 | if not self.moon: return 121 | self.world.contactListener = None 122 | self._clean_particles(True) 123 | self.world.DestroyBody(self.moon) 124 | self.moon = None 125 | self.world.DestroyBody(self.lander) 126 | self.lander = None 127 | self.world.DestroyBody(self.legs[0]) 128 | self.world.DestroyBody(self.legs[1]) 129 | 130 | def reset(self): 131 | self._destroy() 132 | self.world.contactListener_keepref = ContactDetector(self) 133 | self.world.contactListener = self.world.contactListener_keepref 134 | self.game_over = False 135 | self.prev_shaping = None 136 | self.curr_step = 0 137 | 138 | W = VIEWPORT_W/SCALE 139 | H = VIEWPORT_H/SCALE 140 | 141 | # terrain 142 | CHUNKS = 11 143 | height = self.np_random.uniform(0, H/2, size=(CHUNKS+1,) ) 144 | chunk_x = [W/(CHUNKS-1)*i for i in range(CHUNKS)] 145 | # randomize helipad x coord 146 | helipad_chunk = np.random.choice(range(1, CHUNKS-1)) 147 | 148 | self.helipad_x1 = chunk_x[helipad_chunk-1] 149 | self.helipad_x2 = chunk_x[helipad_chunk+1] 150 | self.helipad_y = H/4 151 | height[helipad_chunk-2] = self.helipad_y 152 | height[helipad_chunk-1] = self.helipad_y 153 | height[helipad_chunk+0] = self.helipad_y 154 | height[helipad_chunk+1] = self.helipad_y 155 | height[helipad_chunk+2] = self.helipad_y 156 | smooth_y = [0.33*(height[i-1] + height[i+0] + height[i+1]) for i in range(CHUNKS)] 157 | self.terrain_x = chunk_x 158 | self.terrain_y = smooth_y 159 | 160 | # target 161 | radius = TARGET_R / SCALE 162 | # exclude middle half of the screen 163 | self.target_cx = self.np_random.uniform(radius, W/2. - radius) 164 | if self.target_cx >= W / 4.: 165 | self.target_cx += W / 2. 
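# After the shift above, target_cx lies in [radius, W/4) or [3W/4, W - radius),
# keeping the target away from the screen centre where the lander spawns
# (x = VIEWPORT_W/SCALE/2). The next line places target_cy in the upper half
# of the screen.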
166 | self.target_cy = self.np_random.uniform(H/2 + radius, H - radius) 167 | 168 | self.moon = self.world.CreateStaticBody( shapes=edgeShape(vertices=[(0, 0), (W, 0)]) ) 169 | self.sky_polys = [] 170 | for i in range(CHUNKS-1): 171 | p1 = (chunk_x[i], smooth_y[i]) 172 | p2 = (chunk_x[i+1], smooth_y[i+1]) 173 | self.moon.CreateEdgeFixture( 174 | vertices=[p1,p2], 175 | density=0, 176 | friction=0.1) 177 | self.sky_polys.append( [p1, p2, (p2[0],H), (p1[0],H)] ) 178 | 179 | self.moon.color1 = (0.0,0.0,0.0) 180 | self.moon.color2 = (0.0,0.0,0.0) 181 | 182 | initial_y = VIEWPORT_H/SCALE 183 | self.lander = self.world.CreateDynamicBody( 184 | position = (VIEWPORT_W/SCALE/2, initial_y), 185 | angle=0.0, 186 | fixtures = fixtureDef( 187 | shape=polygonShape(vertices=[ (x/SCALE,y/SCALE) for x,y in LANDER_POLY ]), 188 | density=5.0, 189 | friction=0.1, 190 | categoryBits=0x0010, 191 | maskBits=0x001, # collide only with ground 192 | restitution=0.0) # 0.99 bouncy 193 | ) 194 | self.lander.color1 = (0.5,0.4,0.9) 195 | self.lander.color2 = (0.3,0.3,0.5) 196 | self.lander.ApplyForceToCenter( ( 197 | self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM), 198 | self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM) 199 | ), True) 200 | 201 | self.legs = [] 202 | for i in [-1,+1]: 203 | leg = self.world.CreateDynamicBody( 204 | position = (VIEWPORT_W/SCALE/2 - i*LEG_AWAY/SCALE, initial_y), 205 | angle = (i*0.05), 206 | fixtures = fixtureDef( 207 | shape=polygonShape(box=(LEG_W/SCALE, LEG_H/SCALE)), 208 | density=1.0, 209 | restitution=0.0, 210 | categoryBits=0x0020, 211 | maskBits=0x001) 212 | ) 213 | leg.ground_contact = False 214 | leg.color1 = (0.5,0.4,0.9) 215 | leg.color2 = (0.3,0.3,0.5) 216 | rjd = revoluteJointDef( 217 | bodyA=self.lander, 218 | bodyB=leg, 219 | localAnchorA=(0, 0), 220 | localAnchorB=(i*LEG_AWAY/SCALE, LEG_DOWN/SCALE), 221 | enableMotor=True, 222 | enableLimit=True, 223 | maxMotorTorque=LEG_SPRING_TORQUE, 224 | motorSpeed=+0.3*i # low enough not to jump back into the sky 225 | ) 226 | if i==-1: 227 | rjd.lowerAngle = +0.9 - 0.5 # Yes, the most esoteric numbers here, angles legs have freedom to travel within 228 | rjd.upperAngle = +0.9 229 | else: 230 | rjd.lowerAngle = -0.9 231 | rjd.upperAngle = -0.9 + 0.5 232 | leg.joint = self.world.CreateJoint(rjd) 233 | self.legs.append(leg) 234 | 235 | self.drawlist = [self.lander] + self.legs 236 | 237 | return self.step(np.array([0,0]) if self.continuous else 0)[0] 238 | 239 | def _create_particle(self, mass, x, y, ttl): 240 | p = self.world.CreateDynamicBody( 241 | position = (x,y), 242 | angle=0.0, 243 | fixtures = fixtureDef( 244 | shape=circleShape(radius=2/SCALE, pos=(0,0)), 245 | density=mass, 246 | friction=0.1, 247 | categoryBits=0x0100, 248 | maskBits=0x001, # collide only with ground 249 | restitution=0.3) 250 | ) 251 | p.ttl = ttl 252 | self.particles.append(p) 253 | self._clean_particles(False) 254 | return p 255 | 256 | def _clean_particles(self, all): 257 | while self.particles and (all or self.particles[0].ttl<0): 258 | self.world.DestroyBody(self.particles.pop(0)) 259 | 260 | def step(self, action): 261 | if self.continuous: 262 | action = np.clip(action, -1, +1).astype(np.float32) 263 | else: 264 | assert self.action_space.contains(action), "%r (%s) invalid " % (action, type(action)) 265 | 266 | # Engines 267 | tip = (math.sin(self.lander.angle), math.cos(self.lander.angle)) 268 | side = (-tip[1], tip[0]); 269 | dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)] 270 | 271 | m_power = 0.0 272 | 
if (self.continuous and action[0] > 0.0) or (not self.continuous and action==2): 273 | # Main engine 274 | if self.continuous: 275 | m_power = (np.clip(action[0], 0.0,1.0) + 1.0)*0.5 # 0.5..1.0 276 | assert m_power>=0.5 and m_power <= 1.0 277 | else: 278 | m_power = 1.0 279 | ox = tip[0]*(4/SCALE + 2*dispersion[0]) + side[0]*dispersion[1] # 4 is move a bit downwards, +-2 for randomness 280 | oy = -tip[1]*(4/SCALE + 2*dispersion[0]) - side[1]*dispersion[1] 281 | impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy) 282 | p = self._create_particle(3.5, impulse_pos[0], impulse_pos[1], m_power) # particles are just a decoration, 3.5 is here to make particle speed adequate 283 | p.ApplyLinearImpulse( ( ox*MAIN_ENGINE_POWER*m_power, oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) 284 | self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER*m_power, -oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) 285 | 286 | s_power = 0.0 287 | if (self.continuous and np.abs(action[1]) > 0.5) or (not self.continuous and action in [1,3]): 288 | # Orientation engines 289 | if self.continuous: 290 | direction = np.sign(action[1]) 291 | s_power = np.clip(np.abs(action[1]), 0.5,1.0) 292 | assert s_power>=0.5 and s_power <= 1.0 293 | else: 294 | direction = action-2 295 | s_power = 1.0 296 | ox = tip[0]*dispersion[0] + side[0]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) 297 | oy = -tip[1]*dispersion[0] - side[1]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) 298 | impulse_pos = (self.lander.position[0] + ox - tip[0]*17/SCALE, self.lander.position[1] + oy + tip[1]*SIDE_ENGINE_HEIGHT/SCALE) 299 | p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power) 300 | p.ApplyLinearImpulse( ( ox*SIDE_ENGINE_POWER*s_power, oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) 301 | self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER*s_power, -oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) 302 | 303 | self.world.Step(1.0/FPS, 6*30, 2*30) 304 | 305 | pos = self.lander.position 306 | vel = self.lander.linearVelocity 307 | helipad_x = (self.helipad_x1 + self.helipad_x2) / 2 308 | state = [ 309 | (pos.x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2), 310 | (pos.y - (self.helipad_y+LEG_DOWN/SCALE)) / (VIEWPORT_H/SCALE/2), 311 | vel.x*(VIEWPORT_W/SCALE/2)/FPS, 312 | vel.y*(VIEWPORT_H/SCALE/2)/FPS, 313 | self.lander.angle, 314 | 20.0*self.lander.angularVelocity/FPS, 315 | 1.0 if self.legs[0].ground_contact else 0.0, 316 | 1.0 if self.legs[1].ground_contact else 0.0, 317 | (helipad_x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2) 318 | ] 319 | if self.terrain_in_ob: 320 | for x, y in zip(self.terrain_x, self.terrain_y): 321 | state.append((x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2)) 322 | state.append((y - (self.helipad_y+LEG_DOWN/SCALE)) / (VIEWPORT_H/SCALE/2)) 323 | assert len(state)==N_OBS_DIM + 22 * self.terrain_in_ob 324 | self.curr_step += 1 325 | 326 | reward = 0 327 | dx = pos.x - self.target_cx 328 | dy = pos.y - self.target_cy 329 | shaping = \ 330 | - 100*np.sqrt(dx*dx + dy*dy)/2 \ 331 | - 100*np.sqrt(state[2]*state[2] + state[3]*state[3]) \ 332 | - 100*abs(state[4]) + 10*state[6] + 10*state[7] # And ten points for legs contact, the idea is if you 333 | # lose contact again after landing, you get negative reward 334 | 335 | if self.prev_shaping is not None: 336 | reward = shaping - self.prev_shaping 337 | self.prev_shaping = shaping 338 | 339 | reward -= m_power*0.30 # less fuel spent is better, about -30 for heurisic landing 340 | reward -= s_power*0.03 341 | 342 | oob = 
abs(state[0]) >= 1.0 343 | timeout = self.curr_step >= MAX_NUM_STEPS 344 | at_target = np.sqrt(dx*dx + dy*dy) < TARGET_R/SCALE 345 | 346 | done = self.game_over or oob or at_target or timeout 347 | success = False 348 | crash = False 349 | if done: 350 | if self.game_over or oob: 351 | reward -= 100 352 | crash = True 353 | self.lander.color1 = (1.0,0.0,0.0) 354 | elif at_target: 355 | success = True 356 | reward += 100 357 | self.lander.color1 = (0.0,1.0,0.0) 358 | if self.human_test: 359 | self.render() 360 | import time 361 | time.sleep(1.0) 362 | 363 | info = {'success': success, 'crash': crash} 364 | return np.array(state, dtype=np.float32), reward, done, info 365 | 366 | def render(self, mode='human'): 367 | from gym.envs.classic_control import rendering 368 | if self.viewer is None: 369 | self.viewer = rendering.Viewer(VIEWPORT_W, VIEWPORT_H) 370 | self.viewer.set_bounds(0, VIEWPORT_W/SCALE, 0, VIEWPORT_H/SCALE) 371 | 372 | for obj in self.particles: 373 | obj.ttl -= 0.15 374 | obj.color1 = (max(0.2,0.2+obj.ttl), max(0.2,0.5*obj.ttl), max(0.2,0.5*obj.ttl)) 375 | obj.color2 = (max(0.2,0.2+obj.ttl), max(0.2,0.5*obj.ttl), max(0.2,0.5*obj.ttl)) 376 | 377 | self._clean_particles(False) 378 | 379 | for p in self.sky_polys: 380 | self.viewer.draw_polygon(p, color=(0,0,0)) 381 | 382 | for obj in self.particles + self.drawlist: 383 | for f in obj.fixtures: 384 | trans = f.body.transform 385 | if type(f.shape) is circleShape: 386 | t = rendering.Transform(translation=trans*f.shape.pos) 387 | self.viewer.draw_circle(f.shape.radius, 20, color=obj.color1).add_attr(t) 388 | self.viewer.draw_circle(f.shape.radius, 20, color=obj.color2, filled=False, linewidth=2).add_attr(t) 389 | else: 390 | path = [trans*v for v in f.shape.vertices] 391 | self.viewer.draw_polygon(path, color=obj.color1) 392 | path.append(path[0]) 393 | self.viewer.draw_polyline(path, color=obj.color2, linewidth=2) 394 | 395 | # draw target 396 | t = rendering.Transform(translation=np.array([self.target_cx, self.target_cy])) 397 | self.viewer.draw_circle(TARGET_R / SCALE, 20, color=[255, 0, 0], filled=False, linewidth=5).add_attr(t) 398 | 399 | return self.viewer.render(return_rgb_array = mode=='rgb_array') 400 | 401 | def close(self): 402 | if self.viewer is not None: 403 | self.viewer.close() 404 | self.viewer = None 405 | 406 | def state_dict(self): 407 | return {'rng': self.np_random.get_state()} 408 | 409 | def load_state_dict(self, state_dict): 410 | self.np_random.set_state(state_dict['rng']) 411 | 412 | 413 | class LunarLanderReacher(LunarLander): 414 | continuous = True 415 | 416 | def heuristic(env, s): 417 | # Heuristic for: 418 | # 1. Testing. 419 | # 2. Demonstration rollout. 
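# angle_targ is a saturated proportional target built from horizontal position
# and velocity; angle_todo and hover_todo below are PD-style corrections. In
# the continuous case they are scaled by 20 and clipped to [-1, 1], so the
# main engine (which needs action[0] > 0) only fires when hover_todo exceeds
# 0.05, matching the 0.05 thresholds used for the discrete actions.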
420 | angle_targ = s[0]*0.5 + s[2]*1.0 # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed) 421 | if angle_targ > 0.4: angle_targ = 0.4 # more than 0.4 radians (22 degrees) is bad 422 | if angle_targ < -0.4: angle_targ = -0.4 423 | hover_targ = 0.55*np.abs(s[0]) # target y should be proporional to horizontal offset 424 | 425 | # PID controller: s[4] angle, s[5] angularSpeed 426 | angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0 427 | #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo)) 428 | 429 | # PID controller: s[1] vertical coordinate s[3] vertical speed 430 | hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5 431 | #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo)) 432 | 433 | if s[6] or s[7]: # legs have contact 434 | angle_todo = 0 435 | hover_todo = -(s[3])*0.5 # override to reduce fall speed, that's all we need after contact 436 | 437 | if env.continuous: 438 | a = np.array( [hover_todo*20 - 1, -angle_todo*20] ) 439 | a = np.clip(a, -1, +1) 440 | else: 441 | a = 0 442 | if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2 443 | elif angle_todo < -0.05: a = 3 444 | elif angle_todo > +0.05: a = 1 445 | return a 446 | 447 | def demo_heuristic_lander(env, seed=None, render=False): 448 | env.seed(seed) 449 | total_reward = 0 450 | steps = 0 451 | s = env.reset() 452 | while True: 453 | a = heuristic(env, s) 454 | s, r, done, info = env.step(a) 455 | total_reward += r 456 | 457 | if render: 458 | still_open = env.render() 459 | if still_open == False: break 460 | 461 | if steps % 20 == 0 or done: 462 | print("observations:", " ".join(["{:+0.2f}".format(x) for x in s])) 463 | print("step {} total_reward {:+0.2f}".format(steps, total_reward)) 464 | steps += 1 465 | if done: break 466 | return total_reward 467 | 468 | 469 | if __name__ == '__main__': 470 | demo_heuristic_lander(LunarLander(), render=True) 471 | -------------------------------------------------------------------------------- /lunar_lander/env_lander.py: -------------------------------------------------------------------------------- 1 | import sys, math 2 | import numpy as np 3 | 4 | import Box2D 5 | from Box2D.b2 import (edgeShape, circleShape, fixtureDef, polygonShape, revoluteJointDef, contactListener) 6 | 7 | import gym 8 | from gym import spaces 9 | from gym.utils import seeding, EzPickle 10 | 11 | # Rocket trajectory optimization is a classic topic in Optimal Control. 12 | # 13 | # According to Pontryagin's maximum principle it's optimal to fire engine full throttle or 14 | # turn it off. That's the reason this environment is OK to have discreet actions (engine on or off). 15 | # 16 | # Landing pad is always at coordinates (0,0). Coordinates are the first two numbers in state vector. 17 | # Reward for moving from the top of the screen to landing pad and zero speed is about 100..140 points. 18 | # If lander moves away from landing pad it loses reward back. Episode finishes if the lander crashes or 19 | # comes to rest, receiving additional -100 or +100 points. Each leg ground contact is +10. Firing main 20 | # engine is -0.3 points each frame. Firing side engine is -0.03 points each frame. Solved is 200 points. 21 | # 22 | # Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land 23 | # on its first attempt. Please see source code for details. 
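# NOTE: the comment block above comes from the upstream Gym LunarLander that
# this file adapts. This variant additionally defines a goal_seeking class
# flag, explicit fuel-cost constants (FUEL_COST_MAIN, FUEL_COST_SIDE), and a
# landed-detection window (GROUND_SLACK, LANDED_SLACK) used in step() to end
# an episode once the lander has settled on the pad.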
24 | # 25 | # To see heuristic landing, run: 26 | # 27 | # python gym/envs/box2d/lunar_lander.py 28 | # 29 | # To play yourself, run: 30 | # 31 | # python examples/agents/keyboard_agent.py LunarLander-v2 32 | # 33 | # Created by Oleg Klimov. Licensed on the same terms as the rest of OpenAI Gym. 34 | 35 | FPS = 50 36 | SCALE = 30.0 # affects how fast-paced the game is, forces should be adjusted as well 37 | 38 | N_OBS_DIM = 9 39 | MAX_NUM_STEPS = 1000 40 | GROUND_SLACK = 0.01 41 | LANDED_SLACK = 50 42 | 43 | MAIN_ENGINE_POWER = 13.0 44 | SIDE_ENGINE_POWER = 0.6 45 | 46 | INITIAL_RANDOM = 1000.0 # Set 1500 to make game harder 47 | 48 | LANDER_POLY =[ 49 | (-14,+17), (-17,0), (-17,-10), 50 | (+17,-10), (+17,0), (+14,+17) 51 | ] 52 | LEG_AWAY = 20 53 | LEG_DOWN = 18 54 | LEG_W, LEG_H = 2, 8 55 | LEG_SPRING_TORQUE = 40 56 | 57 | SIDE_ENGINE_HEIGHT = 14.0 58 | SIDE_ENGINE_AWAY = 12.0 59 | 60 | VIEWPORT_W = 600 61 | VIEWPORT_H = 400 62 | 63 | FUEL_COST_MAIN = 0.3 64 | FUEL_COST_SIDE = 0.03 65 | 66 | class ContactDetector(contactListener): 67 | def __init__(self, env): 68 | contactListener.__init__(self) 69 | self.env = env 70 | def BeginContact(self, contact): 71 | if self.env.lander==contact.fixtureA.body or self.env.lander==contact.fixtureB.body: 72 | self.env.game_over = True 73 | for i in range(2): 74 | if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]: 75 | self.env.legs[i].ground_contact = True 76 | def EndContact(self, contact): 77 | for i in range(2): 78 | if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]: 79 | self.env.legs[i].ground_contact = False 80 | 81 | class LunarLander(gym.Env, EzPickle): 82 | metadata = { 83 | 'render.modes': ['human', 'rgb_array'], 84 | 'video.frames_per_second' : FPS 85 | } 86 | 87 | continuous = False 88 | goal_seeking = True 89 | terrain_in_ob = True 90 | human_test = False 91 | 92 | def __init__(self): 93 | EzPickle.__init__(self) 94 | self.seed() 95 | self.viewer = None 96 | 97 | self.world = Box2D.b2World() 98 | self.moon = None 99 | self.lander = None 100 | self.particles = [] 101 | 102 | self.prev_reward = None 103 | self.curr_step = None 104 | self.at_site = None 105 | self.grounded = None 106 | 107 | # useful range is -1 .. +1, but spikes can be higher 108 | ob_dim = N_OBS_DIM + 22 if self.terrain_in_ob else N_OBS_DIM 109 | self.observation_space = spaces.Box(-np.inf, np.inf, shape=(ob_dim,), dtype=np.float32) 110 | 111 | if self.continuous: 112 | # Action is two floats [main engine, left-right engines]. 113 | # Main engine: -1..0 off, 0..+1 throttle from 50% to 100% power. Engine can't work with less than 50% power. 
114 | # Left-right: -1.0..-0.5 fire left engine, +0.5..+1.0 fire right engine, -0.5..0.5 off 115 | self.action_space = spaces.Box(-1, +1, (2,), dtype=np.float32) 116 | else: 117 | # Nop, fire left engine, main engine, right engine 118 | self.action_space = spaces.Discrete(4) 119 | 120 | self.reset() 121 | 122 | def seed(self, seed=None): 123 | self.np_random, seed = seeding.np_random(seed) 124 | return [seed] 125 | 126 | def _destroy(self): 127 | if not self.moon: return 128 | self.world.contactListener = None 129 | self._clean_particles(True) 130 | self.world.DestroyBody(self.moon) 131 | self.moon = None 132 | self.world.DestroyBody(self.lander) 133 | self.lander = None 134 | self.world.DestroyBody(self.legs[0]) 135 | self.world.DestroyBody(self.legs[1]) 136 | 137 | def reset(self): 138 | self._destroy() 139 | self.world.contactListener_keepref = ContactDetector(self) 140 | self.world.contactListener = self.world.contactListener_keepref 141 | self.game_over = False 142 | self.prev_shaping = None 143 | self.curr_step = 0 144 | self.at_site = [False for _ in range(LANDED_SLACK)] 145 | self.grounded = [False for _ in range(LANDED_SLACK)] 146 | 147 | W = VIEWPORT_W/SCALE 148 | H = VIEWPORT_H/SCALE 149 | 150 | # terrain 151 | CHUNKS = 11 152 | height = self.np_random.uniform(0, H/2, size=(CHUNKS+1,) ) 153 | chunk_x = [W/(CHUNKS-1)*i for i in range(CHUNKS)] 154 | # randomize helipad x coord 155 | helipad_chunk = np.random.choice(range(1, CHUNKS-1)) 156 | 157 | self.helipad_x1 = chunk_x[helipad_chunk-1] 158 | self.helipad_x2 = chunk_x[helipad_chunk+1] 159 | self.helipad_y = H/4 160 | height[helipad_chunk-2] = self.helipad_y 161 | height[helipad_chunk-1] = self.helipad_y 162 | height[helipad_chunk+0] = self.helipad_y 163 | height[helipad_chunk+1] = self.helipad_y 164 | height[helipad_chunk+2] = self.helipad_y 165 | smooth_y = [0.33*(height[i-1] + height[i+0] + height[i+1]) for i in range(CHUNKS)] 166 | self.terrain_x = chunk_x 167 | self.terrain_y = smooth_y 168 | 169 | self.moon = self.world.CreateStaticBody( shapes=edgeShape(vertices=[(0, 0), (W, 0)]) ) 170 | self.sky_polys = [] 171 | for i in range(CHUNKS-1): 172 | p1 = (chunk_x[i], smooth_y[i]) 173 | p2 = (chunk_x[i+1], smooth_y[i+1]) 174 | self.moon.CreateEdgeFixture( 175 | vertices=[p1,p2], 176 | density=0, 177 | friction=0.1) 178 | self.sky_polys.append( [p1, p2, (p2[0],H), (p1[0],H)] ) 179 | 180 | self.moon.color1 = (0.0,0.0,0.0) 181 | self.moon.color2 = (0.0,0.0,0.0) 182 | 183 | initial_y = VIEWPORT_H/SCALE 184 | self.lander = self.world.CreateDynamicBody( 185 | position = (VIEWPORT_W/SCALE/2, initial_y), 186 | angle=0.0, 187 | fixtures = fixtureDef( 188 | shape=polygonShape(vertices=[ (x/SCALE,y/SCALE) for x,y in LANDER_POLY ]), 189 | density=5.0, 190 | friction=0.1, 191 | categoryBits=0x0010, 192 | maskBits=0x001, # collide only with ground 193 | restitution=0.0) # 0.99 bouncy 194 | ) 195 | self.lander.color1 = (0.5,0.4,0.9) 196 | self.lander.color2 = (0.3,0.3,0.5) 197 | self.lander.ApplyForceToCenter( ( 198 | self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM), 199 | self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM) 200 | ), True) 201 | 202 | self.legs = [] 203 | for i in [-1,+1]: 204 | leg = self.world.CreateDynamicBody( 205 | position = (VIEWPORT_W/SCALE/2 - i*LEG_AWAY/SCALE, initial_y), 206 | angle = (i*0.05), 207 | fixtures = fixtureDef( 208 | shape=polygonShape(box=(LEG_W/SCALE, LEG_H/SCALE)), 209 | density=1.0, 210 | restitution=0.0, 211 | categoryBits=0x0020, 212 | maskBits=0x001) 213 | ) 214 | leg.ground_contact = 
False 215 | leg.color1 = (0.5,0.4,0.9) 216 | leg.color2 = (0.3,0.3,0.5) 217 | rjd = revoluteJointDef( 218 | bodyA=self.lander, 219 | bodyB=leg, 220 | localAnchorA=(0, 0), 221 | localAnchorB=(i*LEG_AWAY/SCALE, LEG_DOWN/SCALE), 222 | enableMotor=True, 223 | enableLimit=True, 224 | maxMotorTorque=LEG_SPRING_TORQUE, 225 | motorSpeed=+0.3*i # low enough not to jump back into the sky 226 | ) 227 | if i==-1: 228 | rjd.lowerAngle = +0.9 - 0.5 # Yes, the most esoteric numbers here, angles legs have freedom to travel within 229 | rjd.upperAngle = +0.9 230 | else: 231 | rjd.lowerAngle = -0.9 232 | rjd.upperAngle = -0.9 + 0.5 233 | leg.joint = self.world.CreateJoint(rjd) 234 | self.legs.append(leg) 235 | 236 | self.drawlist = [self.lander] + self.legs 237 | 238 | return self.step(np.array([0,0]) if self.continuous else 0)[0] 239 | 240 | def _create_particle(self, mass, x, y, ttl): 241 | p = self.world.CreateDynamicBody( 242 | position = (x,y), 243 | angle=0.0, 244 | fixtures = fixtureDef( 245 | shape=circleShape(radius=2/SCALE, pos=(0,0)), 246 | density=mass, 247 | friction=0.1, 248 | categoryBits=0x0100, 249 | maskBits=0x001, # collide only with ground 250 | restitution=0.3) 251 | ) 252 | p.ttl = ttl 253 | self.particles.append(p) 254 | self._clean_particles(False) 255 | return p 256 | 257 | def _clean_particles(self, all): 258 | while self.particles and (all or self.particles[0].ttl<0): 259 | self.world.DestroyBody(self.particles.pop(0)) 260 | 261 | def step(self, action): 262 | if self.continuous: 263 | action = np.clip(action, -1, +1).astype(np.float32) 264 | else: 265 | assert self.action_space.contains(action), "%r (%s) invalid " % (action, type(action)) 266 | 267 | # Engines 268 | tip = (math.sin(self.lander.angle), math.cos(self.lander.angle)) 269 | side = (-tip[1], tip[0]); 270 | dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)] 271 | 272 | m_power = 0.0 273 | if (self.continuous and action[0] > 0.0) or (not self.continuous and action==2): 274 | # Main engine 275 | if self.continuous: 276 | m_power = (np.clip(action[0], 0.0,1.0) + 1.0)*0.5 # 0.5..1.0 277 | assert m_power>=0.5 and m_power <= 1.0 278 | else: 279 | m_power = 1.0 280 | ox = tip[0]*(4/SCALE + 2*dispersion[0]) + side[0]*dispersion[1] # 4 is move a bit downwards, +-2 for randomness 281 | oy = -tip[1]*(4/SCALE + 2*dispersion[0]) - side[1]*dispersion[1] 282 | impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy) 283 | p = self._create_particle(3.5, impulse_pos[0], impulse_pos[1], m_power) # particles are just a decoration, 3.5 is here to make particle speed adequate 284 | p.ApplyLinearImpulse( ( ox*MAIN_ENGINE_POWER*m_power, oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) 285 | self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER*m_power, -oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) 286 | 287 | s_power = 0.0 288 | if (self.continuous and np.abs(action[1]) > 0.5) or (not self.continuous and action in [1,3]): 289 | # Orientation engines 290 | if self.continuous: 291 | direction = np.sign(action[1]) 292 | s_power = np.clip(np.abs(action[1]), 0.5,1.0) 293 | assert s_power>=0.5 and s_power <= 1.0 294 | else: 295 | direction = action-2 296 | s_power = 1.0 297 | ox = tip[0]*dispersion[0] + side[0]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) 298 | oy = -tip[1]*dispersion[0] - side[1]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) 299 | impulse_pos = (self.lander.position[0] + ox - tip[0]*17/SCALE, self.lander.position[1] + oy + tip[1]*SIDE_ENGINE_HEIGHT/SCALE) 300 | p = 
self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power) 301 | p.ApplyLinearImpulse( ( ox*SIDE_ENGINE_POWER*s_power, oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) 302 | self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER*s_power, -oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) 303 | 304 | self.world.Step(1.0/FPS, 6*30, 2*30) 305 | 306 | pos = self.lander.position 307 | vel = self.lander.linearVelocity 308 | helipad_x = (self.helipad_x1 + self.helipad_x2) / 2 309 | state = [ 310 | (pos.x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2), 311 | (pos.y - (self.helipad_y+LEG_DOWN/SCALE)) / (VIEWPORT_H/SCALE/2), 312 | vel.x*(VIEWPORT_W/SCALE/2)/FPS, 313 | vel.y*(VIEWPORT_H/SCALE/2)/FPS, 314 | self.lander.angle, 315 | 20.0*self.lander.angularVelocity/FPS, 316 | 1.0 if self.legs[0].ground_contact else 0.0, 317 | 1.0 if self.legs[1].ground_contact else 0.0, 318 | (helipad_x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2) 319 | ] 320 | if self.terrain_in_ob: 321 | for x, y in zip(self.terrain_x, self.terrain_y): 322 | state.append((x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2)) 323 | state.append((y - (self.helipad_y+LEG_DOWN/SCALE)) / (VIEWPORT_H/SCALE/2)) 324 | assert len(state)==N_OBS_DIM + 22 * self.terrain_in_ob 325 | self.curr_step += 1 326 | 327 | reward = 0 328 | dx = (pos.x - helipad_x) / (VIEWPORT_W/SCALE/2) 329 | if self.goal_seeking: 330 | shaping = \ 331 | - 100*np.sqrt(dx*dx + state[1]*state[1]) \ 332 | - 100*np.sqrt(state[2]*state[2] + state[3]*state[3]) \ 333 | - 100*abs(state[4]) + 10*state[6] + 10*state[7] # And ten points for legs contact, the idea is if you 334 | # lose contact again after landing, you get negative reward 335 | 336 | # Remove distance to goal from reward shaping. CHANGED 337 | else: 338 | shaping = \ 339 | - 100*np.sqrt(state[2]*state[2] + state[3]*state[3]) \ 340 | - 100*abs(state[4]) + 10*state[6] + 10*state[7] # And ten points for legs contact, the idea is if you 341 | # lose contact again after landing, you get negative reward 342 | if self.prev_shaping is not None: 343 | reward = shaping - self.prev_shaping 344 | self.prev_shaping = shaping 345 | 346 | reward -= m_power*FUEL_COST_MAIN 347 | reward -= s_power*FUEL_COST_SIDE 348 | 349 | oob = abs(state[0]) >= 1.0 350 | timeout = self.curr_step >= MAX_NUM_STEPS 351 | not_awake = not self.lander.awake 352 | 353 | at_site = pos.x >= self.helipad_x1 and pos.x <= self.helipad_x2 354 | if self.goal_seeking: 355 | self.at_site[:-1] = self.at_site[1:] 356 | self.at_site[-1] = pos.x >= self.helipad_x1 and pos.x <= self.helipad_x2 and state[1] <= GROUND_SLACK 357 | self.grounded[:-1] = self.grounded[1:] 358 | self.grounded[-1] = self.legs[0].ground_contact and self.legs[1].ground_contact 359 | landed = all(self.at_site) and any(self.grounded) 360 | else: 361 | landed = False 362 | 363 | 364 | done = self.game_over or oob or not_awake or timeout or landed 365 | success = False 366 | crash = False 367 | if done: 368 | if self.game_over or oob: 369 | reward -= 100 370 | crash = True 371 | self.lander.color1 = (1.0,0.0,0.0) 372 | elif ((not timeout and at_site) or landed) and self.goal_seeking: 373 | success = True 374 | reward += 100 375 | self.lander.color1 = (0.0,1.0,0.0) 376 | if self.human_test: 377 | self.render() 378 | import time 379 | time.sleep(1.0) 380 | 381 | info = {'success': success, 'crash': crash} 382 | return np.array(state, dtype=np.float32), reward, done, info 383 | 384 | def render(self, mode='human'): 385 | from gym.envs.classic_control import rendering 386 | if self.viewer is None: 387 
| self.viewer = rendering.Viewer(VIEWPORT_W, VIEWPORT_H) 388 | self.viewer.set_bounds(0, VIEWPORT_W/SCALE, 0, VIEWPORT_H/SCALE) 389 | 390 | for obj in self.particles: 391 | obj.ttl -= 0.15 392 | obj.color1 = (max(0.2,0.2+obj.ttl), max(0.2,0.5*obj.ttl), max(0.2,0.5*obj.ttl)) 393 | obj.color2 = (max(0.2,0.2+obj.ttl), max(0.2,0.5*obj.ttl), max(0.2,0.5*obj.ttl)) 394 | 395 | self._clean_particles(False) 396 | 397 | for p in self.sky_polys: 398 | self.viewer.draw_polygon(p, color=(0,0,0)) 399 | 400 | for obj in self.particles + self.drawlist: 401 | for f in obj.fixtures: 402 | trans = f.body.transform 403 | if type(f.shape) is circleShape: 404 | t = rendering.Transform(translation=trans*f.shape.pos) 405 | self.viewer.draw_circle(f.shape.radius, 20, color=obj.color1).add_attr(t) 406 | self.viewer.draw_circle(f.shape.radius, 20, color=obj.color2, filled=False, linewidth=2).add_attr(t) 407 | else: 408 | path = [trans*v for v in f.shape.vertices] 409 | self.viewer.draw_polygon(path, color=obj.color1) 410 | path.append(path[0]) 411 | self.viewer.draw_polyline(path, color=obj.color2, linewidth=2) 412 | 413 | for x in [self.helipad_x1, self.helipad_x2]: 414 | flagy1 = self.helipad_y 415 | flagy2 = flagy1 + 50/SCALE 416 | self.viewer.draw_polyline( [(x, flagy1), (x, flagy2)], color=(1,1,1) ) 417 | self.viewer.draw_polygon( [(x, flagy2), (x, flagy2-10/SCALE), (x+25/SCALE, flagy2-5/SCALE)], color=(0.8,0.8,0) ) 418 | 419 | return self.viewer.render(return_rgb_array = mode=='rgb_array') 420 | 421 | def close(self): 422 | if self.viewer is not None: 423 | self.viewer.close() 424 | self.viewer = None 425 | 426 | def state_dict(self): 427 | return {'rng': self.np_random.get_state()} 428 | 429 | def load_state_dict(self, state_dict): 430 | self.np_random.set_state(state_dict['rng']) 431 | 432 | 433 | class LunarLanderContinuous(LunarLander): 434 | continuous = True 435 | 436 | class LunarLanderNoGoal(LunarLander): 437 | goal_seeking = False 438 | 439 | class LunarLanderContinuousNoGoal(LunarLander): 440 | continuous = True 441 | goal_seeking = False 442 | 443 | def heuristic(env, s): 444 | # Heuristic for: 445 | # 1. Testing. 446 | # 2. Demonstration rollout. 
447 | angle_targ = s[0]*0.5 + s[2]*1.0 # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed) 448 | if angle_targ > 0.4: angle_targ = 0.4 # more than 0.4 radians (22 degrees) is bad 449 | if angle_targ < -0.4: angle_targ = -0.4 450 | hover_targ = 0.55*np.abs(s[0]) # target y should be proporional to horizontal offset 451 | 452 | # PID controller: s[4] angle, s[5] angularSpeed 453 | angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0 454 | #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo)) 455 | 456 | # PID controller: s[1] vertical coordinate s[3] vertical speed 457 | hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5 458 | #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo)) 459 | 460 | if s[6] or s[7]: # legs have contact 461 | angle_todo = 0 462 | hover_todo = -(s[3])*0.5 # override to reduce fall speed, that's all we need after contact 463 | 464 | if env.continuous: 465 | a = np.array( [hover_todo*20 - 1, -angle_todo*20] ) 466 | a = np.clip(a, -1, +1) 467 | else: 468 | a = 0 469 | if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2 470 | elif angle_todo < -0.05: a = 3 471 | elif angle_todo > +0.05: a = 1 472 | return a 473 | 474 | def demo_heuristic_lander(env, seed=None, render=False): 475 | env.seed(seed) 476 | total_reward = 0 477 | steps = 0 478 | s = env.reset() 479 | while True: 480 | a = heuristic(env, s) 481 | s, r, done, info = env.step(a) 482 | total_reward += r 483 | 484 | if render: 485 | still_open = env.render() 486 | if still_open == False: break 487 | 488 | if steps % 20 == 0 or done: 489 | print("observations:", " ".join(["{:+0.2f}".format(x) for x in s])) 490 | print("step {} total_reward {:+0.2f}".format(steps, total_reward)) 491 | steps += 1 492 | if done: break 493 | return total_reward 494 | 495 | 496 | if __name__ == '__main__': 497 | demo_heuristic_lander(LunarLander(), render=True) 498 | --------------------------------------------------------------------------------
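# ---------------------------------------------------------------------------
# Editorial sketch, not a file from this repository: a toy, self-contained
# illustration of the softplus-parameterized Lagrange multiplier update used
# in ppo/constrained_residual_ppo.py. The loop data is synthetic and opt_l /
# avg_episode_reward are local stand-ins; only the update rule mirrors the
# repository code.
import torch
import torch.nn.functional as F

log_lambda_ = torch.nn.Parameter(torch.tensor([0.0]))
opt_l = torch.optim.Adam([log_lambda_], lr=0.1)
reward_threshold = -0.05

for avg_episode_reward in [-1.0, -0.5, -0.1, 0.0, 0.2]:
    lambda_ = F.softplus(log_lambda_)  # softplus keeps the multiplier positive
    # Same sign convention as the repo's loss['lambda']: gradient descent on
    # this dual term increases lambda_ while reward < reward_threshold and
    # decreases it once the constraint is satisfied.
    dual_loss = lambda_ * (avg_episode_reward - reward_threshold)
    opt_l.zero_grad()
    dual_loss.backward()
    opt_l.step()
    print(f"reward={avg_episode_reward:+.2f}  "
          f"lambda={float(F.softplus(log_lambda_)):.3f}")
# ---------------------------------------------------------------------------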