├── envs ├── __init__.py ├── func_optim │ ├── base.py │ ├── dejong.py │ └── ackley.py └── base.py ├── src └── gippo │ ├── __init__.py │ ├── runner.py │ ├── dataset.py │ ├── rl_algorithm │ ├── rp.py │ ├── lr.py │ ├── lrp.py │ ├── ppo.py │ ├── gippo.py │ └── base.py │ ├── vecenv.py │ ├── network.py │ ├── utils.py │ └── experience.py ├── .gitignore ├── setup.py ├── README.md ├── config └── func_optim │ ├── ackley │ ├── lr.yaml │ ├── rp.yaml │ ├── lrp.yaml │ ├── ppo.yaml │ └── gippo.yaml │ ├── dejong │ ├── lr.yaml │ ├── rp.yaml │ ├── lrp.yaml │ ├── ppo.yaml │ └── gippo.yaml │ ├── ackley64 │ ├── lr.yaml │ ├── lrp.yaml │ ├── rp.yaml │ ├── ppo.yaml │ └── gippo.yaml │ └── dejong64 │ ├── lr.yaml │ ├── lrp.yaml │ ├── rp.yaml │ ├── ppo.yaml │ └── gippo.yaml ├── run_func_optim.sh └── train.py /envs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/gippo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | .vscode/ 3 | __pycache__/ 4 | logdir/ -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="gippo", 5 | version="0.0", 6 | description="Implementation of gradient informed proximal policy optimization (GI-PPO) algorithm", 7 | author="Sanghyun Son", 8 | author_email="shh1295@umd.edu", 9 | # the gippo package lives under src/, so point setuptools at it; 10 | packages=find_packages("src"), 11 | package_dir={"": "src"}, 12 | ) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This is the code repository for the paper ["Gradient Informed Proximal Policy Optimization"](https://arxiv.org/abs/2312.08710), which was presented at the NeurIPS 2023 conference. The code builds on [rl_games](https://github.com/Denys88/rl_games) and [SHAC](https://github.com/NVlabs/DiffRL). 4 | 5 | # Installation 6 | 7 | The following packages are required. 8 | 9 | * pytorch 1.13.1 (https://pytorch.org/get-started/previous-versions/) 10 | * pyyaml 6.0.1 (pip install pyyaml) 11 | * tensorboard (pip install tensorboard) 12 | * tensorboardx 2.6.2 (pip install tensorboardx) 13 | * urdfpy (pip install urdfpy) 14 | * usd-core 23.8 (pip install usd-core) 15 | * ray 2.6.2 (pip install ray) 16 | * ninja 1.10.2 (conda install -c conda-forge ninja) 17 | * cudatoolkit (conda install -c anaconda cudatoolkit) 18 | * cudatoolkit-dev (conda install -c conda-forge cudatoolkit-dev) 19 | * optuna 3.2.0 (pip install optuna) 20 | * optuna-dashboard 0.11.0 (pip install optuna-dashboard) 21 | * matplotlib (pip install matplotlib) 22 | * highway-env 1.8.2 (pip install highway-env) 23 | * seaborn (pip install seaborn) 24 | * gym (pip install gym) 25 | 26 | Then, run the following command to install this package. 27 | 28 | ```bash 29 | pip install -e . 30 | ``` 31 | 32 | # Usage 33 | 34 | The function optimization experiments can be launched one configuration at a time or as a full sweep.
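To launch a single configuration, call `train.py` directly. A minimal example (any YAML under `./config/func_optim/` works; the flags are the ones defined in `train.py`):

```bash
python ./train.py --cfg ./config/func_optim/dejong/gippo.yaml --logdir ./logdir/func_optim/dejong/gippo/ --seed 1 --device cpu
```

To run the full sweep over all benchmark functions, algorithms, and seeds, run the command below.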
35 | 36 | ```bash 37 | bash ./run_func_optim.sh 38 | ``` 39 | -------------------------------------------------------------------------------- /config/func_optim/ackley/lr.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: lr 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-4 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/ackley/rp.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: rp 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-3 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/dejong/lr.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: lr 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | 
critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-3 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/dejong/rp.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: rp 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-2 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/ackley/lrp.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: lrp 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | 
defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-4 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/ackley64/lr.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: lr 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 3e-4 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/ackley64/lrp.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: lrp 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 3e-4 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/ackley64/rp.yaml: 
-------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: rp 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-3 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/dejong/lrp.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: lrp 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-2 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/dejong64/lr.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: lr 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | 
normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-3 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/dejong64/lrp.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: lrp 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-3 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/dejong64/rp.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: rp 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-2 63 | 64 | # 
critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/ackley/ppo.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: ppo 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | 50 | critic_coef: 4 51 | clip_value: True 52 | 53 | defer_summaries_sec: 0.001 54 | summaries_interval_sec_min: 0.001 55 | summaries_interval_sec_max: 0.002 56 | 57 | # actor 58 | actor_learning_rate: 1e-4 59 | 60 | # critic 61 | critic_learning_rate: 1e-3 62 | critic_iterations: 16 63 | critic_num_batch: 4 64 | target_critic_alpha: 0.2 65 | 66 | # learning rate scheduler 67 | lr_schedule: linear # [constant, linear] 68 | 69 | # adam 70 | betas: [0.7, 0.95] 71 | 72 | # ppo 73 | ppo: 74 | e_clip: 0.2 75 | minibatch_size: 64 76 | mini_epochs: 5 -------------------------------------------------------------------------------- /config/func_optim/ackley64/ppo.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: ppo 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | 50 | critic_coef: 4 51 | clip_value: True 52 | 53 | defer_summaries_sec: 0.001 54 | summaries_interval_sec_min: 0.001 55 | summaries_interval_sec_max: 0.002 56 | 57 | # actor 58 | actor_learning_rate: 1e-2 59 | 60 | # critic 61 | critic_learning_rate: 1e-3 62 | critic_iterations: 16 63 | critic_num_batch: 4 64 | target_critic_alpha: 0.2 65 | 66 | # learning rate scheduler 67 | lr_schedule: linear # [constant, linear] 68 | 69 | # adam 70 | betas: [0.7, 0.95] 71 | 72 | # ppo 73 | ppo: 74 | e_clip: 0.2 75 | minibatch_size: 64 76 | mini_epochs: 5 -------------------------------------------------------------------------------- /config/func_optim/dejong/ppo.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 
| name: DejongEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: ppo 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | 50 | critic_coef: 4 51 | clip_value: True 52 | 53 | defer_summaries_sec: 0.001 54 | summaries_interval_sec_min: 0.001 55 | summaries_interval_sec_max: 0.002 56 | 57 | # actor 58 | actor_learning_rate: 1e-4 59 | 60 | # critic 61 | critic_learning_rate: 1e-3 62 | critic_iterations: 16 63 | critic_num_batch: 4 64 | target_critic_alpha: 0.2 65 | 66 | # learning rate scheduler 67 | lr_schedule: linear # [constant, linear] 68 | 69 | # adam 70 | betas: [0.7, 0.95] 71 | 72 | # ppo 73 | ppo: 74 | e_clip: 0.2 75 | minibatch_size: 64 76 | mini_epochs: 5 -------------------------------------------------------------------------------- /config/func_optim/dejong64/ppo.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: ppo 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | 50 | critic_coef: 4 51 | clip_value: True 52 | 53 | defer_summaries_sec: 0.001 54 | summaries_interval_sec_min: 0.001 55 | summaries_interval_sec_max: 0.002 56 | 57 | # actor 58 | actor_learning_rate: 1e-2 59 | 60 | # critic 61 | critic_learning_rate: 1e-3 62 | critic_iterations: 16 63 | critic_num_batch: 4 64 | target_critic_alpha: 0.2 65 | 66 | # learning rate scheduler 67 | lr_schedule: linear # [constant, linear] 68 | 69 | # adam 70 | betas: [0.7, 0.95] 71 | 72 | # ppo 73 | ppo: 74 | e_clip: 0.2 75 | minibatch_size: 64 76 | mini_epochs: 5 -------------------------------------------------------------------------------- /config/func_optim/ackley/gippo.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: gippo 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | 
normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | 50 | critic_coef: 4 51 | clip_value: True 52 | 53 | # actor 54 | actor_learning_rate: 1e-4 # ppo 55 | actor_learning_rate_no_ppo: 1e-3 # analytical grads 56 | 57 | # critic 58 | critic_learning_rate: 1e-3 59 | critic_iterations: 16 60 | critic_num_batch: 4 61 | target_critic_alpha: 0.2 62 | 63 | # learning rate scheduler 64 | lr_schedule: linear # [constant, linear] 65 | 66 | # adam 67 | betas: [0.7, 0.95] 68 | 69 | # ppo 70 | ppo: 71 | e_clip: 0.2 72 | minibatch_size: 64 73 | mini_epochs: 5 74 | 75 | # gippo 76 | gi: 77 | alpha: 1e-5 78 | alpha_interval: 0.40 79 | alpha_update_factor: 1.1 80 | max_alpha: 1e-0 81 | num_iter: 16 82 | max_oorr: 0.5 -------------------------------------------------------------------------------- /config/func_optim/dejong/gippo.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: gippo 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | 50 | critic_coef: 4 51 | clip_value: True 52 | 53 | # actor 54 | actor_learning_rate: 1e-4 # ppo 55 | actor_learning_rate_no_ppo: 1e-3 # analytical grads 56 | 57 | # critic 58 | critic_learning_rate: 1e-3 59 | critic_iterations: 16 60 | critic_num_batch: 4 61 | target_critic_alpha: 0.2 62 | 63 | # learning rate scheduler 64 | lr_schedule: linear # [constant, linear] 65 | 66 | # adam 67 | betas: [0.7, 0.95] 68 | 69 | # ppo 70 | ppo: 71 | e_clip: 0.2 72 | minibatch_size: 64 73 | mini_epochs: 5 74 | 75 | # gippo 76 | gi: 77 | alpha: 1e-5 78 | alpha_interval: 0.40 79 | alpha_update_factor: 1.1 80 | max_alpha: 1e-0 81 | num_iter: 16 82 | max_oorr: 0.5 -------------------------------------------------------------------------------- /config/func_optim/ackley64/gippo.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: gippo 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | 50 | critic_coef: 4 51 | clip_value: True 
52 | 53 | # actor 54 | actor_learning_rate: 1e-2 # ppo 55 | actor_learning_rate_no_ppo: 1e-3 # analytical grads 56 | 57 | # critic 58 | critic_learning_rate: 1e-3 59 | critic_iterations: 16 60 | critic_num_batch: 4 61 | target_critic_alpha: 0.2 62 | 63 | # learning rate scheduler 64 | lr_schedule: linear # [constant, linear] 65 | 66 | # adam 67 | betas: [0.7, 0.95] 68 | 69 | # ppo 70 | ppo: 71 | e_clip: 0.2 72 | minibatch_size: 64 73 | mini_epochs: 5 74 | 75 | # gippo 76 | gi: 77 | alpha: 1e-5 78 | alpha_interval: 0.40 79 | alpha_update_factor: 1.1 80 | max_alpha: 1e-0 81 | num_iter: 16 82 | max_oorr: 0.5 -------------------------------------------------------------------------------- /config/func_optim/dejong64/gippo.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: gippo 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | 50 | critic_coef: 4 51 | clip_value: True 52 | 53 | # actor 54 | actor_learning_rate: 1e-2 # ppo 55 | actor_learning_rate_no_ppo: 1e-3 # analytical grads 56 | 57 | # critic 58 | critic_learning_rate: 1e-3 59 | critic_iterations: 16 60 | critic_num_batch: 4 61 | target_critic_alpha: 0.2 62 | 63 | # learning rate scheduler 64 | lr_schedule: linear # [constant, linear] 65 | 66 | # adam 67 | betas: [0.7, 0.95] 68 | 69 | # ppo 70 | ppo: 71 | e_clip: 0.2 72 | minibatch_size: 64 73 | mini_epochs: 5 74 | 75 | # gippo 76 | gi: 77 | alpha: 1e-5 78 | alpha_interval: 0.40 79 | alpha_update_factor: 1.1 80 | max_alpha: 1e-0 81 | num_iter: 16 82 | max_oorr: 0.5 -------------------------------------------------------------------------------- /envs/func_optim/base.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | 3 | from envs.base import BaseEnv 4 | 5 | class FuncOptimEnv(BaseEnv): 6 | 7 | def __init__(self, 8 | num_envs, 9 | dim=1, 10 | seed=0, 11 | no_grad=True, 12 | render=False, 13 | device='cuda:0'): 14 | 15 | super(FuncOptimEnv, self).__init__( 16 | num_envs=num_envs, 17 | num_obs=1, 18 | num_act=dim, 19 | episode_length=1, 20 | seed=seed, 21 | no_grad=no_grad, 22 | render=render, 23 | device=device 24 | ) 25 | 26 | self.dim = dim 27 | self.render_resolution = 1e3 28 | 29 | def preprocess_actions(self, actions: th.Tensor): 30 | actions = actions.view((self.num_envs, self.num_actions)) 31 | actions = th.clip(actions, -1., 1.) 
32 | return actions 33 | 34 | def step(self, actions: th.Tensor): 35 | actions = self.preprocess_actions(actions) 36 | self.actions = actions 37 | 38 | self.reset_buf = th.zeros_like(self.reset_buf) 39 | 40 | self.progress_buf += 1 41 | self.num_frames += 1 42 | 43 | self.calculateObservations() 44 | self.calculateReward() 45 | 46 | if self.no_grad == False: 47 | self.obs_buf_before_reset = self.obs_buf.clone() 48 | self.extras = { 49 | 'obs_before_reset': self.obs_buf_before_reset, 50 | 'episode_end': self.termination_buf 51 | } 52 | 53 | self.reset() 54 | return self.obs_buf, self.rew_buf, self.reset_buf, self.extras 55 | 56 | def reset(self): 57 | 58 | self.calculateObservations() 59 | self.progress_buf[:] = 0 60 | 61 | return self.obs_buf 62 | 63 | def calculateObservations(self): 64 | 65 | self.obs_buf = th.zeros_like(self.obs_buf) 66 | 67 | def calculateReward(self): 68 | 69 | self.rew_buf = self.evaluate(self.actions) 70 | 71 | # reset agents 72 | self.reset_buf = th.where(self.progress_buf > self.episode_length - 1, th.ones_like(self.reset_buf), self.reset_buf) 73 | 74 | def evaluate(self, x: th.Tensor): 75 | 76 | raise NotImplementedError() -------------------------------------------------------------------------------- /src/gippo/runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import random 4 | from copy import deepcopy 5 | 6 | import numpy as np 7 | import torch as th 8 | 9 | from gippo.rl_algorithm.lr import LR 10 | from gippo.rl_algorithm.rp import RP 11 | from gippo.rl_algorithm.lrp import LRP 12 | from gippo.rl_algorithm.ppo import PPO 13 | from gippo.rl_algorithm.gippo import GIPPO 14 | from gippo.vecenv import create_vecenv 15 | 16 | class Runner: 17 | 18 | def __init__(self): 19 | th.backends.cudnn.benchmark = True 20 | 21 | def reset(self): 22 | pass 23 | 24 | def load_config(self, params): 25 | self.seed = params.get('seed', None) 26 | if self.seed is None: 27 | self.seed = int(time.time()) 28 | 29 | print(f"self.seed = {self.seed}") 30 | 31 | self.algo_params = params['algo'] 32 | self.algo_name = self.algo_params['name'] 33 | self.exp_config = None 34 | 35 | if self.seed: 36 | th.manual_seed(self.seed) 37 | th.cuda.manual_seed_all(self.seed) 38 | np.random.seed(self.seed) 39 | random.seed(self.seed) 40 | 41 | # deal with environment specific seed if applicable 42 | if 'config' in params['env']: 43 | params['env']['config']['seed'] = self.seed 44 | 45 | self.params = params 46 | 47 | def load(self, yaml_config): 48 | config = deepcopy(yaml_config) 49 | self.default_config = deepcopy(config['params']) 50 | self.load_config(params=self.default_config) 51 | 52 | def run_train(self, args): 53 | print('Started to train') 54 | 55 | algo_config = self.params['algo'] 56 | env_config = self.params['env'] 57 | device = self.params['device'] 58 | log_path = self.params['log_path'] 59 | 60 | if self.algo_name == 'lr': 61 | agent = LR(algo_config, env_config, device, log_path) 62 | elif self.algo_name == 'rp': 63 | agent = RP(algo_config, env_config, device, log_path) 64 | elif self.algo_name == 'lrp': 65 | agent = LRP(algo_config, env_config, device, log_path) 66 | elif self.algo_name == 'ppo': 67 | agent = PPO(algo_config, env_config, device, log_path) 68 | elif self.algo_name == 'gippo': 69 | agent = GIPPO(algo_config, env_config, device, log_path) 70 | else: 71 | raise NotImplementedError() 72 | # _restore(agent, args) 73 | # _override_sigma(agent, args) 74 | agent.train() 
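Below is a minimal sketch of driving `Runner` programmatically; it simply mirrors what `train.py` (the supported entry point) does. The config and log-directory paths are only examples, and the environment registration must happen before `run_train()`, because the algorithms build their vectorized environment through `gippo.vecenv.create_vecenv()`.

```python
import os
import yaml

from gippo import vecenv
from gippo.runner import Runner
from envs.func_optim.dejong import DejongEnv

# register the vectorized-environment wrapper and a concrete environment,
# exactly as train.py does;
vecenv.register_vecenv_config(
    'BASE',
    lambda env_name, num_actors, **kwargs: vecenv.BaseVecEnv(env_name, num_actors, **kwargs))
vecenv.register_env_config(
    'DejongEnv',
    {'vecenv_type': 'BASE', 'env_creator': lambda **kwargs: DejongEnv(**kwargs)})

# load a training config and fill in the two keys that train.py normally
# injects from the command line;
with open('./config/func_optim/dejong/gippo.yaml', 'r') as f:
    cfg = yaml.load(f, Loader=yaml.SafeLoader)
cfg['params']['log_path'] = './logdir/func_optim/dejong/gippo/manual'
cfg['params']['device'] = 'cpu'
os.makedirs(cfg['params']['log_path'], exist_ok=True)

runner = Runner()
runner.load(cfg)
runner.run_train({})  # the args dict is currently unused inside run_train()
```

Note that `train.py` additionally dumps the merged config to the log directory and injects the command-line seed, so it is the preferred way to launch reproducible runs.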
-------------------------------------------------------------------------------- /src/gippo/dataset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | From 3 | https://github.com/Denys88/rl_games/blob/master/rl_games/common/datasets.py 4 | https://github.com/NVlabs/DiffRL/blob/main/utils/dataset.py 5 | ''' 6 | import numpy as np 7 | from torch.utils.data import Dataset 8 | 9 | class PPODataset(Dataset): 10 | def __init__(self, batch_size, minibatch_size, device): 11 | self.batch_size = batch_size 12 | self.minibatch_size = minibatch_size 13 | self.device = device 14 | self.length = self.batch_size // self.minibatch_size 15 | self.special_names = [] 16 | 17 | def update_values_dict(self, values_dict): 18 | self.values_dict = values_dict 19 | 20 | def update_mu_sigma(self, mu, sigma): 21 | start = self.last_range[0] 22 | end = self.last_range[1] 23 | self.values_dict['mu'][start:end] = mu 24 | self.values_dict['sigma'][start:end] = sigma 25 | 26 | def __len__(self): 27 | return self.length 28 | 29 | def _get_item(self, idx): 30 | start = idx * self.minibatch_size 31 | end = (idx + 1) * self.minibatch_size 32 | self.last_range = (start, end) 33 | input_dict = {} 34 | for k,v in self.values_dict.items(): 35 | if k not in self.special_names and v is not None: 36 | if type(v) is dict: 37 | v_dict = { kd:vd[start:end] for kd, vd in v.items() } 38 | input_dict[k] = v_dict 39 | else: 40 | input_dict[k] = v[start:end] 41 | 42 | return input_dict 43 | 44 | def __getitem__(self, idx): 45 | sample = self._get_item(idx) 46 | return sample 47 | 48 | class CriticDataset: 49 | def __init__(self, batch_size, obs, target_values, shuffle = False, drop_last = False): 50 | self.obs = obs.view(-1, obs.shape[-1]) 51 | self.target_values = target_values.view(-1) 52 | self.batch_size = batch_size 53 | 54 | if shuffle: 55 | self.shuffle() 56 | 57 | if drop_last: 58 | self.length = self.obs.shape[0] // self.batch_size 59 | else: 60 | self.length = ((self.obs.shape[0] - 1) // self.batch_size) + 1 61 | 62 | def shuffle(self): 63 | index = np.random.permutation(self.obs.shape[0]) 64 | self.obs = self.obs[index, :] 65 | self.target_values = self.target_values[index] 66 | 67 | def __len__(self): 68 | return self.length 69 | 70 | def __getitem__(self, index): 71 | start_idx = index * self.batch_size 72 | end_idx = min((index + 1) * self.batch_size, self.obs.shape[0]) 73 | return {'obs': self.obs[start_idx:end_idx, :], 'target_values': self.target_values[start_idx:end_idx]} -------------------------------------------------------------------------------- /run_func_optim.sh: -------------------------------------------------------------------------------- 1 | ITER=5 2 | 3 | # Dejong 4 | for (( i=1; i<=${ITER}; i++ )) 5 | do 6 | python ./train.py --cfg ./config/func_optim/dejong/lr.yaml --logdir ./logdir/func_optim/dejong/lr/ --seed ${i} --device cpu 7 | python ./train.py --cfg ./config/func_optim/dejong/rp.yaml --logdir ./logdir/func_optim/dejong/rp/ --seed ${i} --device cpu 8 | python ./train.py --cfg ./config/func_optim/dejong/lrp.yaml --logdir ./logdir/func_optim/dejong/lrp/ --seed ${i} --device cpu 9 | python ./train.py --cfg ./config/func_optim/dejong/ppo.yaml --logdir ./logdir/func_optim/dejong/ppo/ --seed ${i} --device cpu 10 | python ./train.py --cfg ./config/func_optim/dejong/gippo.yaml --logdir ./logdir/func_optim/dejong/gippo/ --seed ${i} --device cpu 11 | done 12 | 13 | # Dejong 64 14 | for (( i=1; i<=${ITER}; i++ )) 15 | do 16 | python ./train.py --cfg 
./config/func_optim/dejong64/lr.yaml --logdir ./logdir/func_optim/dejong64/lr/ --seed ${i} --device cpu 17 | python ./train.py --cfg ./config/func_optim/dejong64/rp.yaml --logdir ./logdir/func_optim/dejong64/rp/ --seed ${i} --device cpu 18 | python ./train.py --cfg ./config/func_optim/dejong64/lrp.yaml --logdir ./logdir/func_optim/dejong64/lrp/ --seed ${i} --device cpu 19 | python ./train.py --cfg ./config/func_optim/dejong64/ppo.yaml --logdir ./logdir/func_optim/dejong64/ppo/ --seed ${i} --device cpu 20 | python ./train.py --cfg ./config/func_optim/dejong64/gippo.yaml --logdir ./logdir/func_optim/dejong64/gippo/ --seed ${i} --device cpu 21 | done 22 | 23 | # Ackley 24 | for (( i=1; i<=${ITER}; i++ )) 25 | do 26 | python ./train.py --cfg ./config/func_optim/ackley/lr.yaml --logdir ./logdir/func_optim/ackley/lr/ --seed ${i} --device cpu 27 | python ./train.py --cfg ./config/func_optim/ackley/rp.yaml --logdir ./logdir/func_optim/ackley/rp/ --seed ${i} --device cpu 28 | python ./train.py --cfg ./config/func_optim/ackley/lrp.yaml --logdir ./logdir/func_optim/ackley/lrp/ --seed ${i} --device cpu 29 | python ./train.py --cfg ./config/func_optim/ackley/ppo.yaml --logdir ./logdir/func_optim/ackley/ppo/ --seed ${i} --device cpu 30 | python ./train.py --cfg ./config/func_optim/ackley/gippo.yaml --logdir ./logdir/func_optim/ackley/gippo/ --seed ${i} --device cpu 31 | done 32 | 33 | # Ackley 64 34 | for (( i=1; i<=${ITER}; i++ )) 35 | do 36 | python ./train.py --cfg ./config/func_optim/ackley64/lr.yaml --logdir ./logdir/func_optim/ackley64/lr/ --seed ${i} --device cpu 37 | python ./train.py --cfg ./config/func_optim/ackley64/rp.yaml --logdir ./logdir/func_optim/ackley64/rp/ --seed ${i} --device cpu 38 | python ./train.py --cfg ./config/func_optim/ackley64/lrp.yaml --logdir ./logdir/func_optim/ackley64/lrp/ --seed ${i} --device cpu 39 | python ./train.py --cfg ./config/func_optim/ackley64/ppo.yaml --logdir ./logdir/func_optim/ackley64/ppo/ --seed ${i} --device cpu 40 | python ./train.py --cfg ./config/func_optim/ackley64/gippo.yaml --logdir ./logdir/func_optim/ackley64/gippo/ --seed ${i} --device cpu 41 | done -------------------------------------------------------------------------------- /envs/base.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Modified from 3 | https://github.com/NVlabs/DiffRL/blob/main/envs/dflex_env.py 4 | ''' 5 | import numpy as np 6 | import torch as th 7 | 8 | from gym import spaces 9 | 10 | class BaseEnv: 11 | 12 | def __init__(self, 13 | num_envs, 14 | num_obs, 15 | num_act, 16 | episode_length, 17 | seed=0, 18 | no_grad=True, 19 | render=False, 20 | device='cuda:0'): 21 | 22 | self.seed = seed 23 | 24 | self.no_grad = no_grad 25 | 26 | self.episode_length = episode_length 27 | 28 | self.device = device 29 | 30 | self.render = render 31 | 32 | self.sim_time = 0.0 33 | 34 | self.num_frames = 0 # record the number of frames for rendering 35 | 36 | self.num_environments = num_envs 37 | self.num_agents = 1 38 | 39 | # initialize observation and action space 40 | self.num_observations = num_obs 41 | self.num_actions = num_act 42 | 43 | self.obs_space = spaces.Box(np.ones(self.num_observations, dtype=np.float32) * -np.Inf, 44 | np.ones(self.num_observations, dtype=np.float32) * np.Inf) 45 | self.act_space = spaces.Box(np.ones(self.num_actions, dtype=np.float32) * np.float32(-1.), 46 | np.ones(self.num_actions, dtype=np.float32) * np.float32(1.)) 47 | 48 | # allocate buffers 49 | self.obs_buf = th.zeros( 50 | (self.num_envs, 
self.num_observations), device=self.device, dtype=th.float32, requires_grad=False) 51 | self.rew_buf = th.zeros( 52 | self.num_envs, device=self.device, dtype=th.float32, requires_grad=False) 53 | self.reset_buf = th.ones( 54 | self.num_envs, device=self.device, dtype=th.int64, requires_grad=False) 55 | 56 | # end of the episode 57 | self.termination_buf = th.zeros( 58 | self.num_envs, device=self.device, dtype=th.int64, requires_grad=False) 59 | self.progress_buf = th.zeros( 60 | self.num_envs, device=self.device, dtype=th.int64, requires_grad=False) 61 | self.actions = th.zeros( 62 | (self.num_envs, self.num_actions), device = self.device, dtype = th.float32, requires_grad = False) 63 | 64 | self.extras = {} 65 | 66 | def get_number_of_agents(self): 67 | return self.num_agents 68 | 69 | @property 70 | def observation_space(self): 71 | return self.obs_space 72 | 73 | @property 74 | def action_space(self): 75 | return self.act_space 76 | 77 | @property 78 | def num_envs(self): 79 | return self.num_environments 80 | 81 | @property 82 | def num_acts(self): 83 | return self.num_actions 84 | 85 | @property 86 | def num_obs(self): 87 | return self.num_observations 88 | 89 | def get_state(self): 90 | raise NotImplementedError() 91 | 92 | def reset_with_state(self, env_ids=None, force_reset=True): 93 | raise NotImplementedError() -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import os 5 | import yaml 6 | import time 7 | 8 | from gippo import vecenv 9 | from gippo.runner import Runner 10 | 11 | from envs.func_optim.dejong import DejongEnv 12 | from envs.func_optim.ackley import AckleyEnv 13 | 14 | vecenv.register_vecenv_config( 15 | 'BASE', 16 | lambda env_name, 17 | num_actors, 18 | **kwargs: vecenv.BaseVecEnv(env_name, num_actors, **kwargs)) 19 | 20 | vecenv.register_env_config( 21 | 'DejongEnv', 22 | { 23 | 'vecenv_type': 'BASE', 24 | 'env_creator': lambda **kwargs: DejongEnv(**kwargs), 25 | } 26 | ) 27 | vecenv.register_env_config( 28 | 'AckleyEnv', 29 | { 30 | 'vecenv_type': 'BASE', 31 | 'env_creator': lambda **kwargs: AckleyEnv(**kwargs) 32 | } 33 | ) 34 | 35 | def parse_arguments(description="Testing Args", custom_parameters=[]): 36 | parser = argparse.ArgumentParser() 37 | 38 | for argument in custom_parameters: 39 | if ("name" in argument) and ("type" in argument or "action" in argument): 40 | help_str = "" 41 | if "help" in argument: 42 | help_str = argument["help"] 43 | 44 | if "type" in argument: 45 | if "default" in argument: 46 | parser.add_argument(argument["name"], type=argument["type"], default=argument["default"], help=help_str) 47 | else: 48 | print("ERROR: default must be specified if using type") 49 | elif "action" in argument: 50 | parser.add_argument(argument["name"], action=argument["action"], help=help_str) 51 | else: 52 | print() 53 | print("ERROR: command line argument name, type/action must be defined, argument not added to parser") 54 | print("supported keys: name, type, default, action, help") 55 | print() 56 | 57 | args = parser.parse_args() 58 | return args 59 | 60 | def get_args(): 61 | custom_parameters = [ 62 | {"name": "--cfg", "type": str, "default": "./config/func_optim/dejong/lr.yaml", 63 | "help": "Configuration file for training"}, 64 | {"name": "--device", "type": str, "default": "cuda:0", 65 | "help": "Choose CPU or GPU device for inferencing policy network"}, 66 | {"name": "--render", 
"action": "store_true", "default": False, 67 | "help": "whether generate rendering file."}, 68 | {"name": "--logdir", "type": str, "default": "logdir/"}, 69 | {"name": "--seed", "type": int, "default": 1},] 70 | 71 | # parse arguments 72 | args = parse_arguments( 73 | description="Training args", 74 | custom_parameters=custom_parameters) 75 | 76 | return args 77 | 78 | if __name__ == '__main__': 79 | 80 | args = get_args() 81 | vargs = vars(args) 82 | 83 | with open(args.cfg, 'r') as f: 84 | cfg_train = yaml.load(f, Loader=yaml.SafeLoader) 85 | 86 | # save command line args to config; 87 | cfg_train["params"]["command_line_args"] = {} 88 | for key in vargs.keys(): 89 | cfg_train["params"]["command_line_args"][key] = vargs[key] 90 | 91 | # save config; 92 | log_dir = cfg_train["params"]["command_line_args"]["logdir"] 93 | log_dir = log_dir + time.strftime("%Y-%m-%d-%H-%M-%S") 94 | os.makedirs(log_dir, exist_ok = True) 95 | yaml.dump(cfg_train, open(os.path.join(log_dir, 'cfg.yaml'), 'w')) 96 | cfg_train["params"]["log_path"] = log_dir 97 | cfg_train["params"]["device"] = vargs["device"] 98 | cfg_train["params"]["seed"] = vargs["seed"] 99 | 100 | runner = Runner() 101 | runner.load(cfg_train) 102 | runner.run_train(vargs) -------------------------------------------------------------------------------- /src/gippo/rl_algorithm/rp.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import torch.utils as tu 3 | from typing import List 4 | 5 | from gippo.rl_algorithm.base import RLAlgorithm 6 | from gippo.utils import swap_and_flatten01 7 | 8 | class RP(RLAlgorithm): 9 | 10 | def __init__(self, config, env_config, device="cpu", log_path=None): 11 | 12 | super(RP, self).__init__(config, env_config, device, log_path) 13 | 14 | self.actor_lr = float(config["actor_learning_rate"]) 15 | self.actor_optimizer = th.optim.Adam( 16 | self.actor.parameters(), 17 | betas = config['betas'], 18 | lr = self.actor_lr 19 | ) 20 | 21 | def train_actor_critic_no_ppo(self): 22 | 23 | ''' 24 | Set learning rate. 25 | ''' 26 | # set learning rate; 27 | actor_lr = self.actor_lr 28 | critic_lr = self.critic_lr 29 | if self.lr_schedule == 'linear': 30 | actor_lr = (1e-5 - self.actor_lr) * float(self.epoch_num / self.max_epochs) + self.actor_lr 31 | critic_lr = (1e-5 - self.critic_lr) * float(self.epoch_num / self.max_epochs) + self.critic_lr 32 | 33 | for param_group in self.actor_optimizer.param_groups: 34 | param_group['lr'] = actor_lr 35 | for param_group in self.critic_optimizer.param_groups: 36 | param_group['lr'] = critic_lr 37 | 38 | self.writer.add_scalar("info/actor_lr", actor_lr, self.epoch_num) 39 | self.writer.add_scalar("info/critic_lr", critic_lr, self.epoch_num) 40 | 41 | return super().train_actor_critic_no_ppo() 42 | 43 | def use_analytic_grads(self): 44 | 45 | return True 46 | 47 | def use_ppo(self): 48 | 49 | return False 50 | 51 | def get_optimizers_state(self): 52 | state = super().get_optimizers_state() 53 | state['actor'] = self.actor_optimizer.state_dict() 54 | 55 | return state 56 | 57 | def train_actor_no_ppo(self, 58 | grad_start: th.Tensor, 59 | grad_obses: List[th.Tensor], 60 | grad_rp_eps: List[th.Tensor], 61 | grad_actions: List[th.Tensor], 62 | grad_values: List[th.Tensor], 63 | grad_next_values: List[th.Tensor], 64 | grad_rewards: List[th.Tensor], 65 | grad_fdones: List[th.Tensor], 66 | last_fdones: th.Tensor): 67 | ''' 68 | Train actor using Reparameterization-Trick (RP) techinque. 
69 | 70 | Follow variance reduction scheme of SHAC (https://arxiv.org/abs/2204.07137), 71 | such as truncated time horizon. 72 | ''' 73 | 74 | self.actor.train() 75 | 76 | # compute advantages; 77 | curr_grad_advs = self.grad_advantages(self.tau, 78 | grad_values, 79 | grad_next_values, 80 | grad_rewards, 81 | grad_fdones, 82 | last_fdones) 83 | 84 | # add value of the states; 85 | for i in range(len(grad_values)): 86 | curr_grad_advs[i] = curr_grad_advs[i] + grad_values[i] 87 | 88 | # compute loss; 89 | actor_loss: th.Tensor = -self.grad_advantages_first_terms_sum(curr_grad_advs, grad_start) 90 | 91 | # divide by number of trajectories; 92 | actor_loss = actor_loss / th.count_nonzero(grad_start) 93 | 94 | # update actor; 95 | self.actor_optimizer.zero_grad() 96 | actor_loss.backward() 97 | if self.truncate_grads: 98 | th.nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_norm) 99 | self.actor_optimizer.step() -------------------------------------------------------------------------------- /envs/func_optim/dejong.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as th 3 | import os 4 | 5 | import matplotlib.pyplot as plt 6 | 7 | from envs.func_optim.base import FuncOptimEnv 8 | 9 | class DejongEnv(FuncOptimEnv): 10 | 11 | def __init__(self, 12 | num_envs, 13 | dim=1, 14 | seed=0, 15 | no_grad=True, 16 | render=False, 17 | device='cuda:0'): 18 | 19 | super(DejongEnv, self).__init__( 20 | num_envs=num_envs, 21 | dim=dim, 22 | seed=seed, 23 | no_grad=no_grad, 24 | render=render, 25 | device=device) 26 | 27 | self.bound = 5.12 28 | 29 | def preprocess_actions(self, actions: th.Tensor): 30 | actions = super().preprocess_actions(actions) 31 | actions = actions * self.bound 32 | return actions 33 | 34 | def render(self, mode = 'human', actions = None, p_actions = None): 35 | 36 | if self.visualize: 37 | 38 | assert self.dim == 1, "" 39 | 40 | min_action = -self.bound 41 | max_action = self.bound 42 | step = (max_action - min_action) / self.render_resolution 43 | 44 | x = th.arange(min_action, max_action, step).unsqueeze(-1) 45 | y = self.evaluate(x) 46 | 47 | x = x[:, 0].cpu().numpy() 48 | y = y.cpu().numpy() 49 | 50 | f = plt.figure() 51 | f.set_figwidth(6.4 * 2) 52 | f.set_figheight(4.8 * 2) 53 | 54 | plt.plot(x, y, color='blue') 55 | 56 | with th.no_grad(): 57 | 58 | if actions == None: 59 | x = self.actions[:, 0].cpu().numpy() 60 | y = self.rew_buf.cpu().numpy() 61 | elif actions != None: 62 | x = th.clip(actions, -1, 1) * self.bound 63 | y = self.evaluate(x) 64 | 65 | x = x[:, 0].cpu().numpy() 66 | y = y.cpu().numpy() 67 | else: 68 | raise ValueError() 69 | 70 | plt.plot(x, y, 'x', color='black', markersize=5e-0) 71 | 72 | with th.no_grad(): 73 | 74 | if p_actions != None: 75 | x = th.clip(p_actions, -1, 1) * self.bound 76 | y = self.evaluate(x) 77 | 78 | x = x[:, 0].cpu().numpy() 79 | y = y.cpu().numpy() 80 | 81 | plt.plot(x, y, 'o', color='red', markersize=2e-0) 82 | 83 | plt.title("Dejong Function, Step {}".format(self.num_frames)) 84 | plt.xlabel("x") 85 | plt.ylabel("y") 86 | 87 | dir = './outputs/dejong/' 88 | 89 | if not os.path.exists(dir): 90 | os.makedirs(dir) 91 | 92 | plt.savefig("./outputs/dejong/dejong_{}.png".format(self.num_frames)) 93 | 94 | def reset(self, env_ids=None, force_reset=True): 95 | 96 | self.calculateObservations() 97 | 98 | return self.obs_buf 99 | 100 | ''' 101 | cut off the gradient from the current state to previous states 102 | ''' 103 | def clear_grad(self): 104 | 105 | 
pass 106 | 107 | ''' 108 | This function starts collecting a new trajectory from the current states but cut off the computation graph to the previous states. 109 | It has to be called every time the algorithm starts an episode and return the observation vectors 110 | ''' 111 | def initialize_trajectory(self): 112 | self.clear_grad() 113 | self.calculateObservations() 114 | return self.obs_buf 115 | 116 | def calculateObservations(self): 117 | 118 | self.obs_buf = th.zeros_like(self.obs_buf) 119 | 120 | def calculateReward(self): 121 | 122 | self.rew_buf = self.evaluate(self.actions) 123 | 124 | # reset agents 125 | self.reset_buf = th.where(self.progress_buf > self.episode_length - 1, th.ones_like(self.reset_buf), self.reset_buf) 126 | 127 | def evaluate(self, x: th.Tensor): 128 | 129 | y = th.sum(x * x, dim=1) 130 | 131 | return -y -------------------------------------------------------------------------------- /src/gippo/vecenv.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Modified from 3 | https://github.com/Denys88/rl_games/blob/master/rl_games/common/vecenv.py 4 | https://github.com/NVlabs/DiffRL/blob/main/examples/train_rl.py#L52 5 | ''' 6 | vecenv_config = {} # vectorized environment, which usually wraps around 7 | # a single environment and provides parallelized interface; 8 | env_config = {} # single environment config; 9 | 10 | def register_vecenv_config(config_name, func): 11 | vecenv_config[config_name] = func 12 | 13 | def register_env_config(env_name, config): 14 | env_config[env_name] = config 15 | 16 | def create_vecenv(env_name, num_actors, **kwargs): 17 | vecenv_name = env_config[env_name]['vecenv_type'] 18 | return vecenv_config[vecenv_name](env_name, num_actors, **kwargs) 19 | 20 | ''' 21 | Vectorized Environment 22 | ''' 23 | 24 | class IVecEnv: 25 | def step(self, actions): 26 | raise NotImplementedError 27 | 28 | def reset(self): 29 | raise NotImplementedError 30 | 31 | def has_action_masks(self): 32 | return False 33 | 34 | def get_number_of_agents(self): 35 | return 1 36 | 37 | def get_env_info(self): 38 | pass 39 | 40 | def seed(self, seed): 41 | pass 42 | 43 | def set_train_info(self, env_frames, *args, **kwargs): 44 | """ 45 | Send the information in the direction algo->environment. 46 | Most common use case: tell the environment how far along we are in the training process. This is useful 47 | for implementing curriculums and things such as that. 48 | """ 49 | pass 50 | 51 | def get_env_state(self): 52 | """ 53 | Return serializable environment state to be saved to checkpoint. 54 | Can be used for stateful training sessions, i.e. with adaptive curriculums. 
55 | """ 56 | return None 57 | 58 | def set_env_state(self, env_state): 59 | pass 60 | 61 | class BaseVecEnv(IVecEnv): 62 | def __init__(self, env_name, num_actors, **kwargs): 63 | kwargs['num_envs'] = num_actors 64 | self.env = env_config[env_name]['env_creator'](**kwargs) 65 | 66 | self.full_state = {} 67 | self.device = kwargs['device'] 68 | 69 | self.full_state["obs"] = self.env.reset(force_reset=True).to(self.device) 70 | 71 | def step(self, actions): 72 | self.full_state["obs"], reward, is_done, info = self.env.step(actions.to(self.device)) 73 | 74 | return self.full_state["obs"].to(self.device), \ 75 | reward.to(self.device), \ 76 | is_done.to(self.device), \ 77 | info 78 | 79 | def reset(self): 80 | self.full_state["obs"] = self.env.reset(force_reset=True) 81 | 82 | return self.full_state["obs"].to(self.device) 83 | 84 | def get_number_of_agents(self): 85 | return self.env.get_number_of_agents() 86 | 87 | def get_env_info(self): 88 | info = {} 89 | info['action_space'] = self.env.action_space 90 | info['observation_space'] = self.env.observation_space 91 | return info 92 | 93 | class RLGPUEnv(IVecEnv): 94 | def __init__(self, env_name, num_actors, **kwargs): 95 | self.env = env_config[env_name]['env_creator'](**kwargs) 96 | 97 | self.full_state = {} 98 | raise NotImplementedError() 99 | self.rl_device = "cuda:0" 100 | 101 | self.full_state["obs"] = self.env.reset(force_reset=True).to(self.rl_device) 102 | print(self.full_state["obs"].shape) 103 | 104 | def step(self, actions): 105 | self.full_state["obs"], reward, is_done, info = self.env.step(actions.to(self.env.device)) 106 | 107 | return self.full_state["obs"].to(self.rl_device), reward.to(self.rl_device), is_done.to(self.rl_device), info 108 | 109 | def reset(self): 110 | self.full_state["obs"] = self.env.reset(force_reset=True) 111 | 112 | return self.full_state["obs"].to(self.rl_device) 113 | 114 | def get_number_of_agents(self): 115 | return self.env.get_number_of_agents() 116 | 117 | def get_env_info(self): 118 | info = {} 119 | info['action_space'] = self.env.action_space 120 | info['observation_space'] = self.env.observation_space 121 | 122 | print(info['action_space'], info['observation_space']) 123 | 124 | return info -------------------------------------------------------------------------------- /src/gippo/rl_algorithm/lr.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import torch.utils as tu 3 | from typing import List 4 | 5 | from gippo.rl_algorithm.base import RLAlgorithm 6 | from gippo.utils import swap_and_flatten01 7 | 8 | class LR(RLAlgorithm): 9 | 10 | def __init__(self, config, env_config, device="cpu", log_path=None): 11 | 12 | super(LR, self).__init__(config, env_config, device, log_path) 13 | 14 | self.actor_lr = float(config["actor_learning_rate"]) 15 | self.actor_optimizer = th.optim.Adam( 16 | self.actor.parameters(), 17 | betas = config['betas'], 18 | lr = self.actor_lr 19 | ) 20 | 21 | def train_actor_critic_no_ppo(self): 22 | 23 | ''' 24 | Set learning rate. 
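        With the 'linear' schedule below, both the actor and critic learning
        rates are annealed from their configured initial values down to 1e-5
        over [max_epochs]; with 'constant' they are left at their initial
        values. A minimal sketch of the interpolation used below (lr0 is a
        stand-in name for the configured initial rate):

            def linear_lr(lr0: float, epoch: int, max_epochs: int,
                          final_lr: float = 1e-5) -> float:
                # epoch 0 -> lr0, epoch == max_epochs -> final_lr
                return (final_lr - lr0) * (epoch / max_epochs) + lr0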
25 | ''' 26 | # set learning rate; 27 | actor_lr = self.actor_lr 28 | critic_lr = self.critic_lr 29 | if self.lr_schedule == 'linear': 30 | actor_lr = (1e-5 - self.actor_lr) * float(self.epoch_num / self.max_epochs) + self.actor_lr 31 | critic_lr = (1e-5 - self.critic_lr) * float(self.epoch_num / self.max_epochs) + self.critic_lr 32 | 33 | for param_group in self.actor_optimizer.param_groups: 34 | param_group['lr'] = actor_lr 35 | for param_group in self.critic_optimizer.param_groups: 36 | param_group['lr'] = critic_lr 37 | 38 | self.writer.add_scalar("info/actor_lr", actor_lr, self.epoch_num) 39 | self.writer.add_scalar("info/critic_lr", critic_lr, self.epoch_num) 40 | 41 | return super().train_actor_critic_no_ppo() 42 | 43 | def use_analytic_grads(self): 44 | 45 | return False 46 | 47 | def use_ppo(self): 48 | 49 | return False 50 | 51 | def get_optimizers_state(self): 52 | state = super().get_optimizers_state() 53 | state['actor'] = self.actor_optimizer.state_dict() 54 | 55 | return state 56 | 57 | def train_actor_no_ppo(self, 58 | grad_start: th.Tensor, 59 | grad_obses: List[th.Tensor], 60 | grad_rp_eps: List[th.Tensor], 61 | grad_actions: List[th.Tensor], 62 | grad_values: List[th.Tensor], 63 | grad_next_values: List[th.Tensor], 64 | grad_rewards: List[th.Tensor], 65 | grad_fdones: List[th.Tensor], 66 | last_fdones: th.Tensor): 67 | ''' 68 | Train actor using Likelihood-Ratio (LR) techinque. 69 | 70 | There are two additional measures to reduce variance: 71 | 1. Use advantage term instead of total expected return. 72 | (Using total expected return resulted in hopless results in some problems...) 73 | 2. Normalize advantages (if [normalize_advantage] flag is set). 74 | ''' 75 | 76 | self.actor.train() 77 | 78 | with th.no_grad(): 79 | # compute advantages; 80 | curr_grad_advs = self.grad_advantages(self.tau, 81 | grad_values, 82 | grad_next_values, 83 | grad_rewards, 84 | grad_fdones, 85 | last_fdones) 86 | 87 | t_obses = swap_and_flatten01(th.stack(grad_obses, dim=0)) 88 | t_advantages = swap_and_flatten01(th.stack(curr_grad_advs, dim=0)) 89 | t_actions = swap_and_flatten01(th.stack(grad_actions, dim=0)) 90 | 91 | # to reduce variance, we admit normalizing advantages; 92 | if self.normalize_advantage: 93 | t_advantages = (t_advantages - t_advantages.mean()) / (t_advantages.std() + 1e-8) 94 | 95 | _, mu, std, _ = self.actor.forward_with_dist(t_obses) 96 | t_neglogpacs = self.neglogp(t_actions, mu, std, th.log(std)) 97 | 98 | actor_loss = t_advantages * t_neglogpacs.unsqueeze(-1) 99 | 100 | # divide by number of (s, a) pairs; 101 | actor_loss = th.mean(actor_loss) 102 | 103 | self.actor_optimizer.zero_grad() 104 | actor_loss.backward() 105 | if self.truncate_grads: 106 | th.nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_norm) 107 | self.actor_optimizer.step() -------------------------------------------------------------------------------- /envs/func_optim/ackley.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as th 3 | import os 4 | 5 | import matplotlib.pyplot as plt 6 | 7 | from envs.func_optim.base import FuncOptimEnv 8 | 9 | class AckleyEnv(FuncOptimEnv): 10 | 11 | def __init__(self, 12 | num_envs, 13 | dim=1, 14 | seed=0, 15 | no_grad=True, 16 | render=False, 17 | device='cuda:0'): 18 | 19 | super(AckleyEnv, self).__init__( 20 | num_envs=num_envs, 21 | dim=dim, 22 | seed=seed, 23 | no_grad=no_grad, 24 | render=render, 25 | device=device) 26 | 27 | self.a = 20 28 | self.b = 0.2 29 | self.c = 
2.0 * th.pi 30 | self.bound = 32.768 31 | 32 | def preprocess_actions(self, actions: th.Tensor): 33 | actions = super().preprocess_actions(actions) 34 | actions = actions * self.bound 35 | return actions 36 | 37 | def render(self, mode = 'human', actions = None, p_actions = None): 38 | 39 | if self.visualize: 40 | 41 | assert self.dim == 1, "" 42 | 43 | min_action = -self.bound 44 | max_action = self.bound 45 | step = (max_action - min_action) / self.render_resolution 46 | 47 | x = th.arange(min_action, max_action, step).unsqueeze(-1) 48 | y = self.evaluate(x) 49 | 50 | x = x[:, 0].cpu().numpy() 51 | y = y.cpu().numpy() 52 | 53 | f = plt.figure() 54 | f.set_figwidth(6.4 * 2) 55 | f.set_figheight(4.8 * 2) 56 | 57 | plt.plot(x, y, color='blue') 58 | 59 | with th.no_grad(): 60 | 61 | if actions == None: 62 | x = self.actions[:, 0].cpu().numpy() 63 | y = self.rew_buf.cpu().numpy() 64 | elif actions != None: 65 | x = th.clip(actions, -1, 1) * self.bound 66 | y = self.evaluate(x) 67 | 68 | x = x[:, 0].cpu().numpy() 69 | y = y.cpu().numpy() 70 | else: 71 | raise ValueError() 72 | 73 | plt.plot(x, y, 'x', color='black', markersize=5e-0) 74 | 75 | with th.no_grad(): 76 | 77 | if p_actions != None: 78 | x = th.clip(p_actions, -1, 1) * self.bound 79 | y = self.evaluate(x) 80 | 81 | x = x[:, 0].cpu().numpy() 82 | y = y.cpu().numpy() 83 | 84 | plt.plot(x, y, 'o', color='red', markersize=2e-0) 85 | 86 | plt.title("Ackley Function, Step {}".format(self.num_frames)) 87 | plt.xlabel("x") 88 | plt.ylabel("y") 89 | 90 | dir = './outputs/ackley/' 91 | 92 | if not os.path.exists(dir): 93 | os.makedirs(dir) 94 | 95 | plt.savefig("./outputs/ackley/ackley_{}.png".format(self.num_frames)) 96 | 97 | def reset(self, env_ids=None, force_reset=True): 98 | 99 | self.calculateObservations() 100 | 101 | return self.obs_buf 102 | 103 | ''' 104 | cut off the gradient from the current state to previous states 105 | ''' 106 | def clear_grad(self): 107 | 108 | pass 109 | 110 | ''' 111 | This function starts collecting a new trajectory from the current states but cut off the computation graph to the previous states. 
112 | It has to be called every time the algorithm starts an episode and return the observation vectors 113 | ''' 114 | def initialize_trajectory(self): 115 | self.clear_grad() 116 | self.calculateObservations() 117 | return self.obs_buf 118 | 119 | def calculateObservations(self): 120 | 121 | self.obs_buf = th.zeros_like(self.obs_buf) 122 | 123 | def calculateReward(self): 124 | 125 | self.rew_buf = self.evaluate(self.actions) 126 | 127 | # reset agents 128 | self.reset_buf = th.where(self.progress_buf > self.episode_length - 1, th.ones_like(self.reset_buf), self.reset_buf) 129 | 130 | def evaluate(self, x: th.Tensor): 131 | 132 | t0 = th.zeros((len(x),), device=x.device, dtype=x.dtype) 133 | t1 = th.zeros((len(x),), device=x.device, dtype=x.dtype) 134 | one = th.ones((len(x),), device=x.device, dtype=x.dtype) 135 | 136 | for i in range(self.dim): 137 | 138 | xi = x[:, i] 139 | t0 = t0 + th.pow(xi, 2.0) 140 | t1 = t1 + th.cos(self.c * xi) 141 | 142 | t0 = t0 / self.dim 143 | t1 = t1 / self.dim 144 | 145 | y = -self.a * th.exp(-self.b * th.sqrt(t0)) - th.exp(t1) + self.a + th.exp(one) 146 | 147 | return -y -------------------------------------------------------------------------------- /src/gippo/network.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Modified from 3 | https://github.com/NVlabs/DiffRL/blob/main/models/actor.py 4 | https://github.com/NVlabs/DiffRL/blob/main/models/critic.py 5 | https://github.com/NVlabs/DiffRL/blob/main/models/model_utils.py 6 | ''' 7 | import numpy as np 8 | import torch as th 9 | from torch import nn 10 | 11 | from gippo.utils import Normal 12 | 13 | ''' 14 | Initialize the parameters of module using the given weight and bias initialization functions. 15 | ''' 16 | def init(module, weight_init, bias_init, gain=1): 17 | weight_init(module.weight.data) #, gain=gain) 18 | bias_init(module.bias.data) 19 | return module 20 | 21 | def get_activation_func(activation_name): 22 | if activation_name.lower() == 'tanh': 23 | return nn.Tanh() 24 | elif activation_name.lower() == 'relu': 25 | return nn.ReLU() 26 | elif activation_name.lower() == 'elu': 27 | return nn.ELU() 28 | elif activation_name.lower() == 'identity': 29 | return nn.Identity() 30 | else: 31 | raise NotImplementedError('Activation func {} not defined'.format(activation_name)) 32 | 33 | ''' 34 | Actor 35 | ''' 36 | class ActorStochasticMLP(nn.Module): 37 | def __init__(self, 38 | obs_dim, 39 | action_dim, 40 | cfg_network, 41 | device='cuda:0'): 42 | super(ActorStochasticMLP, self).__init__() 43 | 44 | self.device = device 45 | self.layer_dims = [obs_dim] + cfg_network['actor_mlp']['units'] 46 | 47 | modules = [] 48 | for i in range(len(self.layer_dims) - 1): 49 | modules.append(nn.Linear(self.layer_dims[i], self.layer_dims[i + 1])) 50 | modules.append(get_activation_func(cfg_network['actor_mlp']['activation'])) 51 | modules.append(th.nn.LayerNorm(self.layer_dims[i + 1])) 52 | self.actor_mlp = nn.Sequential(*modules).to(device) 53 | 54 | # mu; 55 | out_size = self.layer_dims[-1] 56 | self.mu = [nn.Linear(out_size, action_dim), get_activation_func('identity')] 57 | self.mu = nn.Sequential(*self.mu).to(device) 58 | 59 | # logstd; 60 | self.fixed_sigma = cfg_network['fixed_sigma'] 61 | if cfg_network['fixed_sigma']: 62 | logstd = cfg_network.get('actor_logstd_init', -1.0) 63 | self.logstd = nn.Parameter(th.ones(action_dim, dtype=th.float32, device=device) * logstd) 64 | else: 65 | self.logstd = nn.Linear(out_size, action_dim).to(device) 66 | 67 | 
self.action_dim = action_dim 68 | self.obs_dim = obs_dim 69 | 70 | # print(self.actor_mlp) 71 | # print(self.mu) 72 | # print(self.logstd) 73 | 74 | def forward(self, obs, deterministic = False): 75 | out = self.actor_mlp(obs) 76 | mu = self.mu(out) 77 | 78 | if deterministic: 79 | return mu 80 | else: 81 | if self.fixed_sigma: 82 | std = self.logstd.exp() # (num_actions) 83 | else: 84 | std = th.exp(self.logstd(out)) 85 | dist = Normal(mu, std) 86 | sample = dist.rsample() 87 | return sample 88 | 89 | def forward_with_dist(self, obs, deterministic = False): 90 | mu, std = self.forward_dist(obs) 91 | 92 | dist = Normal(mu, std) 93 | eps = dist.sample_eps() 94 | 95 | if deterministic: 96 | eps = eps.zero_() 97 | sample = dist.eps_to_action(eps) 98 | 99 | return sample, mu, std, eps 100 | 101 | def evaluate_actions_log_probs(self, obs, actions): 102 | mu, std = self.forward_dist(obs) 103 | dist = Normal(mu, std) 104 | return dist.log_prob(actions) 105 | 106 | def forward_dist(self, obs): 107 | out = self.actor_mlp(obs) 108 | mu = self.mu(out) 109 | if self.fixed_sigma: 110 | std = self.logstd.exp() # (num_actions) 111 | else: 112 | std = th.exp(self.logstd(out)) 113 | 114 | return mu, std 115 | 116 | ''' 117 | Critic 118 | ''' 119 | class CriticMLP(nn.Module): 120 | def __init__(self, obs_dim, cfg_network, device='cuda:0'): 121 | super(CriticMLP, self).__init__() 122 | 123 | self.device = device 124 | 125 | self.layer_dims = [obs_dim] + cfg_network['critic_mlp']['units'] + [1] 126 | 127 | init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init. 128 | constant_(x, 0), np.sqrt(2)) 129 | 130 | modules = [] 131 | for i in range(len(self.layer_dims) - 1): 132 | modules.append(init_(nn.Linear(self.layer_dims[i], self.layer_dims[i + 1]))) 133 | if i < len(self.layer_dims) - 2: 134 | modules.append(get_activation_func(cfg_network['critic_mlp']['activation'])) 135 | modules.append(nn.LayerNorm(self.layer_dims[i + 1])) 136 | 137 | self.critic = nn.Sequential(*modules).to(device) 138 | 139 | self.obs_dim = obs_dim 140 | 141 | # print(self.critic) 142 | 143 | def forward(self, observations): 144 | return self.critic(observations) 145 | -------------------------------------------------------------------------------- /src/gippo/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as th 3 | import random 4 | import os 5 | 6 | ''' 7 | From 8 | https://github.com/NVlabs/DiffRL/blob/a4c0dd1696d3c3b885ce85a3cb64370b580cb913/utils/common.py#L72 9 | ''' 10 | def seeding(seed=0, torch_deterministic=False): 11 | print("Setting seed: {}".format(seed)) 12 | 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | th.manual_seed(seed) 16 | os.environ['PYTHONHASHSEED'] = str(seed) 17 | th.cuda.manual_seed(seed) 18 | th.cuda.manual_seed_all(seed) 19 | 20 | if torch_deterministic: 21 | # refer to https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility 22 | os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' 23 | th.backends.cudnn.benchmark = False 24 | th.backends.cudnn.deterministic = True 25 | th.use_deterministic_algorithms(True) 26 | else: 27 | th.backends.cudnn.benchmark = True 28 | th.backends.cudnn.deterministic = False 29 | 30 | return seed 31 | 32 | from torch.distributions.utils import _standard_normal 33 | class Normal(th.distributions.Normal): 34 | 35 | def __init__(self, loc, scale, validate_args=None): 36 | super().__init__(loc, scale, validate_args) 37 | 38 | def sample_eps(self, sample_shape=th.Size()): 39 
| shape = self._extended_shape(sample_shape) 40 | eps = _standard_normal(shape, dtype=self.loc.dtype, device=self.loc.device) 41 | return eps 42 | 43 | def eps_to_action(self, eps): 44 | return self.loc + eps * self.scale 45 | 46 | ''' 47 | From 48 | https://github.com/NVlabs/DiffRL/blob/main/utils/running_mean_std.py 49 | ''' 50 | from typing import Tuple 51 | class RunningMeanStd(object): 52 | def __init__(self, epsilon: float = 1e-4, shape: Tuple[int, ...] = (), device = 'cuda:0'): 53 | """ 54 | Calulates the running mean and std of a data stream 55 | https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 56 | :param epsilon: helps with arithmetic issues 57 | :param shape: the shape of the data stream's output 58 | """ 59 | self.mean = th.zeros(shape, dtype = th.float32, device = device) 60 | self.var = th.ones(shape, dtype = th.float32, device = device) 61 | self.count = epsilon 62 | 63 | def to(self, device): 64 | rms = RunningMeanStd(device = device) 65 | rms.mean = self.mean.to(device).clone() 66 | rms.var = self.var.to(device).clone() 67 | rms.count = self.count 68 | return rms 69 | 70 | @th.no_grad() 71 | def update(self, arr: th.tensor) -> None: 72 | batch_mean = th.mean(arr, dim = 0) 73 | batch_var = th.var(arr, dim = 0, unbiased = False) 74 | batch_count = arr.shape[0] 75 | self.update_from_moments(batch_mean, batch_var, batch_count) 76 | 77 | def update_from_moments(self, batch_mean: th.tensor, batch_var: th.tensor, batch_count: int) -> None: 78 | delta = batch_mean - self.mean 79 | tot_count = self.count + batch_count 80 | 81 | new_mean = self.mean + delta * batch_count / tot_count 82 | m_a = self.var * self.count 83 | m_b = batch_var * batch_count 84 | m_2 = m_a + m_b + th.square(delta) * self.count * batch_count / (self.count + batch_count) 85 | new_var = m_2 / (self.count + batch_count) 86 | 87 | new_count = batch_count + self.count 88 | 89 | self.mean = new_mean 90 | self.var = new_var 91 | self.count = new_count 92 | 93 | def normalize(self, arr:th.tensor, un_norm = False) -> th.tensor: 94 | if not un_norm: 95 | result = (arr - self.mean) / th.sqrt(self.var + 1e-5) 96 | else: 97 | result = arr * th.sqrt(self.var + 1e-5) + self.mean 98 | return result 99 | 100 | ''' 101 | From 102 | https://github.com/SonSang/DiffRL/blob/stable/externals/rl_games/rl_games/algos_torch/torch_ext.py#L275 103 | ''' 104 | class AverageMeter(th.nn.Module): 105 | def __init__(self, in_shape, max_size): 106 | super(AverageMeter, self).__init__() 107 | self.max_size = max_size 108 | self.current_size = 0 109 | self.register_buffer("mean", th.zeros(in_shape, dtype = th.float32)) 110 | 111 | def update(self, values): 112 | size = values.size()[0] 113 | if size == 0: 114 | return 115 | new_mean = th.mean(values.float(), dim=0) 116 | size = np.clip(size, 0, self.max_size) 117 | old_size = min(self.max_size - size, self.current_size) 118 | size_sum = old_size + size 119 | self.current_size = size_sum 120 | self.mean = (self.mean * old_size + new_mean * size) / size_sum 121 | 122 | def clear(self): 123 | self.current_size = 0 124 | self.mean.fill_(0) 125 | 126 | def __len__(self): 127 | return self.current_size 128 | 129 | def get_mean(self): 130 | return self.mean.squeeze(0).cpu().numpy() 131 | 132 | ''' 133 | From 134 | https://github.com/Denys88/rl_games/blob/master/rl_games/common/a2c_common.py#L30 135 | ''' 136 | def swap_and_flatten01(arr): 137 | """ 138 | swap and then flatten axes 0 and 1 139 | """ 140 | if arr is None: 141 | return arr 142 | s = arr.size() 143 | 
return arr.transpose(0, 1).reshape(s[0] * s[1], *s[2:]) 144 | 145 | ''' 146 | From 147 | https://github.com/Denys88/rl_games/blob/master/rl_games/algos_torch/torch_ext.py#L10 148 | ''' 149 | numpy_to_torch_dtype_dict = { 150 | np.dtype('bool') : th.bool, 151 | np.dtype('uint8') : th.uint8, 152 | np.dtype('int8') : th.int8, 153 | np.dtype('int16') : th.int16, 154 | np.dtype('int32') : th.int32, 155 | np.dtype('int64') : th.int64, 156 | np.dtype('float16') : th.float16, 157 | np.dtype('float32') : th.float32, 158 | np.dtype('float64') : th.float64, 159 | np.dtype('complex64') : th.complex64, 160 | np.dtype('complex128') : th.complex128, 161 | } -------------------------------------------------------------------------------- /src/gippo/experience.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Modified from 3 | https://github.com/Denys88/rl_games/blob/master/rl_games/common/experience.py#L285 4 | ''' 5 | 6 | import numpy as np 7 | import torch as th 8 | import gym 9 | 10 | from gippo.utils import numpy_to_torch_dtype_dict 11 | 12 | class ExperienceBuffer: 13 | def __init__(self, env_info, algo_info, device, aux_tensor_dict=None): 14 | self.env_info = env_info 15 | self.algo_info = algo_info 16 | self.device = device 17 | 18 | self.num_agents = env_info.get('agents', 1) 19 | self.action_space = env_info['action_space'] 20 | 21 | self.num_actors = algo_info['num_actors'] 22 | self.horizon_length = algo_info['horizon_length'] 23 | batch_size = self.num_actors * self.num_agents 24 | self.obs_base_shape = (self.horizon_length, self.num_agents * self.num_actors) 25 | self.state_base_shape = (self.horizon_length, self.num_actors) 26 | if type(self.action_space) is gym.spaces.Discrete: 27 | raise ValueError() 28 | if type(self.action_space) is gym.spaces.Tuple: 29 | raise ValueError() 30 | if type(self.action_space) is gym.spaces.Box: 31 | self.actions_shape = (self.action_space.shape[0],) 32 | self.actions_num = self.action_space.shape[0] 33 | self.is_continuous = True 34 | self.tensor_dict = {} 35 | self._init_from_env_info(self.env_info) 36 | 37 | self.aux_tensor_dict = aux_tensor_dict 38 | if self.aux_tensor_dict is not None: 39 | self._init_from_aux_dict(self.aux_tensor_dict) 40 | 41 | def _init_from_env_info(self, env_info): 42 | obs_base_shape = self.obs_base_shape 43 | state_base_shape = self.state_base_shape 44 | 45 | self.tensor_dict['obses'] = self._create_tensor_from_space(env_info['observation_space'], obs_base_shape) 46 | 47 | val_space = gym.spaces.Box(low=0, high=1,shape=(env_info.get('value_size',1),)) 48 | self.tensor_dict['rewards'] = self._create_tensor_from_space(val_space, obs_base_shape) 49 | self.tensor_dict['values'] = self._create_tensor_from_space(val_space, obs_base_shape) 50 | self.tensor_dict['neglogpacs'] = self._create_tensor_from_space(gym.spaces.Box(low=0, high=1,shape=(), dtype=np.float32), obs_base_shape) 51 | self.tensor_dict['dones'] = self._create_tensor_from_space(gym.spaces.Box(low=0, high=1,shape=(), dtype=np.uint8), obs_base_shape) 52 | 53 | assert self.is_continuous, "Only continuous action space is supported" 54 | self.tensor_dict['actions'] = self._create_tensor_from_space(gym.spaces.Box(low=0, high=1,shape=self.actions_shape, dtype=np.float32), obs_base_shape) 55 | self.tensor_dict['mus'] = self._create_tensor_from_space(gym.spaces.Box(low=0, high=1,shape=self.actions_shape, dtype=np.float32), obs_base_shape) 56 | self.tensor_dict['sigmas'] = self._create_tensor_from_space(gym.spaces.Box(low=0, 
high=1,shape=self.actions_shape, dtype=np.float32), obs_base_shape) 57 | 58 | ''' 59 | Gradient info 60 | ''' 61 | # store first and second order analytical gradients of advantage w.r.t. actions; 62 | 63 | base_shape = self.obs_base_shape 64 | action_shape = self.actions_shape 65 | dtype = th.float32 66 | device = self.device 67 | 68 | self.tensor_dict['adv_gradient'] = th.zeros(base_shape + action_shape, dtype=dtype, device=device) 69 | self.tensor_dict['adv_hessian'] = th.zeros(base_shape + action_shape + action_shape, dtype=dtype, device=device) 70 | 71 | def _init_from_aux_dict(self, tensor_dict): 72 | obs_base_shape = self.obs_base_shape 73 | for k,v in tensor_dict.items(): 74 | self.tensor_dict[k] = self._create_tensor_from_space(gym.spaces.Box(low=0, high=1,shape=(v), dtype=np.float32), obs_base_shape) 75 | 76 | def _create_tensor_from_space(self, space, base_shape): 77 | if type(space) is gym.spaces.Box: 78 | dtype = numpy_to_torch_dtype_dict[space.dtype] 79 | return th.zeros(base_shape + space.shape, dtype= dtype, device = self.device) 80 | if type(space) is gym.spaces.Discrete: 81 | dtype = numpy_to_torch_dtype_dict[space.dtype] 82 | return th.zeros(base_shape, dtype= dtype, device = self.device) 83 | if type(space) is gym.spaces.Tuple: 84 | ''' 85 | assuming that tuple is only Discrete tuple 86 | ''' 87 | dtype = numpy_to_torch_dtype_dict[space.dtype] 88 | tuple_len = len(space) 89 | return th.zeros(base_shape +(tuple_len,), dtype= dtype, device = self.device) 90 | if type(space) is gym.spaces.Dict: 91 | t_dict = {} 92 | for k,v in space.spaces.items(): 93 | t_dict[k] = self._create_tensor_from_space(v, base_shape) 94 | return t_dict 95 | 96 | def update_data(self, name, index, val): 97 | if type(val) is dict: 98 | for k,v in val.items(): 99 | self.tensor_dict[name][k][index,:] = v 100 | else: 101 | self.tensor_dict[name][index,:] = val 102 | 103 | def get_transformed(self, transform_op): 104 | res_dict = {} 105 | for k, v in self.tensor_dict.items(): 106 | if type(v) is dict: 107 | transformed_dict = {} 108 | for kd,vd in v.items(): 109 | transformed_dict[kd] = transform_op(vd) 110 | res_dict[k] = transformed_dict 111 | else: 112 | res_dict[k] = transform_op(v) 113 | 114 | return res_dict 115 | 116 | def get_transformed_list(self, transform_op, tensor_list): 117 | res_dict = {} 118 | for k in tensor_list: 119 | v = self.tensor_dict.get(k) 120 | if v is None: 121 | continue 122 | if type(v) is dict: 123 | transformed_dict = {} 124 | for kd,vd in v.items(): 125 | transformed_dict[kd] = transform_op(vd) 126 | res_dict[k] = transformed_dict 127 | else: 128 | res_dict[k] = transform_op(v) 129 | 130 | return res_dict -------------------------------------------------------------------------------- /src/gippo/rl_algorithm/lrp.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import numpy as np 3 | import torch.utils as tu 4 | from typing import List 5 | 6 | from gippo.rl_algorithm.base import RLAlgorithm 7 | from gippo.utils import swap_and_flatten01 8 | 9 | class LRP(RLAlgorithm): 10 | 11 | def __init__(self, config, env_config, device="cpu", log_path=None): 12 | 13 | super(LRP, self).__init__(config, env_config, device, log_path) 14 | 15 | self.actor_lr = float(config["actor_learning_rate"]) 16 | self.actor_optimizer = th.optim.Adam( 17 | self.actor.parameters(), 18 | betas = config['betas'], 19 | lr = self.actor_lr 20 | ) 21 | 22 | ''' 23 | Parameters for sample variance estimation of policy gradients. 
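        Later in this file, the variance of each gradient estimator is
        measured as the trace of the sample covariance of per-sample parameter
        gradients, each flattened and truncated to the first
        [var_est_max_grad_len] entries, using at most [var_est_num_sample]
        samples. A minimal sketch of that statistic, assuming grads is a
        (num_samples, grad_len) tensor of per-sample gradients:

            import torch as th

            def grad_variance(grads: th.Tensor) -> th.Tensor:
                cov = th.cov(grads.transpose(0, 1))  # (grad_len, grad_len) sample covariance
                if cov.ndim == 0:                    # handle the degenerate 0-dim case, as in the code below
                    cov = cov.reshape(1, 1)
                return cov.diagonal(0).sum()         # trace = summed per-coordinate variance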
24 | ''' 25 | # [var_est_num_sample]: Number of samples to use for sample variance estimation; 26 | self.var_est_num_sample = config.get("var_est_num_sample", 16) 27 | 28 | # [var_est_max_grad_len]: Length of the first N values (in policy gradients) to use 29 | # for sample variance estimation; 30 | self.var_est_max_grad_len = config.get("var_est_max_grad_len", 512) 31 | 32 | def train_actor_critic_no_ppo(self): 33 | 34 | ''' 35 | Set learning rate. 36 | ''' 37 | # set learning rate; 38 | actor_lr = self.actor_lr 39 | critic_lr = self.critic_lr 40 | if self.lr_schedule == 'linear': 41 | actor_lr = (1e-5 - self.actor_lr) * float(self.epoch_num / self.max_epochs) + self.actor_lr 42 | critic_lr = (1e-5 - self.critic_lr) * float(self.epoch_num / self.max_epochs) + self.critic_lr 43 | 44 | for param_group in self.actor_optimizer.param_groups: 45 | param_group['lr'] = actor_lr 46 | for param_group in self.critic_optimizer.param_groups: 47 | param_group['lr'] = critic_lr 48 | 49 | self.writer.add_scalar("info/actor_lr", actor_lr, self.epoch_num) 50 | self.writer.add_scalar("info/critic_lr", critic_lr, self.epoch_num) 51 | 52 | return super().train_actor_critic_no_ppo() 53 | 54 | def use_analytic_grads(self): 55 | 56 | return True 57 | 58 | def use_ppo(self): 59 | 60 | return False 61 | 62 | def get_optimizers_state(self): 63 | state = super().get_optimizers_state() 64 | state['actor'] = self.actor_optimizer.state_dict() 65 | 66 | return state 67 | 68 | def train_actor_no_ppo(self, 69 | grad_start: th.Tensor, 70 | grad_obses: List[th.Tensor], 71 | grad_rp_eps: List[th.Tensor], 72 | grad_actions: List[th.Tensor], 73 | grad_values: List[th.Tensor], 74 | grad_next_values: List[th.Tensor], 75 | grad_rewards: List[th.Tensor], 76 | grad_fdones: List[th.Tensor], 77 | last_fdones: th.Tensor): 78 | ''' 79 | Combine policy gradients obtained through LR and RP techinques. 80 | 81 | Use sample variance of the policy gradients to combine them. 82 | ''' 83 | 84 | self.actor.train() 85 | 86 | lr_gradient_var = None 87 | rp_gradient_var = None 88 | 89 | ''' 90 | Preliminaries 91 | ''' 92 | # compute advantages; 93 | curr_grad_advs = self.grad_advantages(self.tau, 94 | grad_values, 95 | grad_next_values, 96 | grad_rewards, 97 | grad_fdones, 98 | last_fdones) 99 | 100 | ''' 101 | Estimate LR gradients and their variances. 
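        Each sampled (s, a) pair below contributes one surrogate loss
        A(s, a) * (-log pi(a | s)); backpropagating a single such loss gives
        one sample of the LR (REINFORCE) policy gradient. To bound the cost,
        only a random subset of [var_est_num_sample] pairs is differentiated,
        each gradient is flattened and truncated to [var_est_max_grad_len]
        entries, and the trace of the sample covariance of those vectors
        serves as the LR variance estimate. A small sketch of the flattening
        step, with params standing in for the actor parameters right after a
        per-sample backward pass:

            import torch as th

            def flat_grad(params, max_len: int = 512) -> th.Tensor:
                # concatenate parameter gradients into one vector, then truncate
                flat = th.cat([p.grad.reshape(-1) for p in params])
                return flat[:max_len]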
102 | ''' 103 | 104 | with th.no_grad(): 105 | t_obses = swap_and_flatten01(th.stack(grad_obses, dim=0)) 106 | t_advantages = swap_and_flatten01(th.stack(curr_grad_advs, dim=0)) 107 | t_actions = swap_and_flatten01(th.stack(grad_actions, dim=0)) 108 | 109 | # to reduce variance, we admit normalizing advantages; 110 | if self.normalize_advantage: 111 | t_advantages = (t_advantages - t_advantages.mean()) / (t_advantages.std() + 1e-8) 112 | 113 | _, mu, std, _ = self.actor.forward_with_dist(t_obses) 114 | t_neglogpacs = self.neglogp(t_actions, mu, std, th.log(std)) 115 | 116 | actor_loss = t_advantages * t_neglogpacs.unsqueeze(-1) 117 | 118 | # randomly select subset to compute sample variance; 119 | sample_num = np.min([self.var_est_num_sample, len(actor_loss)]) #if len(actor_loss) > 64 else len(actor_loss) 120 | actor_loss_num = len(actor_loss) 121 | actor_loss_indices = th.randperm(actor_loss_num)[:sample_num] 122 | lr_gradients = [] 123 | for ai in actor_loss_indices: 124 | al = actor_loss[ai].sum() 125 | 126 | self.actor_optimizer.zero_grad() 127 | al.backward(retain_graph=True) 128 | assert len(self.actor_optimizer.param_groups) == 1, "" 129 | grad_list = [] 130 | for param in self.actor_optimizer.param_groups[0]['params']: 131 | grad_list.append(param.grad.reshape([-1])) 132 | grad = th.cat(grad_list) 133 | 134 | # if length of the gradient is too long, we truncate it 135 | # because it is too time consuming to use all of the gradients; 136 | if len(grad) > self.var_est_max_grad_len: 137 | grad = grad[:self.var_est_max_grad_len] 138 | lr_gradients.append(grad) 139 | 140 | lr_gradients = th.stack(lr_gradients, dim=0) 141 | 142 | lr_gradient_cov = th.cov(lr_gradients.transpose(0, 1)) 143 | if lr_gradient_cov.ndim == 0: 144 | lr_gradient_cov = lr_gradient_cov.unsqueeze(0).unsqueeze(0) 145 | lr_gradient_var = lr_gradient_cov.diagonal(0).sum() 146 | 147 | ''' 148 | Estimate RP gradients and their variances. 149 | ''' 150 | 151 | # add value of the states; 152 | for i in range(len(grad_values)): 153 | curr_grad_advs[i] = curr_grad_advs[i] + grad_values[i] 154 | 155 | rp_gradients = [] 156 | for i in range(grad_start.shape[0]): 157 | for j in range(grad_start.shape[1]): 158 | if not grad_start[i, j]: 159 | continue 160 | 161 | al: th.Tensor = -curr_grad_advs[i][j].sum() 162 | 163 | self.actor_optimizer.zero_grad() 164 | al.backward(retain_graph=True) 165 | assert len(self.actor_optimizer.param_groups) == 1, "" 166 | grad_list = [] 167 | for param in self.actor_optimizer.param_groups[0]['params']: 168 | grad_list.append(param.grad.reshape([-1])) 169 | grad = th.cat(grad_list) 170 | 171 | # if length of the gradient is too long, we truncate it 172 | # because it is too time consuming to use all of the gradients; 173 | if len(grad) > self.var_est_max_grad_len: 174 | grad = grad[:self.var_est_max_grad_len] 175 | rp_gradients.append(grad) 176 | 177 | if len(rp_gradients) >= self.var_est_num_sample: 178 | break 179 | 180 | if len(rp_gradients) >= self.var_est_num_sample: 181 | break 182 | 183 | rp_gradients = th.stack(rp_gradients, dim=0) 184 | 185 | rp_gradient_cov = th.cov(rp_gradients.transpose(0, 1)) 186 | if rp_gradient_cov.ndim == 0: 187 | rp_gradient_cov = rp_gradient_cov.unsqueeze(0).unsqueeze(0) 188 | rp_gradient_var = rp_gradient_cov.diagonal(0).sum() 189 | 190 | ''' 191 | Interpolate LR and RP gradients using sample variances. 192 | ''' 193 | k_lr = (rp_gradient_var) / (lr_gradient_var + rp_gradient_var + 1e-8) 194 | k_rp = 1. 
- k_lr 195 | 196 | # self.writer.add_scalar("info/basic_k_lr", k_lr, self.epoch_num) 197 | 198 | lr_actor_loss = t_advantages * t_neglogpacs.unsqueeze(-1) 199 | lr_actor_loss = th.mean(lr_actor_loss) 200 | 201 | rp_actor_loss = -self.grad_advantages_first_terms_sum(curr_grad_advs, grad_start) 202 | rp_actor_loss = rp_actor_loss / th.count_nonzero(grad_start) 203 | 204 | actor_loss = (lr_actor_loss * k_lr) + (rp_actor_loss * k_rp) 205 | 206 | # update actor; 207 | self.actor_optimizer.zero_grad() 208 | actor_loss.backward() 209 | if self.truncate_grads: 210 | th.nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_norm) 211 | self.actor_optimizer.step() -------------------------------------------------------------------------------- /src/gippo/rl_algorithm/ppo.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | from typing import List 3 | 4 | from gippo.rl_algorithm.base import RLAlgorithm 5 | from gippo.dataset import PPODataset 6 | from copy import deepcopy 7 | 8 | class PPO(RLAlgorithm): 9 | 10 | def __init__(self, config, env_config, device="cpu", log_path=None): 11 | 12 | super(PPO, self).__init__(config, env_config, device, log_path) 13 | 14 | self.actor_lr = float(config["actor_learning_rate"]) 15 | self.actor_optimizer = th.optim.Adam( 16 | self.actor.parameters(), 17 | lr = self.actor_lr, 18 | eps = 1e-8, 19 | ) 20 | 21 | ppo_config = config.get("ppo", {}) 22 | 23 | # clipping parameter for PPO updates; 24 | self.e_clip = float(ppo_config.get("e_clip", 0.2)) 25 | 26 | # minibatch settings for PPO updates; 27 | self.mini_epochs = int(ppo_config.get("mini_epochs", 5)) 28 | self.minibatch_size = int(ppo_config.get("minibatch_size", 29 | self.horizon_length * self.num_actors)) 30 | self.dataset = PPODataset(self.batch_size, 31 | self.minibatch_size, 32 | device) 33 | 34 | ''' 35 | Measures to prevent false optimization. 36 | 37 | Theoretically, we optimize surrogate loss function for learning 38 | better policy. However, if learning rate is too large, the optimization 39 | result could be worse than the previous one. If such case is detected, 40 | we decrease the learning rate and try again. [max_optim_iter] denotes 41 | the maximum number of such cycles. 
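        In outline (see train_actor_ppo below): the current actor is backed
        up, the PPO mini-epochs are run, and the mean surrogate loss over the
        last mini-epoch is compared against that of the first. If the loss
        increased, the backup is restored, the learning rate is divided by
        [learning_rate_multiplier], and the cycle is retried, up to
        [max_optim_iter] times. A minimal sketch of the accept/reject test,
        assuming losses is the flat list of per-minibatch losses and n is the
        number of minibatches per mini-epoch:

            import torch as th

            def update_improved(losses, n: int) -> bool:
                first = th.stack(losses[:n]).mean()   # mean loss of first mini-epoch
                last = th.stack(losses[-n:]).mean()   # mean loss of last mini-epoch
                worse = bool(last > first)            # loss increased: roll back
                return not worse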
42 | ''' 43 | # use backup actor to restore the previous policy when optimization fails; 44 | self.b_actor = deepcopy(self.actor) 45 | 46 | # maximum number of iterations for actor optimization; 47 | self.max_optim_iter = int(ppo_config.get("max_optim_iter", 8)) 48 | 49 | # multiplier to decrease learning rate; 50 | self.learning_rate_multiplier = float(ppo_config.get("learning_rate_multiplier", 1.5)) 51 | 52 | def train_actor_critic_no_ppo(self): 53 | 54 | return super().train_actor_critic_no_ppo() 55 | 56 | def use_analytic_grads(self): 57 | 58 | return False 59 | 60 | def use_ppo(self): 61 | 62 | return True 63 | 64 | def get_optimizers_state(self): 65 | state = super().get_optimizers_state() 66 | state['actor'] = self.actor_optimizer.state_dict() 67 | 68 | return state 69 | 70 | def train_actor_no_ppo(self, 71 | grad_start: th.Tensor, 72 | grad_obses: List[th.Tensor], 73 | grad_rp_eps: List[th.Tensor], 74 | grad_actions: List[th.Tensor], 75 | grad_values: List[th.Tensor], 76 | grad_next_values: List[th.Tensor], 77 | grad_rewards: List[th.Tensor], 78 | grad_fdones: List[th.Tensor], 79 | last_fdones: th.Tensor): 80 | 81 | pass 82 | 83 | def train_actor_ppo(self, batch_dict): 84 | 85 | self.prepare_dataset(batch_dict) 86 | 87 | # backup actor and optimizer to prevent policy degradation; 88 | self.backup_actor() 89 | 90 | initial_actor_lr = self.actor_lr 91 | 92 | for iter in range(self.max_optim_iter): 93 | 94 | a_losses = [] 95 | 96 | for _ in range(0, self.mini_epochs): 97 | 98 | for i in range(len(self.dataset)): 99 | 100 | a_loss, cmu, csigma = self.calc_gradients(self.dataset[i]) 101 | a_losses.append(a_loss) 102 | self.dataset.update_mu_sigma(cmu, csigma) 103 | 104 | # this is erroneous code in original implementation, 105 | # put here for fair reproducibility; 106 | for param in self.actor_optimizer.param_groups: 107 | param['lr'] = self.actor_lr 108 | 109 | first_mini_epoch_loss = th.stack(a_losses[:len(self.dataset)]).mean() 110 | last_mini_epoch_loss = th.stack(a_losses[-len(self.dataset):]).mean() 111 | 112 | if last_mini_epoch_loss > first_mini_epoch_loss: 113 | 114 | with th.no_grad(): 115 | 116 | # optimization failed, restore the previous policy; 117 | self.restore_actor() 118 | 119 | # decrease learning rate; 120 | # @TODO: this is also an error in original implementation, 121 | # put here for fair reproducibility; 122 | for param in self.actor_optimizer.param_groups: 123 | param['lr'] = initial_actor_lr / self.learning_rate_multiplier 124 | self.actor_lr = initial_actor_lr / self.learning_rate_multiplier 125 | else: 126 | # @TODO: this is also an error in original implementation, 127 | # put here for fair reproducibility; 128 | self.actor_lr = initial_actor_lr 129 | break 130 | 131 | self.writer.add_scalar("info/actor_lr", self.actor_lr, self.epoch_num) 132 | 133 | return a_losses 134 | 135 | def prepare_dataset(self, batch_dict): 136 | 137 | obses = batch_dict['obses'] 138 | advantages = batch_dict['advantages'] 139 | dones = batch_dict['dones'] 140 | values = batch_dict['values'] 141 | actions = batch_dict['actions'] 142 | neglogpacs = batch_dict['neglogpacs'] 143 | mus = batch_dict['mus'] 144 | sigmas = batch_dict['sigmas'] 145 | 146 | advantages = th.sum(advantages, axis=1) 147 | 148 | if self.normalize_advantage: 149 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) 150 | 151 | dataset_dict = {} 152 | dataset_dict['old_values'] = values 153 | dataset_dict['advantages'] = advantages 154 | dataset_dict['actions'] = actions 155 | 
dataset_dict['obs'] = obses 156 | 157 | dataset_dict['old_mu'] = mus 158 | dataset_dict['old_sigma'] = sigmas 159 | dataset_dict['old_logp_actions'] = neglogpacs 160 | 161 | dataset_dict['mu'] = mus 162 | dataset_dict['sigma'] = sigmas 163 | dataset_dict['logp_actions'] = neglogpacs 164 | 165 | self.dataset.update_values_dict(dataset_dict) 166 | 167 | def backup_actor(self): 168 | 169 | with th.no_grad(): 170 | for param, param_targ in zip(self.actor.parameters(), self.b_actor.parameters()): 171 | param_targ.data.mul_(0.) 172 | param_targ.data.add_(param.data) 173 | 174 | def restore_actor(self): 175 | 176 | with th.no_grad(): 177 | for param, param_targ in zip(self.b_actor.parameters(), self.actor.parameters()): 178 | param_targ.data.mul_(0.) 179 | param_targ.data.add_(param.data) 180 | 181 | def calc_gradients(self, input_dict): 182 | 183 | advantage = input_dict['advantages'] 184 | actions_batch = input_dict['actions'] 185 | obs_batch = input_dict['obs'] 186 | old_action_log_probs_batch = input_dict['old_logp_actions'] # original action log probs; 187 | curr_e_clip = self.e_clip 188 | 189 | # get current policy's actions; 190 | curr_mu, curr_std = self.actor.forward_dist(obs_batch) 191 | if curr_std.ndim == 1: 192 | curr_std = curr_std.unsqueeze(0) 193 | curr_std = curr_std.expand(curr_mu.shape[0], -1).clone() 194 | neglogp = self.neglogp(actions_batch, curr_mu, curr_std, th.log(curr_std)) 195 | 196 | a_loss = self.actor_loss(old_action_log_probs_batch, 197 | neglogp, 198 | advantage, 199 | curr_e_clip).mean() 200 | 201 | # we only use actor loss here for fair comparison; 202 | loss = a_loss 203 | 204 | self.actor_optimizer.zero_grad() 205 | loss.backward() 206 | if self.truncate_grads: 207 | th.nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_norm) 208 | self.actor_optimizer.step() 209 | 210 | self.train_result = (a_loss, curr_mu.detach(), curr_std.detach()) 211 | 212 | return self.train_result 213 | 214 | def actor_loss(self, old_action_log_probs_batch, action_log_probs, advantage, curr_e_clip): 215 | ratio = old_action_log_probs_batch - action_log_probs 216 | ratio = th.clamp(ratio, max=64.0) # prevent ratio becoming [inf]; 217 | ratio = th.exp(ratio) 218 | 219 | surr1 = advantage * ratio 220 | surr2 = advantage * th.clamp(ratio, 1.0 - curr_e_clip, 221 | 1.0 + curr_e_clip) 222 | a_loss = th.max(-surr1, -surr2) 223 | 224 | return a_loss -------------------------------------------------------------------------------- /src/gippo/rl_algorithm/gippo.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import numpy as np 3 | from typing import List 4 | 5 | from gippo.rl_algorithm.ppo import PPO 6 | from gippo.utils import swap_and_flatten01, Normal, RunningMeanStd 7 | 8 | class GIPPO(PPO): 9 | 10 | def __init__(self, config, env_config, device="cpu", log_path=None): 11 | 12 | super(GIPPO, self).__init__(config, env_config, device, log_path) 13 | 14 | ''' 15 | Use different optimizers for analytical gradient-based 16 | actor update and PPO-based actor update 17 | @TODO: Merge two optimizers? 18 | ''' 19 | self.actor_lr_no_ppo = float(config["actor_learning_rate_no_ppo"]) 20 | self.actor_optimizer_no_ppo = th.optim.Adam( 21 | self.actor.parameters(), 22 | betas = config['betas'], 23 | lr = self.actor_lr_no_ppo 24 | ) 25 | 26 | ''' 27 | Parameters for alpha-policy updates. 
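        The alpha-policy is the current policy shifted along the analytical
        advantage gradient: each sampled action a is mapped to
        a + alpha * dA/da, and the actor is regressed onto these shifted
        actions through the reparameterization trick (see train_actor_no_ppo
        below). [alpha] is the step size along the gradient, [alpha_interval]
        bounds how far the estimated det(I + alpha * advantage Hessian) may
        drift from 1, [alpha_update_factor] scales alpha up or down between
        epochs, [max_alpha] caps it, [num_iter] is the number of regression
        steps per attempt, and [max_oorr] is the out-of-range-ratio threshold
        above which alpha is reduced so that room is left for the PPO update.
        A minimal sketch of the shifted-action target, using the names adopted
        later in this class:

            import torch as th

            def alpha_actions(actions: th.Tensor, adv_grad: th.Tensor,
                              alpha: float) -> th.Tensor:
                # shift each sampled action along d(advantage)/d(action)
                return actions + alpha * adv_grad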
28 | ''' 29 | gi_config = config.get("gi", {}) 30 | self.gi_alpha = float(gi_config.get("alpha", 1e-3)) 31 | self.gi_alpha_interval = float(gi_config.get("alpha_interval", 0.2)) 32 | self.gi_alpha_update_factor = float(gi_config.get("alpha_update_factor", 1.1)) 33 | self.gi_max_alpha = float(gi_config.get("max_alpha", 1.0)) 34 | self.gi_num_iter = int(gi_config.get("num_iter", 16)) 35 | self.gi_max_oorr = float(gi_config.get("max_oorr", 0.5)) 36 | 37 | # rms for estimated alpha-policy performance; 38 | self.est_alpha_performace_rms = RunningMeanStd() 39 | 40 | def use_analytic_grads(self): 41 | 42 | return True 43 | 44 | def use_ppo(self): 45 | 46 | return True 47 | 48 | def get_optimizers_state(self): 49 | state = super().get_optimizers_state() 50 | state['actor_no_ppo'] = self.actor_optimizer_no_ppo.state_dict() 51 | 52 | return state 53 | 54 | def train_actor_critic_no_ppo(self): 55 | 56 | ''' 57 | Set learning rate. 58 | ''' 59 | # set learning rate; 60 | # do not change actor learning rate; 61 | # @TODO: too messy code, and errorneous; 62 | actor_lr_no_ppo = self.actor_lr_no_ppo 63 | critic_lr = self.critic_lr 64 | if self.lr_schedule == 'linear': 65 | critic_lr = (1e-5 - self.critic_lr) * float(self.epoch_num / self.max_epochs) + self.critic_lr 66 | 67 | for param_group in self.actor_optimizer_no_ppo.param_groups: 68 | param_group['lr'] = actor_lr_no_ppo 69 | for param_group in self.critic_optimizer.param_groups: 70 | param_group['lr'] = critic_lr 71 | 72 | self.writer.add_scalar("info/actor_lr_no_ppo", self.actor_lr_no_ppo, self.epoch_num) 73 | self.writer.add_scalar("info/critic_lr", critic_lr, self.epoch_num) 74 | self.writer.add_scalar("gi_info/alpha", self.gi_alpha, self.epoch_num) 75 | 76 | return super().train_actor_critic_no_ppo() 77 | 78 | def train_actor_no_ppo(self, 79 | grad_start: th.Tensor, 80 | grad_obses: List[th.Tensor], 81 | grad_rp_eps: List[th.Tensor], 82 | grad_actions: List[th.Tensor], 83 | grad_values: List[th.Tensor], 84 | grad_next_values: List[th.Tensor], 85 | grad_rewards: List[th.Tensor], 86 | grad_fdones: List[th.Tensor], 87 | last_fdones: th.Tensor): 88 | 89 | # compute advantages; 90 | curr_grad_advs = self.grad_advantages(self.tau, 91 | grad_values, 92 | grad_next_values, 93 | grad_rewards, 94 | grad_fdones, 95 | last_fdones) 96 | 97 | # compute gradients of advantages w.r.t. 
actions; 98 | t_adv_gradient = self.differentiate_grad_advantages(grad_actions, 99 | curr_grad_advs, 100 | grad_start, 101 | False) 102 | 103 | with th.no_grad(): 104 | 105 | t_obses = swap_and_flatten01(th.stack(grad_obses, dim=0)) 106 | t_rp_eps = swap_and_flatten01(th.stack(grad_rp_eps, dim=0)) 107 | 108 | t_advantages = swap_and_flatten01(th.stack(curr_grad_advs, dim=0)) 109 | t_actions = swap_and_flatten01(th.stack(grad_actions, dim=0)) 110 | t_adv_gradient = swap_and_flatten01(t_adv_gradient) 111 | t_alpha_actions = t_actions + self.gi_alpha * t_adv_gradient 112 | 113 | # write log about variance; 114 | # advantage variance; 115 | t_advantages_var = th.var(t_advantages, dim=0) 116 | t_adv_gradient_cov = th.cov(t_adv_gradient.transpose(0, 1)) 117 | if t_adv_gradient_cov.ndim == 0: 118 | t_adv_gradient_cov = t_adv_gradient_cov.unsqueeze(0).unsqueeze(0) 119 | t_adv_gradient_var = t_adv_gradient_cov.diagonal(0).sum() 120 | 121 | self.writer.add_scalar("gi_info/advantage_variance", t_advantages_var, self.epoch_num) 122 | self.writer.add_scalar("gi_info/advantage_gradient_variance", t_adv_gradient_var, self.epoch_num) 123 | 124 | # backup actor before actor update; 125 | self.backup_actor() 126 | 127 | ''' 128 | Update policy to alpha-policy. 129 | ''' 130 | for i in range(self.max_optim_iter): 131 | 132 | actor_loss_0 = None 133 | actor_loss_1 = None 134 | 135 | for j in range(self.gi_num_iter): 136 | 137 | _, mu, std, _ = self.actor.forward_with_dist(t_obses) 138 | 139 | distr = Normal(mu, std) 140 | rpeps_actions = distr.eps_to_action(t_rp_eps) 141 | 142 | actor_loss = (rpeps_actions - t_alpha_actions) * (rpeps_actions - t_alpha_actions) 143 | actor_loss = th.sum(actor_loss, dim=-1) 144 | actor_loss = actor_loss.mean() 145 | 146 | # update actor; 147 | self.actor_optimizer_no_ppo.zero_grad() 148 | actor_loss.backward() 149 | if self.truncate_grads: 150 | th.nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_norm) 151 | self.actor_optimizer_no_ppo.step() 152 | 153 | if j == 0: 154 | actor_loss_0 = actor_loss.detach().cpu().item() 155 | elif j == self.gi_num_iter - 1: 156 | actor_loss_1 = actor_loss.detach().cpu().item() 157 | 158 | log_actor_loss_0 = np.log(actor_loss_0) 159 | log_actor_loss_1 = np.log(actor_loss_1) 160 | actor_loss_ratio = np.exp(log_actor_loss_1 - log_actor_loss_0) 161 | 162 | if actor_loss_0 < actor_loss_1: 163 | 164 | with th.no_grad(): 165 | 166 | # if optimization did not work well, restore original 167 | # policy, decrease learning rate and try again; 168 | self.restore_actor() 169 | 170 | for param in self.actor_optimizer_no_ppo.param_groups: 171 | param['lr'] /= self.learning_rate_multiplier 172 | 173 | continue 174 | 175 | else: 176 | 177 | # @TODO: errorneous code, put here for fair reproducibility; 178 | for param in self.actor_optimizer_no_ppo.param_groups: 179 | param['lr'] = self.actor_lr_no_ppo 180 | 181 | break 182 | 183 | self.writer.add_scalar("gi_info/actor_loss_ratio", actor_loss_ratio, self.epoch_num) 184 | 185 | did_converge = actor_loss_0 > actor_loss_1 186 | 187 | ''' 188 | Estimate determinant of (I + alpha * advantage Hessian) 189 | and use it to safely bound alpha. 
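        The estimate relies on the reparameterized action map
        a(eps) = mu + sigma * eps, whose Jacobian with respect to eps is
        diag(sigma). If the regression above converged, the updated map is
        approximately eps -> a_old(eps) + alpha * dA/da evaluated at
        a_old(eps), so by the chain rule

            J_post ~= (I + alpha * H) @ J_pre,
            det(I + alpha * H) ~= exp(logdet(J_post) - logdet(J_pre)),

        where H is the Hessian of the advantage with respect to the action.
        This is the quantity computed below and kept inside
        [1 - alpha_interval, 1 + alpha_interval] when adapting alpha.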
190 | ''' 191 | with th.no_grad(): 192 | 193 | old_mu, old_std = self.experience_buffer.tensor_dict['mus'], \ 194 | self.experience_buffer.tensor_dict['sigmas'] 195 | 196 | old_mu, old_std = swap_and_flatten01(old_mu), swap_and_flatten01(old_std) 197 | 198 | _, new_mu, new_std, _ = self.actor.forward_with_dist(t_obses) 199 | 200 | preupdate_action_eps_jac = self.action_eps_jacobian(old_mu, old_std, t_rp_eps) 201 | postupdate_action_eps_jac = self.action_eps_jacobian(new_mu, new_std, t_rp_eps) 202 | 203 | preupdate_action_eps_jacdet = th.logdet(preupdate_action_eps_jac) 204 | postupdate_action_eps_jacdet = th.logdet(postupdate_action_eps_jac) 205 | 206 | est_hessian_logdet = postupdate_action_eps_jacdet - preupdate_action_eps_jacdet 207 | est_hessian_det = th.exp(est_hessian_logdet) 208 | 209 | mean_est_hessian_det = th.mean(est_hessian_det) 210 | min_est_hessian_det = th.min(est_hessian_det) 211 | max_est_hessian_det = th.max(est_hessian_det) 212 | 213 | self.writer.add_scalar("gi_info/mean_est_hessian_det", mean_est_hessian_det, self.epoch_num) 214 | self.writer.add_scalar("gi_info/min_est_hessian_det", min_est_hessian_det, self.epoch_num) 215 | self.writer.add_scalar("gi_info/max_est_hessian_det", max_est_hessian_det, self.epoch_num) 216 | 217 | ''' 218 | Update alpha and actor learning rate for next iteration. 219 | ''' 220 | curr_alpha = self.gi_alpha 221 | curr_actor_lr_no_ppo = self.actor_lr_no_ppo 222 | 223 | next_alpha = curr_alpha 224 | next_actor_lr_no_ppo = curr_actor_lr_no_ppo 225 | 226 | # we have to keep [est_hessian_det] in this range; 227 | min_safe_interval = (1. - self.gi_alpha_interval) 228 | max_safe_interval = (1. + self.gi_alpha_interval) 229 | 230 | if not did_converge: 231 | # alpha does not change, only decrease actor learning rate; 232 | next_actor_lr_no_ppo = curr_actor_lr_no_ppo / self.learning_rate_multiplier 233 | else: 234 | # actor_lr does not change, only change alpha; 235 | if min_est_hessian_det < min_safe_interval or \ 236 | max_est_hessian_det > max_safe_interval: 237 | next_alpha = curr_alpha / self.gi_alpha_update_factor 238 | else: 239 | next_alpha = curr_alpha * self.gi_alpha_update_factor 240 | 241 | next_alpha = np.clip(next_alpha, None, self.gi_max_alpha) 242 | next_actor_lr_no_ppo = np.clip(next_actor_lr_no_ppo, 1e-5, None) 243 | 244 | ''' 245 | Observe how much alpha-policy is different from the original 246 | policy, and then adjust [next_alpha] accordingly. 247 | ''' 248 | next_alpha = self.adjust_next_alpha_by_policy_diff(next_alpha, curr_grad_advs) 249 | 250 | self.gi_alpha = next_alpha 251 | self.actor_lr_no_ppo = next_actor_lr_no_ppo 252 | 253 | return 254 | 255 | def differentiate_grad_advantages(self, 256 | grad_actions: th.Tensor, 257 | grad_advs: th.Tensor, 258 | grad_start: th.Tensor, 259 | debug=False): 260 | 261 | ''' 262 | Compute first-order gradients of [grad_advs] w.r.t. 263 | [grad_actions] using automatic differentiation. 264 | ''' 265 | 266 | num_timestep = grad_start.shape[0] 267 | num_actor = grad_start.shape[1] 268 | 269 | ''' 270 | Using GAE, we can compute gradient of [grad_advs] at each 271 | time step by only backpropagating once for the first time 272 | step of a trajectory. 
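        Concretely, GAE gives A_0 = sum_t (gamma * tau)^t * delta_t and
        A_t = sum_k (gamma * tau)^k * delta_(t+k), so
        A_0 = (terms that do not depend on a_t) + (gamma * tau)^t * A_t.
        Backpropagating only the first-step advantages therefore yields
        dA_0/da_t = (gamma * tau)^t * dA_t/da_t, and the per-step gradient is
        recovered below by rescaling with the running factor
        [cv] = (1 / (gamma * tau))^t, reset to 1 whenever a new trajectory
        starts.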
273 | ''' 274 | adv_sum: th.Tensor = self.grad_advantages_first_terms_sum(grad_advs, grad_start) 275 | for ga in grad_actions: 276 | ga.retain_grad() 277 | adv_sum.backward(retain_graph=debug) 278 | adv_gradient = [] 279 | for ga in grad_actions: 280 | adv_gradient.append(ga.grad) 281 | adv_gradient = th.stack(adv_gradient) 282 | 283 | # reweight gradients, so that we get correct gradients 284 | # for each time step; 285 | with th.no_grad(): 286 | 287 | c = (1.0 / (self.gamma * self.tau)) 288 | cv = th.ones((num_actor, 1), device=adv_gradient.device) 289 | 290 | for nt in range(num_timestep): 291 | 292 | # if new episode has been started, set [cv] to 1; 293 | for na in range(num_actor): 294 | if grad_start[nt, na]: 295 | cv[na, 0] = 1.0 296 | 297 | adv_gradient[nt] = adv_gradient[nt] * cv 298 | cv = cv * c 299 | 300 | if debug: 301 | 302 | ''' 303 | Compute gradients of [grad_advs] at each time step 304 | in brute force and compare it with the above computation 305 | results, which is more efficient than this. 306 | ''' 307 | for i in range(num_timestep): 308 | 309 | debug_adv_sum = grad_advs[i].sum() 310 | 311 | debug_grad_adv_gradient = th.autograd.grad(debug_adv_sum, grad_actions[i], retain_graph=True)[0] 312 | debug_grad_adv_gradient_norm = th.norm(debug_grad_adv_gradient, p=2, dim=-1) 313 | 314 | debug_grad_error = th.norm(debug_grad_adv_gradient - adv_gradient[i], p=2, dim=-1) 315 | debug_grad_error_ratio = debug_grad_error / debug_grad_adv_gradient_norm 316 | 317 | assert th.all(debug_grad_error_ratio < 0.01), \ 318 | "Gradient of advantage possibly wrong" 319 | 320 | adv_gradient = adv_gradient.detach() 321 | 322 | return adv_gradient 323 | 324 | def action_eps_jacobian(self, mu, sigma, eps): 325 | 326 | ''' 327 | Assume action is computed as: 328 | a = mu + sigma * eps, 329 | where mu, sigma, and eps are all one-dim tensors. 330 | ''' 331 | 332 | jacobian = th.zeros((eps.shape[0], eps.shape[1], eps.shape[1])) 333 | 334 | for d in range(eps.shape[1]): 335 | 336 | if sigma.ndim == 1: 337 | jacobian[:, d, d] = sigma[d].detach() 338 | elif sigma.ndim == 2: 339 | jacobian[:, d, d] = sigma[:, d].detach() 340 | 341 | return jacobian 342 | 343 | @th.no_grad() 344 | def adjust_next_alpha_by_policy_diff(self, next_alpha, grad_advs): 345 | ''' 346 | Observe how much alpha-policy is different from the original 347 | policy, and then adjust [next_alpha] accordingly. 348 | 349 | If alpha-policy is too far away from the original policy, 350 | decrease [next_alpha], so that there is some room for PPO 351 | optimization. 352 | ''' 353 | obses = swap_and_flatten01(self.experience_buffer.tensor_dict['obses'].detach()) 354 | neglogpacs = swap_and_flatten01(self.experience_buffer.tensor_dict['neglogpacs'].detach()) 355 | actions = swap_and_flatten01(self.experience_buffer.tensor_dict['actions'].detach()) 356 | advantages = swap_and_flatten01(th.cat(grad_advs, dim=0).detach()) 357 | 358 | n_mus, n_sigmas = self.actor.forward_dist(obses) 359 | if n_sigmas.ndim == 1: 360 | n_sigmas = n_sigmas.unsqueeze(0) 361 | n_sigmas = n_sigmas.expand(n_mus.shape[0], -1).clone() 362 | 363 | n_neglogpacs = self.neglogp(actions, n_mus, n_sigmas, th.log(n_sigmas)) 364 | 365 | ''' 366 | Estimate difference between alpha-policy and original policy 367 | using out-of-range-ratio. 368 | ''' 369 | pac_ratio = th.exp(th.clamp(neglogpacs - n_neglogpacs, max=16.)) # prevent [inf]; 370 | out_of_range_pac_ratio = th.logical_or(pac_ratio < (1. - self.e_clip), 371 | pac_ratio > (1. 
+ self.e_clip)) 372 | out_of_range_pac_ratio = th.count_nonzero(out_of_range_pac_ratio) / actions.shape[0] 373 | 374 | self.writer.add_scalar("gi_info/out_of_range_ratio", out_of_range_pac_ratio, self.epoch_num) 375 | 376 | ''' 377 | Evaluate the bias of analytical gradients by estimating the 378 | performance of alpha-policy in terms of PPO. 379 | ''' 380 | est_alpha_performance = \ 381 | th.sum(advantages * pac_ratio) - \ 382 | th.sum(advantages) 383 | 384 | # @TODO: Ugly approach to prevent overly noisy [est_alpha_performance]; 385 | n_est_alpha_performance = self.est_alpha_performace_rms.normalize(est_alpha_performance) 386 | self.est_alpha_performace_rms.update(est_alpha_performance.unsqueeze(0)) 387 | 388 | self.writer.add_scalar("gi_info/est_alpha_performance", est_alpha_performance, self.epoch_num) 389 | self.writer.add_scalar("gi_info/est_alpha_performance_normalized", n_est_alpha_performance, self.epoch_num) 390 | 391 | ''' 392 | In following conditions, decrease [next_alpha]: 393 | 1. [out_of_range_pac_ratio] is too high (guarantee PPO update); 394 | 2. [est_alpha_performance] is negative (biased analytical grads); 395 | ''' 396 | if out_of_range_pac_ratio > self.gi_max_oorr or \ 397 | (est_alpha_performance < 0 and n_est_alpha_performance < -1.): 398 | 399 | next_alpha = self.gi_alpha / self.gi_alpha_update_factor 400 | 401 | next_alpha = np.clip(next_alpha, None, self.gi_max_alpha) 402 | return next_alpha 403 | 404 | def prepare_dataset(self, batch_dict): 405 | 406 | super().prepare_dataset(batch_dict) 407 | 408 | ''' 409 | Since policy could have been updated to alpha policy, 410 | change [mu], [sigma], and [logp_actions] accordingly. 411 | ''' 412 | obses = batch_dict['obses'] 413 | actions = batch_dict['actions'] 414 | 415 | with th.no_grad(): 416 | n_mus, n_sigmas = self.actor.forward_dist(obses) 417 | if n_sigmas.ndim == 1: 418 | n_sigmas = n_sigmas.unsqueeze(0) 419 | n_sigmas = n_sigmas.expand(n_mus.shape[0], -1).clone() 420 | n_neglogpacs = self.neglogp(actions, n_mus, n_sigmas, th.log(n_sigmas)) 421 | 422 | self.dataset.values_dict['mu'] = n_mus 423 | self.dataset.values_dict['sigma'] = n_sigmas 424 | self.dataset.values_dict['logp_actions'] = n_neglogpacs 425 | 426 | def calc_gradients(self, input_dict): 427 | 428 | advantage = input_dict['advantages'] 429 | actions_batch = input_dict['actions'] 430 | obs_batch = input_dict['obs'] 431 | 432 | old_action_log_probs_batch_before_alpha = input_dict['old_logp_actions'] # action log probs before alpha update; 433 | old_action_log_probs_batch_after_alpha = input_dict['logp_actions'] # action log probs after alpha update; 434 | 435 | curr_e_clip = self.e_clip 436 | 437 | # get current policy's actions; 438 | curr_mu, curr_std = self.actor.forward_dist(obs_batch) 439 | if curr_std.ndim == 1: 440 | curr_std = curr_std.unsqueeze(0) 441 | curr_std = curr_std.expand(curr_mu.shape[0], -1).clone() 442 | neglogp = self.neglogp(actions_batch, curr_mu, curr_std, th.log(curr_std)) 443 | 444 | a_loss = self.actor_loss(old_action_log_probs_batch_before_alpha, 445 | old_action_log_probs_batch_after_alpha, 446 | neglogp, 447 | advantage, 448 | curr_e_clip).mean() 449 | 450 | # we only use actor loss here for fair comparison; 451 | loss = a_loss 452 | 453 | self.actor_optimizer.zero_grad() 454 | loss.backward() 455 | if self.truncate_grads: 456 | th.nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_norm) 457 | self.actor_optimizer.step() 458 | 459 | self.train_result = (a_loss, curr_mu.detach(), curr_std.detach()) 460 | 461 | 
return self.train_result 462 | 463 | def actor_loss(self, 464 | old_action_log_probs_batch_before_alpha, 465 | old_action_log_probs_batch_after_alpha, 466 | action_log_probs, 467 | advantage, 468 | curr_e_clip): 469 | 470 | t_ratio = old_action_log_probs_batch_before_alpha - \ 471 | old_action_log_probs_batch_after_alpha 472 | 473 | if th.any(th.abs(t_ratio) > 4.): 474 | # ratio can be numerically unstable, just use original ppo; 475 | # but use policy after RP update as importance sampling distribution; 476 | ratio = old_action_log_probs_batch_after_alpha - action_log_probs 477 | else: 478 | t_ratio = th.exp(t_ratio) 479 | tmp0 = th.log(t_ratio + 1.) 480 | tmp1 = tmp0 - old_action_log_probs_batch_before_alpha 481 | action_log_probs_batch_mid = np.log(2.) - tmp1 482 | 483 | ratio = action_log_probs_batch_mid - action_log_probs 484 | 485 | ratio = th.clamp(ratio, min=-16., max=16.) # prevent ratio becoming [inf]; 486 | ratio = th.exp(ratio) 487 | 488 | surr1 = advantage * ratio 489 | surr2 = advantage * th.clamp(ratio, 1.0 - curr_e_clip, 490 | 1.0 + curr_e_clip) 491 | a_loss = th.max(-surr1, -surr2) 492 | 493 | return a_loss -------------------------------------------------------------------------------- /src/gippo/rl_algorithm/base.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Modified from 3 | https://github.com/Denys88/rl_games/blob/master/rl_games/algos_torch/a2c_continuous.py 4 | ''' 5 | 6 | import numpy as np 7 | import torch as th 8 | from torch import nn 9 | from typing import List 10 | 11 | import time 12 | import gym 13 | import copy 14 | import os 15 | 16 | from gippo.utils import RunningMeanStd, AverageMeter, swap_and_flatten01 17 | from gippo.vecenv import create_vecenv 18 | from gippo.network import ActorStochasticMLP, CriticMLP 19 | from gippo.experience import ExperienceBuffer 20 | from gippo.dataset import CriticDataset 21 | 22 | from torch.utils.tensorboard import SummaryWriter 23 | 24 | save_distribution = False 25 | 26 | class RLAlgorithm: 27 | def __init__(self, 28 | config, 29 | env_config, 30 | device="cpu", 31 | log_path=None): 32 | 33 | ''' 34 | Basic configs 35 | ''' 36 | self.config = config 37 | self.device = device 38 | 39 | # logging; 40 | self.log_path = log_path 41 | if self.log_path is None: 42 | self.log_path = f'./logdir/{time.strftime("%Y-%m-%d_%H-%M-%S")}' 43 | self.nn_dir = os.path.join(self.log_path, 'nn') 44 | self.summaries_dir = os.path.join(self.log_path, 'runs') 45 | 46 | os.makedirs(self.log_path, exist_ok=True) 47 | os.makedirs(self.nn_dir, exist_ok=True) 48 | os.makedirs(self.summaries_dir, exist_ok=True) 49 | 50 | self.writer = SummaryWriter(self.summaries_dir) 51 | self.save_freq = config.get('save_frequency', 0) 52 | self.save_best_after = config.get('save_best_after', 100) 53 | self.print_stats = config.get('print_stats', True) 54 | 55 | # experience buffer size that we are going to use for training; 56 | self.horizon_length = config.get('horizon_length', 32) 57 | self.num_actors = config.get('num_actors', 1) 58 | self.num_agents = config.get('num_agents', 1) 59 | self.batch_size = self.horizon_length * self.num_actors * self.num_agents 60 | self.batch_size_envs = self.horizon_length * self.num_actors 61 | 62 | # env configs; 63 | self.env_name = env_config['name'] 64 | self.env_config = env_config.get('config', {}) 65 | self.env_config['device'] = self.device 66 | self.env_config['no_grad'] = not self.use_analytic_grads() 67 | self.vec_env = create_vecenv( 68 | self.env_name, 69 | 
self.num_actors, 70 | **self.env_config) 71 | self.env_info = self.vec_env.get_env_info() 72 | 73 | self.value_size = self.env_info.get('value_size', 1) 74 | 75 | # reshaper and normalization; 76 | self.rewards_shaper = config.get("rewards_shaper", None) 77 | self.normalize_input = config.get("normalize_input", False) 78 | self.normalize_value = config.get("normalize_value", False) 79 | if self.normalize_value: 80 | self.value_mean_std = RunningMeanStd((1,)).to(self.device) 81 | self.normalize_advantage = config.get("normalize_advantage", False) 82 | 83 | # observation; 84 | self.observation_space = self.env_info['observation_space'] 85 | self.obs_shape = self.observation_space.shape 86 | self.obs = None 87 | 88 | # running stats; 89 | self.frame = 0 90 | self.update_time = 0 91 | self.mean_rewards = self.last_mean_rewards = -100500 92 | self.play_time = 0 93 | self.epoch_num = 0 94 | 95 | # training; 96 | self.max_epochs = self.config.get('max_epochs', 1e6) 97 | self.network = config.get("network", None) 98 | 99 | ''' 100 | Our work solves stochastic optimization problem in differentiable environment. 101 | ''' 102 | num_obs = self.obs_shape[0] 103 | num_actions = self.env_info['action_space'].shape[0] 104 | 105 | self.actor = ActorStochasticMLP(num_obs, 106 | num_actions, 107 | config['network'], 108 | device=self.device) 109 | 110 | self.critic = CriticMLP(num_obs, 111 | config['network'], 112 | device=self.device) 113 | 114 | self.target_critic = copy.deepcopy(self.critic) 115 | self.target_critic_alpha = config.get('target_critic_alpha', 0.4) 116 | 117 | self.all_params = list(self.actor.parameters()) + list(self.critic.parameters()) 118 | 119 | ''' 120 | Optimizers 121 | ''' 122 | 123 | # critic; 124 | self.critic_lr = float(config["critic_learning_rate"]) 125 | self.critic_optimizer = th.optim.Adam( 126 | self.critic.parameters(), 127 | betas = config['betas'], 128 | lr = self.critic_lr 129 | ) 130 | self.critic_iterations = config["critic_iterations"] 131 | self.critic_num_batch = config["critic_num_batch"] 132 | 133 | # misc; 134 | self.truncate_grads = config["truncate_grads"] 135 | self.grad_norm = config["grad_norm"] 136 | 137 | # learning rate scheduler; 138 | self.lr_schedule = config['lr_schedule'] 139 | 140 | # change to proper running mean std for backpropagation; 141 | if self.normalize_input: 142 | if isinstance(self.observation_space, gym.spaces.Dict): 143 | raise NotImplementedError() 144 | else: 145 | self.obs_rms = RunningMeanStd(shape=self.obs_shape, device=self.device) 146 | 147 | if self.normalize_value: 148 | self.val_rms = RunningMeanStd(shape=(1,), device=self.device) 149 | 150 | # episode length; 151 | self.episode_max_length = self.vec_env.env.episode_length 152 | 153 | # statistics; 154 | self.games_to_track = 100 155 | self.game_rewards = AverageMeter(self.value_size, self.games_to_track).to(self.device) 156 | self.game_lengths = AverageMeter(1, self.games_to_track).to(self.device) 157 | 158 | # GAE params; 159 | self.gamma = config['gamma'] 160 | self.tau = config['tau'] 161 | 162 | def train(self): 163 | self.init_tensors() 164 | self.last_mean_rewards = -100500 165 | start_time = time.time() 166 | total_time = 0 167 | rep_count = 0 168 | self.obs = self.env_reset() 169 | self.curr_frames = self.batch_size_envs 170 | 171 | while True: 172 | epoch_num = self.update_epoch() 173 | 174 | step_time, no_ppo_time, ppo_time, sum_time, ppo_loss = \ 175 | self.train_epoch() 176 | 177 | total_time += sum_time 178 | frame = self.frame 179 | 180 | # cleaning memory to 
optimize space 181 | if self.use_ppo(): 182 | self.dataset.update_values_dict(None) 183 | 184 | print(f"Num steps: {frame + self.curr_frames}") 185 | 186 | # do we need scaled_time? 187 | scaled_time = sum_time #self.num_agents * sum_time 188 | scaled_no_ppo_time = no_ppo_time #self.num_agents * play_time 189 | curr_frames = self.curr_frames 190 | self.frame += curr_frames 191 | 192 | self.write_stats(total_time, 193 | epoch_num, 194 | step_time, 195 | no_ppo_time, 196 | ppo_time, 197 | ppo_loss, 198 | frame, 199 | scaled_time, 200 | scaled_no_ppo_time, 201 | curr_frames) 202 | 203 | mean_rewards = [0] 204 | mean_lengths = 0 205 | 206 | if self.game_rewards.current_size > 0: 207 | mean_rewards = self.game_rewards.get_mean() 208 | mean_lengths = self.game_lengths.get_mean() 209 | self.mean_rewards = mean_rewards[0] 210 | 211 | for i in range(self.value_size): 212 | rewards_name = 'rewards' if i == 0 else 'rewards{0}'.format(i) 213 | self.writer.add_scalar(rewards_name + '/step'.format(i), mean_rewards[i], frame) 214 | self.writer.add_scalar(rewards_name + '/iter'.format(i), mean_rewards[i], epoch_num) 215 | self.writer.add_scalar(rewards_name + '/time'.format(i), mean_rewards[i], total_time) 216 | 217 | self.writer.add_scalar('episode_lengths/step', mean_lengths, frame) 218 | self.writer.add_scalar('episode_lengths/iter', mean_lengths, epoch_num) 219 | self.writer.add_scalar('episode_lengths/time', mean_lengths, total_time) 220 | 221 | checkpoint_name = self.config['name'] + 'ep' + str(epoch_num) + 'rew' + str(mean_rewards) 222 | 223 | if self.save_freq > 0: 224 | if (epoch_num % self.save_freq == 0) and (mean_rewards[0] <= self.last_mean_rewards): 225 | self.save(os.path.join(self.nn_dir, 'last_' + checkpoint_name)) 226 | 227 | if mean_rewards[0] > self.last_mean_rewards and epoch_num >= self.save_best_after: 228 | print('saving next best rewards: ', mean_rewards) 229 | self.last_mean_rewards = mean_rewards[0] 230 | self.save(os.path.join(self.nn_dir, self.config['name'])) 231 | 232 | if epoch_num > self.max_epochs: 233 | self.save(os.path.join(self.nn_dir, 'last_' + self.config['name'] + 'ep' + str(epoch_num) + 'rew' + str(mean_rewards))) 234 | print('MAX EPOCHS NUM!') 235 | return self.last_mean_rewards, epoch_num 236 | 237 | update_time = 0 238 | if self.print_stats: 239 | fps_step = curr_frames / step_time 240 | # fps_step_inference = curr_frames / scaled_play_time 241 | fps_total = curr_frames / scaled_time 242 | # print(f'fps step: {fps_step:.1f} fps step and policy inference: {fps_step_inference:.1f} fps total: {fps_total:.1f} mean reward: {mean_rewards[0]:.2f} mean lengths: {mean_lengths:.1f}') 243 | print(f'epoch: {epoch_num} fps step: {fps_step:.1f} fps total: {fps_total:.1f} mean reward: {mean_rewards[0]:.2f} mean lengths: {mean_lengths:.1f}') 244 | 245 | def init_tensors(self): 246 | 247 | # use specialized experience buffer; 248 | batch_size = self.num_agents * self.num_actors 249 | 250 | algo_info = { 251 | 'num_actors' : self.num_actors, 252 | 'horizon_length' : self.horizon_length, 253 | } 254 | 255 | self.experience_buffer = ExperienceBuffer( 256 | self.env_info, 257 | algo_info, 258 | self.device 259 | ) 260 | 261 | current_rewards_shape = (batch_size, self.value_size) 262 | self.current_rewards = th.zeros(current_rewards_shape, dtype=th.float32, device=self.device) 263 | self.current_lengths = th.zeros(batch_size, dtype=th.float32, device=self.device) 264 | self.dones = th.ones((batch_size,), dtype=th.uint8, device=self.device) 265 | 266 | self.update_list = ['actions', 
'neglogpacs', 'values', 'mus', 'sigmas'] 267 | self.tensor_list = self.update_list + ['obses', 'states', 'dones', 'adv_grads'] 268 | 269 | def cast_obs(self, obs): 270 | if isinstance(obs, th.Tensor): 271 | self.is_tensor_obses = True 272 | elif isinstance(obs, np.ndarray): 273 | assert(obs.dtype != np.int8) 274 | if obs.dtype == np.uint8: 275 | obs = th.ByteTensor(obs).to(self.device) 276 | else: 277 | obs = th.FloatTensor(obs).to(self.device) 278 | return obs 279 | 280 | def obs_to_tensors(self, obs): 281 | obs_is_dict = isinstance(obs, dict) 282 | if obs_is_dict: 283 | raise NotImplementedError() 284 | else: 285 | upd_obs = self.cast_obs(obs) 286 | if not obs_is_dict or 'obs' not in obs: 287 | upd_obs = {'obs' : upd_obs} 288 | return upd_obs 289 | 290 | def env_reset(self): 291 | obs = self.vec_env.reset() 292 | obs = self.obs_to_tensors(obs) 293 | return obs 294 | 295 | def update_epoch(self): 296 | self.epoch_num += 1 297 | return self.epoch_num 298 | 299 | def train_epoch(self): 300 | 301 | self.vec_env.set_train_info(self.frame, self) 302 | 303 | no_ppo_time_start = time.time() 304 | 305 | # set learning rate; 306 | # if self.gi_lr_schedule == 'linear': 307 | # if self.gi_algorithm in ['shac-only', 'grad-ppo-shac', 'basic-lr', 'basic-rp', 'basic-combination']: 308 | # actor_lr = (1e-5 - self.actor_lr) * float(self.epoch_num / self.max_epochs) + self.actor_lr 309 | # else: 310 | # actor_lr = self.actor_lr 311 | # critic_lr = (1e-5 - self.critic_lr) * float(self.epoch_num / self.max_epochs) + self.critic_lr 312 | # else: 313 | # actor_lr = self.actor_lr 314 | # critic_lr = self.critic_lr 315 | 316 | # for param_group in self.actor_optimizer.param_groups: 317 | # param_group['lr'] = actor_lr 318 | # for param_group in self.critic_optimizer.param_groups: 319 | # param_group['lr'] = critic_lr 320 | 321 | # self.writer.add_scalar("info/gi_actor_lr", actor_lr, self.epoch_num) 322 | # self.writer.add_scalar("info/gi_critic_lr", critic_lr, self.epoch_num) 323 | 324 | # # rp actor lr and alpha; 325 | 326 | # self.writer.add_scalar("info_alpha/actor_lr", self.actor_lr, self.epoch_num) 327 | # self.writer.add_scalar("info_alpha/alpha", self.gi_curr_alpha, self.epoch_num) 328 | 329 | ''' 330 | Train actor critic using methods other than PPO. 331 | When we use PPO-based methods (PPO, GI-PPO), 332 | we additionally collect experience to use in PPO 333 | updates afterwards. 334 | ''' 335 | batch_dict = self.train_actor_critic_no_ppo() 336 | no_ppo_time_end = time.time() 337 | 338 | self.curr_frames = batch_dict.pop('played_frames') 339 | 340 | ''' 341 | Train actor using PPO-based algorithms using 342 | collected experience above. 
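When use_ppo() returns False, the PPO stage below is skipped and a
zero-valued placeholder loss is recorded instead.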
343 | ''' 344 | ppo_time_start = time.time() 345 | if self.use_ppo(): 346 | ppo_loss = self.train_actor_ppo(batch_dict) 347 | else: 348 | # placeholders; 349 | ppo_loss = [th.zeros((1,), dtype=th.float32, device=self.device)] 350 | ppo_time_end = time.time() 351 | 352 | no_ppo_time = no_ppo_time_end - no_ppo_time_start 353 | ppo_time = ppo_time_end - ppo_time_start 354 | total_time = ppo_time_end - no_ppo_time_start 355 | 356 | # update (rp) alpha and actor lr; 357 | 358 | # self.gi_curr_alpha = self.next_alpha 359 | # self.actor_lr = self.next_actor_lr 360 | 361 | return batch_dict['step_time'], \ 362 | no_ppo_time, \ 363 | ppo_time, \ 364 | total_time, \ 365 | ppo_loss 366 | 367 | def train_actor_critic_no_ppo(self): 368 | 369 | epinfos = [] 370 | update_list = self.update_list 371 | 372 | step_time = 0.0 373 | 374 | # indicator for steps that grad computation starts; 375 | grad_start = th.zeros_like(self.experience_buffer.tensor_dict['dones']) 376 | 377 | grad_obses = [] 378 | grad_values = [] 379 | grad_next_values = [] 380 | grad_actions = [] 381 | grad_rewards = [] 382 | grad_fdones = [] 383 | grad_rp_eps = [] 384 | 385 | # use frozen [obs_rms] and [value_rms] during this one function call; 386 | curr_obs_rms = None 387 | curr_val_rms = None 388 | if self.normalize_input: 389 | with th.no_grad(): 390 | curr_obs_rms = copy.deepcopy(self.obs_rms) 391 | if self.normalize_value: 392 | with th.no_grad(): 393 | curr_val_rms = copy.deepcopy(self.val_rms) 394 | 395 | # start with clean grads; 396 | self.obs = self.vec_env.env.initialize_trajectory() 397 | self.obs = self.obs_to_tensors(self.obs) 398 | grad_start[0, :] = 1.0 399 | 400 | for n in range(self.horizon_length): 401 | 402 | if n > 0: 403 | grad_start[n, :] = self.dones 404 | 405 | # get action for current observation; 406 | if self.use_analytic_grads(): 407 | res_dict = self.get_action_values( 408 | self.obs, 409 | curr_obs_rms, 410 | curr_val_rms 411 | ) 412 | else: 413 | with th.no_grad(): 414 | res_dict = self.get_action_values( 415 | self.obs, 416 | curr_obs_rms, 417 | curr_val_rms 418 | ) 419 | 420 | # we store tensor objects with gradients; 421 | grad_obses.append(res_dict['obs']) 422 | grad_values.append(res_dict['values']) 423 | grad_actions.append(res_dict['actions']) 424 | grad_fdones.append(self.dones.float()) 425 | grad_rp_eps.append(res_dict['rp_eps']) 426 | 427 | # [obs] is an observation of the current time step; 428 | # store processed obs, which might have been normalized already; 429 | self.experience_buffer.update_data('obses', n, res_dict['obs']) 430 | 431 | # [dones] indicate if this step is the start of a new episode; 432 | self.experience_buffer.update_data('dones', n, self.dones) 433 | 434 | for k in update_list: 435 | self.experience_buffer.update_data(k, n, res_dict[k]) 436 | 437 | # take action; 438 | step_time_start = time.time() 439 | actions = th.tanh(grad_actions[-1]) 440 | 441 | if self.use_analytic_grads(): 442 | self.obs, rewards, self.dones, infos = self.vec_env.step(actions) 443 | else: 444 | with th.no_grad(): 445 | self.obs, rewards, self.dones, infos = self.vec_env.step(actions) 446 | 447 | self.obs = self.obs_to_tensors(self.obs) 448 | rewards = rewards.unsqueeze(-1) 449 | step_time_end = time.time() 450 | step_time += (step_time_end - step_time_start) 451 | 452 | # compute value of next state; 453 | if self.use_analytic_grads(): 454 | next_obs = infos['obs_before_reset'] 455 | else: 456 | next_obs = self.obs['obs'] 457 | 458 | if self.normalize_input: 459 | # do not update rms here; 460 | 
next_obs = curr_obs_rms.normalize(next_obs) 461 | next_value = self.target_critic(next_obs) 462 | if self.normalize_value: 463 | next_value = curr_val_rms.normalize(next_value, True) 464 | 465 | # even though [next_value] can wrong when it is based on 466 | # a [next_obs] that is at the start of new episode, 467 | # we deal with it by making it zero when it was an early termination; 468 | grad_next_values.append(next_value) 469 | 470 | done_env_ids = self.dones.nonzero(as_tuple = False).squeeze(-1) 471 | for id in done_env_ids: 472 | if th.isnan(next_obs[id]).sum() > 0 \ 473 | or th.isinf(next_obs[id]).sum() > 0 \ 474 | or (th.abs(next_obs[id]) > 1e6).sum() > 0: # ugly fix for nan values 475 | grad_next_values[-1][id] = 0. 476 | elif self.current_lengths[id] < self.episode_max_length - 1: # early termination 477 | grad_next_values[-1][id] = 0. 478 | 479 | # add default reward; 480 | grad_rewards.append(rewards) 481 | 482 | # @TODO: do not use reward shaper for now; 483 | self.experience_buffer.update_data('rewards', n, rewards) 484 | 485 | self.current_rewards += rewards.detach() 486 | self.current_lengths += 1 487 | all_done_indices = self.dones.nonzero(as_tuple=False) 488 | done_indices = all_done_indices[::self.num_agents] 489 | 490 | self.game_rewards.update(self.current_rewards[done_indices]) 491 | self.game_lengths.update(self.current_lengths[done_indices]) 492 | 493 | not_dones = 1.0 - self.dones.float() 494 | 495 | self.current_rewards = self.current_rewards * not_dones.unsqueeze(1) 496 | self.current_lengths = self.current_lengths * not_dones 497 | 498 | ''' 499 | Update actor and critic networks (but no PPO yet). 500 | 501 | Actor update differs between different algorithms, 502 | but critic update is shared between all algorithms. 503 | ''' 504 | 505 | # start and end of current subsequence; 506 | last_fdones = self.dones.float() 507 | 508 | self.train_actor_no_ppo(grad_start, 509 | grad_obses, 510 | grad_rp_eps, 511 | grad_actions, 512 | grad_values, 513 | grad_next_values, 514 | grad_rewards, 515 | grad_fdones, 516 | last_fdones) 517 | 518 | grad_advs = \ 519 | self.train_critic(grad_obses, 520 | grad_actions, 521 | grad_values, 522 | grad_next_values, 523 | grad_rewards, 524 | grad_fdones, 525 | last_fdones) 526 | 527 | self.update_target_critic() 528 | self.clear_experience_buffer_grads() 529 | 530 | # sort out [batch_dict]; 531 | with th.no_grad(): 532 | 533 | batch_dict = self.experience_buffer.get_transformed_list(swap_and_flatten01, self.tensor_list) 534 | 535 | for i in range(len(grad_advs)): 536 | grad_advs[i] = grad_advs[i].unsqueeze(0) 537 | batch_dict['advantages'] = swap_and_flatten01(th.cat(grad_advs, dim=0).detach()) 538 | batch_dict['played_frames'] = self.batch_size 539 | batch_dict['step_time'] = step_time 540 | 541 | return batch_dict 542 | 543 | def use_analytic_grads(self): 544 | ''' 545 | Whether current RL algorithm requires analytic gradients 546 | from differentiable environment. 547 | ''' 548 | raise NotImplementedError() 549 | 550 | def neglogp(self, x, mean, std, logstd): 551 | ''' 552 | Negative log probability of a batch of actions under a Gaussian policy. 
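Concretely, for a diagonal Gaussian with mean [mean], standard deviation
[std] = exp([logstd]) and action dimension d, the returned value is
    0.5 * sum(((x - mean) / std) ** 2) + 0.5 * d * log(2 * pi) + sum(logstd),
with the sums taken over the action dimension, so the output has shape
(batch_size,).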
553 | ''' 554 | 555 | assert x.ndim == 2 and mean.ndim == 2 and std.ndim == 2 and logstd.ndim == 2, "" 556 | # assert x.shape[0] == mean.shape[0] and x.shape[0] == std.shape[0] and x.shape[0] == logstd.shape[0], "" 557 | 558 | return 0.5 * (((x - mean) / std)**2).sum(dim=-1) \ 559 | + 0.5 * np.log(2.0 * np.pi) * x.size()[-1] \ 560 | + logstd.sum(dim=-1) 561 | 562 | def get_action_values(self, 563 | obs, 564 | obs_rms: RunningMeanStd, 565 | val_rms: RunningMeanStd): 566 | 567 | # normalize input if needed, we update rms only here; 568 | processed_obs = obs['obs'] 569 | if self.normalize_input: 570 | # update rms; 571 | with th.no_grad(): 572 | self.obs_rms.update(processed_obs) 573 | processed_obs = obs_rms.normalize(processed_obs) 574 | 575 | # [std] is a vector of length [action_dim], which is shared by all the envs; 576 | actions, mu, std, eps = self.actor.forward_with_dist(processed_obs, deterministic=False) 577 | if std.ndim == 1: 578 | std = std.unsqueeze(0) 579 | std = std.expand(mu.shape[0], -1).clone() # make size of [std] same as [actions] and [mu]; 580 | neglogp = self.neglogp(actions, mu, std, th.log(std)) 581 | 582 | # self.target_critic.eval() 583 | values = self.target_critic(processed_obs) 584 | 585 | # if using normalize value, target_critic learns to give normalized state values; 586 | # therefore, unnormalize the resulting value; 587 | if self.normalize_value: 588 | values = val_rms.normalize(values, True) 589 | 590 | res_dict = { 591 | "obs": processed_obs, 592 | "actions": actions, 593 | "mus": mu, 594 | "sigmas": std, 595 | "neglogpacs": neglogp, 596 | "values": values, 597 | "rnn_states": None, 598 | 'rp_eps': eps, 599 | } 600 | 601 | return res_dict 602 | 603 | def train_actor_no_ppo(self, 604 | grad_start: th.Tensor, 605 | grad_obses: List[th.Tensor], 606 | grad_rp_eps: List[th.Tensor], 607 | grad_actions: List[th.Tensor], 608 | grad_values: List[th.Tensor], 609 | grad_next_values: List[th.Tensor], 610 | grad_rewards: List[th.Tensor], 611 | grad_fdones: List[th.Tensor], 612 | last_fdones: th.Tensor): 613 | 614 | ''' 615 | Train actor based on other methods than PPO with current experience. 
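The [grad_*] arguments hold the rollout tensors collected above; when
use_analytic_grads() is True they still carry the computation graph of the
differentiable environment. The base class only defines the interface and
raises NotImplementedError, so each concrete algorithm provides its own
implementation.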
616 | ''' 617 | 618 | raise NotImplementedError() 619 | 620 | def train_critic(self, 621 | grad_obses: List[th.Tensor], 622 | grad_actions: List[th.Tensor], 623 | grad_values: List[th.Tensor], 624 | grad_next_values: List[th.Tensor], 625 | grad_rewards: List[th.Tensor], 626 | grad_fdones: List[th.Tensor], 627 | last_fdones: th.Tensor): 628 | 629 | with th.no_grad(): 630 | 631 | # compute advantage and add it to state value to get target values; 632 | curr_grad_advs = self.grad_advantages(self.tau, 633 | grad_values, 634 | grad_next_values, 635 | grad_rewards, 636 | grad_fdones, 637 | last_fdones) 638 | grad_advs = curr_grad_advs 639 | 640 | target_values = [] 641 | for i in range(len(curr_grad_advs)): 642 | target_values.append(curr_grad_advs[i] + grad_values[i]) 643 | 644 | th_obs = th.cat(grad_obses, dim=0) 645 | th_target_values = th.cat(target_values, dim=0) 646 | 647 | # update value rms here once; 648 | if self.normalize_value: 649 | self.val_rms.update(th_target_values) 650 | 651 | batch_size = len(th_target_values) // self.critic_num_batch 652 | critic_dataset = CriticDataset(batch_size, th_obs, th_target_values) 653 | 654 | self.critic.train() 655 | critic_loss = 0 656 | for j in range(self.critic_iterations): 657 | 658 | total_critic_loss = 0 659 | batch_cnt = 0 660 | 661 | for i in range(len(critic_dataset)): 662 | 663 | batch_sample = critic_dataset[i] 664 | self.critic_optimizer.zero_grad() 665 | 666 | predicted_values = self.critic(batch_sample['obs']).squeeze(-1) 667 | if self.normalize_value: 668 | # predicted_values = curr_val_rms.normalize(predicted_values, True) 669 | predicted_values = self.val_rms.normalize(predicted_values, True) 670 | 671 | target_values = batch_sample['target_values'] 672 | training_critic_loss = th.mean((predicted_values - target_values) ** 2, dim=0) 673 | training_critic_loss.backward() 674 | 675 | # ugly fix for simulation nan problem 676 | for params in self.critic.parameters(): 677 | params.grad.nan_to_num_(0.0, 0.0, 0.0) 678 | 679 | if self.truncate_grads: 680 | nn.utils.clip_grad_norm_(self.critic.parameters(), self.grad_norm) 681 | 682 | self.critic_optimizer.step() 683 | 684 | total_critic_loss += training_critic_loss 685 | batch_cnt += 1 686 | 687 | # critic_loss = (total_critic_loss / batch_cnt).detach().cpu().item() 688 | # if self.print_stats: 689 | # print('value iter {}/{}, loss = {:7.6f}'.format(j + 1, self.critic_iterations, critic_loss), end='\r') 690 | 691 | return grad_advs 692 | 693 | def update_target_critic(self): 694 | with th.no_grad(): 695 | alpha = self.target_critic_alpha 696 | for param, param_targ in zip(self.critic.parameters(), self.target_critic.parameters()): 697 | param_targ.data.mul_(alpha) 698 | param_targ.data.add_((1. 
- alpha) * param.data) 699 | 700 | # def get_critic_values(self, obs, use_target_critic: bool, obs_rms_train: bool): 701 | 702 | # if use_target_critic: 703 | # critic = self.target_critic 704 | # # critic.eval() 705 | # else: 706 | # critic = self.critic 707 | 708 | # if self.normalize_input: 709 | 710 | # if obs_rms_train: 711 | # self.running_mean_std.train() 712 | # else: 713 | # self.running_mean_std.eval() 714 | 715 | # processed_obs = self._preproc_obs(obs) 716 | # values = critic(processed_obs) 717 | 718 | # if self.normalize_value: 719 | # values = self.value_mean_std(values, True) 720 | 721 | # return values 722 | 723 | def grad_advantages(self, gae_tau, mb_extrinsic_values, mb_next_extrinsic_values, mb_rewards, mb_fdones, last_fdones): 724 | 725 | num_step = len(mb_extrinsic_values) 726 | mb_advs = [] 727 | 728 | # GAE; 729 | lastgaelam = 0 730 | for t in reversed(range(num_step)): 731 | if t == num_step - 1: 732 | nextnonterminal = 1.0 - last_fdones 733 | else: 734 | nextnonterminal = 1.0 - mb_fdones[t+1] 735 | nextnonterminal = nextnonterminal.unsqueeze(1) 736 | 737 | nextvalues = mb_next_extrinsic_values[t] 738 | 739 | ''' 740 | In computing delta, we do not use [nextnonterinal] because 741 | [nextvalues] should be zero if the episode was finished 742 | before the maximum episode length. 743 | 744 | If the episode was finished by going over horizon, we have 745 | to deal with the [nextvalues] that is not zero, but 746 | [nextnonterminal] is still 0. 747 | 748 | Therefore, we do not consider [nextnonterminal] here. 749 | ''' 750 | delta = mb_rewards[t] + self.gamma * nextvalues - mb_extrinsic_values[t] 751 | mb_adv = lastgaelam = delta + self.gamma * gae_tau * nextnonterminal * lastgaelam 752 | mb_advs.append(mb_adv) 753 | 754 | mb_advs.reverse() 755 | return mb_advs 756 | 757 | def grad_advantages_first_terms_sum(self, grad_advs, grad_start): 758 | 759 | num_timestep = grad_start.shape[0] 760 | num_actors = grad_start.shape[1] 761 | 762 | adv_sum = 0 763 | 764 | for i in range(num_timestep): 765 | for j in range(num_actors): 766 | if grad_start[i, j]: 767 | adv_sum = adv_sum + grad_advs[i][j] 768 | 769 | return adv_sum 770 | 771 | def clear_experience_buffer_grads(self): 772 | 773 | ''' 774 | Clear computation graph attached to the tensors in the experience buffer. 
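Every tensor stored in the buffer is replaced by its detached counterpart,
so the subsequent PPO update cannot backpropagate through the rollout.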
775 | ''' 776 | 777 | with th.no_grad(): 778 | 779 | for k in self.experience_buffer.tensor_dict.keys(): 780 | 781 | if not isinstance(self.experience_buffer.tensor_dict[k], th.Tensor): 782 | 783 | continue 784 | 785 | self.experience_buffer.tensor_dict[k] = self.experience_buffer.tensor_dict[k].detach() 786 | 787 | def train_actor_ppo(self, batch_dict): 788 | 789 | raise NotImplementedError() 790 | 791 | def prepare_dataset(self, batch_dict): 792 | 793 | obses = batch_dict['obses'] 794 | advantages = batch_dict['advantages'] 795 | dones = batch_dict['dones'] 796 | values = batch_dict['values'] 797 | actions = batch_dict['actions'] 798 | neglogpacs = batch_dict['neglogpacs'] 799 | mus = batch_dict['mus'] 800 | sigmas = batch_dict['sigmas'] 801 | 802 | advantages = th.sum(advantages, axis=1) 803 | unnormalized_advantages = advantages 804 | 805 | if self.normalize_advantage: 806 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) 807 | 808 | dataset_dict = {} 809 | dataset_dict['old_values'] = values 810 | dataset_dict['advantages'] = advantages 811 | dataset_dict['actions'] = actions 812 | dataset_dict['obs'] = obses 813 | 814 | dataset_dict['old_mu'] = mus 815 | dataset_dict['old_sigma'] = sigmas 816 | dataset_dict['old_logp_actions'] = neglogpacs 817 | 818 | return dataset_dict 819 | 820 | if self.gi_algorithm == "ppo-only": 821 | 822 | dataset_dict['mu'] = mus 823 | dataset_dict['sigma'] = sigmas 824 | dataset_dict['logp_actions'] = neglogpacs 825 | 826 | elif self.gi_algorithm == "grad-ppo-shac": 827 | 828 | with torch.no_grad(): 829 | n_mus, n_sigmas = self.actor.forward_dist(obses) 830 | if n_sigmas.ndim == 1: 831 | n_sigmas = n_sigmas.unsqueeze(0) 832 | n_sigmas = n_sigmas.expand(mus.shape[0], -1).clone() 833 | 834 | n_neglogpacs = self.neglogp(actions, n_mus, n_sigmas, torch.log(n_sigmas)) 835 | 836 | dataset_dict['mu'] = n_mus 837 | dataset_dict['sigma'] = n_sigmas 838 | dataset_dict['logp_actions'] = n_neglogpacs 839 | 840 | # compute [mus] and [sigmas] again here because we could have 841 | # updated policy in [play_steps] using RP gradients; 842 | # find out if the updated policy is still close enough to the 843 | # original policy, because PPO assumes it; 844 | # if it is not close enough, we decrease [alpha]; 845 | 846 | elif self.gi_algorithm == "grad-ppo-alpha": 847 | 848 | with torch.no_grad(): 849 | n_mus, n_sigmas = self.actor.forward_dist(obses) 850 | if n_sigmas.ndim == 1: 851 | n_sigmas = n_sigmas.unsqueeze(0) 852 | n_sigmas = n_sigmas.expand(mus.shape[0], -1).clone() 853 | 854 | n_neglogpacs = self.neglogp(actions, n_mus, n_sigmas, torch.log(n_sigmas)) 855 | 856 | # find out distance between current policy and old policy; 857 | 858 | pac_ratio = torch.exp(torch.clamp(neglogpacs - n_neglogpacs, max=16.)) # prevent [inf]; 859 | out_of_range_pac_ratio = torch.logical_or(pac_ratio < (1. - self.e_clip), 860 | pac_ratio > (1. 
+ self.e_clip)) 861 | out_of_range_pac_ratio = torch.count_nonzero(out_of_range_pac_ratio) / actions.shape[0] 862 | 863 | self.writer.add_scalar("info_alpha/oor_pac_ratio", out_of_range_pac_ratio, self.epoch_num) 864 | 865 | # find out if current policy is better than old policy in terms of lr gradients; 866 | 867 | est_curr_performance = torch.sum(unnormalized_advantages * pac_ratio) - torch.sum(unnormalized_advantages) 868 | # est_curr_performance = torch.sum(advantages * pac_ratio) - torch.sum(advantages) 869 | 870 | n_est_curr_performance = self.est_curr_performace_rms.normalize(est_curr_performance) 871 | self.est_curr_performace_rms.update(est_curr_performance.unsqueeze(0)) 872 | 873 | self.writer.add_scalar("info_alpha/est_curr_performance", est_curr_performance, self.epoch_num) 874 | self.writer.add_scalar("info_alpha/est_curr_performance_n", n_est_curr_performance, self.epoch_num) 875 | 876 | # if current policy is too far from old policy or is worse than old policy, 877 | # decrease alpha; 878 | 879 | if out_of_range_pac_ratio > self.gi_max_dist_rp_lr or \ 880 | (est_curr_performance < 0 and n_est_curr_performance < -1.): 881 | 882 | self.next_alpha = self.gi_curr_alpha / self.gi_update_factor 883 | if self.gi_dynamic_alpha_scheduler in ['dynamic0', 'dynamic2']: 884 | self.next_actor_lr = self.actor_lr / self.gi_update_factor 885 | self.next_alpha = np.clip(self.next_alpha, self.gi_min_alpha, self.gi_max_alpha) 886 | 887 | dataset_dict['mu'] = n_mus 888 | dataset_dict['sigma'] = n_sigmas 889 | dataset_dict['logp_actions'] = n_neglogpacs 890 | 891 | self.dataset.update_values_dict(dataset_dict) 892 | 893 | if self.has_central_value: 894 | raise NotImplementedError() 895 | 896 | # def get_full_state_weights(self): 897 | 898 | # state = super().get_full_state_weights() 899 | 900 | # state['gi_actor'] = self.actor.state_dict() 901 | # state['gi_critic'] = self.critic.state_dict() 902 | # state['gi_target_critic'] = self.target_critic.state_dict() 903 | # if self.normalize_input: 904 | # state['gi_obs_rms'] = self.obs_rms 905 | # return state 906 | 907 | # def set_full_state_weights(self, weights): 908 | 909 | # super().set_full_state_weights(weights) 910 | 911 | # self.actor.load_state_dict(weights['gi_actor']) 912 | # self.critic.load_state_dict(weights['gi_critic']) 913 | # self.target_critic.load_state_dict(weights['gi_target_critic']) 914 | # if self.normalize_input: 915 | # self.obs_rms = weights['gi_obs_rms'].to(self.ppo_device) 916 | 917 | # def calc_gradients(self, input_dict): 918 | 919 | # # ================================================= 920 | 921 | # value_preds_batch = input_dict['old_values'] 922 | # advantage = input_dict['advantages'] 923 | # actions_batch = input_dict['actions'] 924 | # obs_batch = input_dict['obs'] 925 | 926 | # # these old mu and sigma are used to compute new policy's KL div from 927 | # # the old policy, which could be used to update learning rate later; 928 | # # it is not directly involved in policy updates; 929 | # old_mu_batch = input_dict['mu'] 930 | # old_sigma_batch = input_dict['sigma'] 931 | 932 | # if self.gi_algorithm == "grad-ppo-alpha": 933 | # old_action_log_probs_batch_0 = input_dict['old_logp_actions'] # action log probs before alpha update; 934 | # old_action_log_probs_batch_1 = input_dict['logp_actions'] # action log probs after alpha update; 935 | # else: 936 | # old_action_log_probs_batch = input_dict['old_logp_actions'] # original action log probs; 937 | 938 | # lr_mul = 1.0 939 | # curr_e_clip = lr_mul * self.e_clip 940 | 
941 | # if self.is_rnn: 942 | # raise NotImplementedError() 943 | 944 | # for param in self.actor.parameters(): 945 | # if torch.any(torch.isnan(param.data)) or torch.any(torch.isinf(param.data)): 946 | # print("Invalid param 1") 947 | # exit(-1) 948 | 949 | # # get current policy's actions; 950 | # curr_mu, curr_std = self.actor.forward_dist(obs_batch) 951 | # if curr_std.ndim == 1: 952 | # curr_std = curr_std.unsqueeze(0) 953 | # curr_std = curr_std.expand(curr_mu.shape[0], -1).clone() 954 | # neglogp = self.neglogp(actions_batch, curr_mu, curr_std, torch.log(curr_std)) 955 | 956 | # # min_std = float(1e-5) 957 | # # tmp_curr_std = curr_std 958 | # # while True: 959 | # # neglogp = self.neglogp(actions_batch, curr_mu, tmp_curr_std, torch.log(tmp_curr_std)) 960 | # # if torch.any(torch.isnan(neglogp)) or torch.any(torch.isinf(neglogp)): 961 | 962 | # # # isnan_ind = torch.isnan(neglogp) 963 | # # # isinf_ind = torch.isinf(neglogp) 964 | # # # # print(actions_batch[isnan_ind]) 965 | # # # # print(curr_mu[isnan_ind]) 966 | # # # # print(tmp_curr_std[isnan_ind]) 967 | 968 | # # # # print(actions_batch[isinf_ind]) 969 | # # # # print(curr_mu[isinf_ind]) 970 | # # # # print(tmp_curr_std[isinf_ind]) 971 | 972 | # # print(min_std) 973 | # # tmp_curr_std = torch.clamp(curr_std, min=min_std) 974 | # # min_std *= 2. 975 | # # exit(-1) 976 | # # else: 977 | # # break 978 | 979 | # if self.gi_algorithm == "grad-ppo-alpha": 980 | # a_loss = _grad_common_losses.actor_loss_alpha(old_action_log_probs_batch_0, 981 | # old_action_log_probs_batch_1, 982 | # neglogp, 983 | # advantage, 984 | # self.ppo, 985 | # curr_e_clip) 986 | # else: 987 | # a_loss = _grad_common_losses.actor_loss(old_action_log_probs_batch, 988 | # neglogp, 989 | # advantage, 990 | # self.ppo, 991 | # curr_e_clip) 992 | 993 | # c_loss = torch.zeros((1,), device=self.ppo_device) 994 | # b_loss = self.bound_loss(curr_mu) 995 | 996 | # # do not have entropy coef for now; 997 | # losses, sum_mask = torch_ext.apply_masks([a_loss.unsqueeze(1), b_loss.unsqueeze(1)], None) 998 | # a_loss, b_loss = losses[0], losses[1] 999 | 1000 | # entropy = torch.zeros((1,), device=self.ppo_device) 1001 | # assert self.entropy_coef == 0., "" 1002 | 1003 | # # we only use actor loss here for fair comparison; 1004 | # loss = a_loss 1005 | 1006 | # self.ppo_optimizer.zero_grad() 1007 | # if self.multi_gpu: 1008 | # raise NotImplementedError() 1009 | # else: 1010 | # for param in self.actor.parameters(): 1011 | # param.grad = None 1012 | 1013 | # loss.backward() 1014 | 1015 | # #TODO: Refactor this ugliest code of they year 1016 | # if self.truncate_grads: 1017 | # if self.multi_gpu: 1018 | # raise NotImplementedError() 1019 | # else: 1020 | # nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_norm) 1021 | # self.ppo_optimizer.step() 1022 | # else: 1023 | # self.ppo_optimizer.step() 1024 | 1025 | # for param in self.actor.parameters(): 1026 | # if torch.any(torch.isnan(param.data)) or torch.any(torch.isinf(param.data)): 1027 | 1028 | # print("Invalid param 2") 1029 | # print(loss) 1030 | # print(a_loss) 1031 | 1032 | # # print(_grad_common_losses.actor_loss_alpha(old_action_log_probs_batch_0, 1033 | # # old_action_log_probs_batch_1, 1034 | # # neglogp, 1035 | # # advantage, 1036 | # # self.ppo, 1037 | # # curr_e_clip)) 1038 | # exit(-1) 1039 | 1040 | # with torch.no_grad(): 1041 | # reduce_kl = not self.is_rnn 1042 | # kl_dist = torch_ext.policy_kl(curr_mu.detach(), curr_std.detach(), old_mu_batch, old_sigma_batch, reduce_kl) 1043 | # if self.is_rnn: 
1044 | # raise NotImplementedError() 1045 | 1046 | # self.train_result = (a_loss, c_loss, entropy, \ 1047 | # kl_dist, self.last_lr, lr_mul, \ 1048 | # curr_mu.detach(), curr_std.detach(), b_loss) 1049 | 1050 | # def update_lr(self, lr): 1051 | # if self.multi_gpu: 1052 | # lr_tensor = torch.tensor([lr]) 1053 | # self.hvd.broadcast_value(lr_tensor, 'learning_rate') 1054 | # lr = lr_tensor.item() 1055 | 1056 | # for param_group in self.ppo_optimizer.param_groups: 1057 | # param_group['lr'] = lr 1058 | 1059 | # def differentiate_grad_advantages(self, 1060 | # grad_actions: torch.Tensor, 1061 | # grad_advs: torch.Tensor, 1062 | # grad_start: torch.Tensor, 1063 | # debug: bool=False): 1064 | 1065 | # ''' 1066 | # Compute first-order gradients of [grad_advs] w.r.t. [grad_actions] using automatic differentiation. 1067 | # ''' 1068 | 1069 | # num_timestep = grad_start.shape[0] 1070 | # num_actor = grad_start.shape[1] 1071 | 1072 | # adv_sum: torch.Tensor = self.grad_advantages_first_terms_sum(grad_advs, grad_start) 1073 | 1074 | # # compute gradients; 1075 | 1076 | # # first-order gradient; 1077 | 1078 | # # adv_gradient = torch.autograd.grad(adv_sum, grad_actions, retain_graph=debug) 1079 | # # adv_gradient = torch.stack(adv_gradient) 1080 | 1081 | # for ga in grad_actions: 1082 | # ga.retain_grad() 1083 | # adv_sum.backward(retain_graph=debug) 1084 | # adv_gradient = [] 1085 | # for ga in grad_actions: 1086 | # adv_gradient.append(ga.grad) 1087 | # adv_gradient = torch.stack(adv_gradient) 1088 | 1089 | # # reweight grads; 1090 | 1091 | # with torch.no_grad(): 1092 | 1093 | # c = (1.0 / (self.gamma * self.tau)) 1094 | # cv = torch.ones((num_actor, 1), device=adv_gradient.device) 1095 | 1096 | # for nt in range(num_timestep): 1097 | 1098 | # # if new episode has been started, set [cv] to 1; 1099 | # for na in range(num_actor): 1100 | # if grad_start[nt, na]: 1101 | # cv[na, 0] = 1.0 1102 | 1103 | # adv_gradient[nt] = adv_gradient[nt] * cv 1104 | # cv = cv * c 1105 | 1106 | # if debug: 1107 | 1108 | # # compute gradients in brute force and compare; 1109 | # # this is to prove correctness of efficient computation of GAE-based advantage w.r.t. 
actions; 1110 | 1111 | # for i in range(num_timestep): 1112 | 1113 | # debug_adv_sum = grad_advs[i].sum() 1114 | 1115 | # debug_grad_adv_gradient = torch.autograd.grad(debug_adv_sum, grad_actions[i], retain_graph=True)[0] 1116 | # debug_grad_adv_gradient_norm = torch.norm(debug_grad_adv_gradient, p=2, dim=-1) 1117 | 1118 | # debug_grad_error = torch.norm(debug_grad_adv_gradient - adv_gradient[i], p=2, dim=-1) 1119 | # debug_grad_error_ratio = debug_grad_error / debug_grad_adv_gradient_norm 1120 | 1121 | # assert torch.all(debug_grad_error_ratio < 0.01), \ 1122 | # "Gradient of advantage possibly wrong" 1123 | 1124 | # adv_gradient = adv_gradient.detach() 1125 | 1126 | # return adv_gradient 1127 | 1128 | # def action_eps_jacobian(self, mu, sigma, eps): 1129 | 1130 | # jacobian = torch.zeros((eps.shape[0], eps.shape[1], eps.shape[1])) 1131 | 1132 | # for d in range(eps.shape[1]): 1133 | 1134 | # if sigma.ndim == 1: 1135 | # jacobian[:, d, d] = sigma[d].detach() 1136 | # elif sigma.ndim == 2: 1137 | # jacobian[:, d, d] = sigma[:, d].detach() 1138 | 1139 | # return jacobian 1140 | 1141 | # ''' 1142 | # distr = GradNormal(mu, sigma) 1143 | # eps.requires_grad = True 1144 | # actions = distr.eps_to_action(eps) 1145 | 1146 | # jacobian = torch.zeros((eps.shape[0], actions.shape[1], eps.shape[1])) 1147 | 1148 | # for d in range(actions.shape[1]): 1149 | # target = torch.sum(actions[:, d]) 1150 | # grad = torch.autograd.grad(target, eps, retain_graph=True) 1151 | # grad = torch.stack(grad) 1152 | # jacobian[:, d, :] = grad 1153 | 1154 | # return jacobian 1155 | # ''' 1156 | 1157 | def use_ppo(self): 1158 | ''' 1159 | Whether or not to use PPO. 1160 | ''' 1161 | raise NotImplementedError() 1162 | 1163 | ''' 1164 | Logging 1165 | ''' 1166 | def write_stats(self, total_time, epoch_num, step_time, no_ppo_time, ppo_time, ppo_loss, frame, scaled_time, scaled_play_time, curr_frames): 1167 | 1168 | mean_ppo_loss = th.tensor(ppo_loss).mean().item() 1169 | 1170 | self.writer.add_scalar('performance/step_inference_rl_update_fps', curr_frames / scaled_time, frame) 1171 | self.writer.add_scalar('performance/step_inference_fps', curr_frames / scaled_play_time, frame) 1172 | self.writer.add_scalar('performance/step_fps', curr_frames / step_time, frame) 1173 | self.writer.add_scalar('performance/no_ppo_time', no_ppo_time, frame) 1174 | self.writer.add_scalar('performance/ppo_time', ppo_time, frame) 1175 | self.writer.add_scalar('performance/step_time', step_time, frame) 1176 | self.writer.add_scalar('losses/ppo_loss', mean_ppo_loss, frame) 1177 | self.writer.add_scalar('info/epochs', epoch_num, frame) 1178 | 1179 | if self.use_ppo(): 1180 | self.writer.add_scalar('info/e_clip', self.e_clip, frame) 1181 | # self.algo_observer.after_print_stats(frame, epoch_num, total_time) 1182 | 1183 | def get_weights(self): 1184 | state = self.get_stats_weights() 1185 | state['actor'] = self.actor.state_dict() 1186 | state['critic'] = self.critic.state_dict() 1187 | return state 1188 | 1189 | def get_stats_weights(self): 1190 | state = {} 1191 | # if self.normalize_input: 1192 | # state['running_mean_std'] = self.running_mean_std.state_dict() 1193 | # if self.normalize_value: 1194 | # state['reward_mean_std'] = self.value_mean_std.state_dict() 1195 | return state 1196 | 1197 | def get_optimizers_state(self): 1198 | state = {} 1199 | state['critic'] = self.critic_optimizer.state_dict() 1200 | return state 1201 | 1202 | def get_full_state_weights(self): 1203 | state = self.get_weights() 1204 | state['epoch'] = self.epoch_num 1205 | 
state['optimizers'] = self.get_optimizers_state() 1206 | state['frame'] = self.frame 1207 | 1208 | # This is actually the best reward ever achieved. last_mean_rewards is perhaps not the best variable name 1209 | # We save it to the checkpoint to prevent overriding the "best ever" checkpoint upon experiment restart 1210 | state['last_mean_rewards'] = self.last_mean_rewards 1211 | 1212 | env_state = self.vec_env.get_env_state() 1213 | state['env_state'] = env_state 1214 | 1215 | return state 1216 | 1217 | def safe_filesystem_op(self, func, *args, **kwargs): 1218 | """ 1219 | This is to prevent spurious crashes related to saving checkpoints or restoring from checkpoints in a Network 1220 | Filesystem environment (i.e. NGC cloud or SLURM) 1221 | """ 1222 | num_attempts = 5 1223 | for attempt in range(num_attempts): 1224 | try: 1225 | return func(*args, **kwargs) 1226 | except Exception as exc: 1227 | print(f'Exception {exc} when trying to execute {func} with args:{args} and kwargs:{kwargs}...') 1228 | wait_sec = 2 ** attempt 1229 | print(f'Waiting {wait_sec} before trying again...') 1230 | time.sleep(wait_sec) 1231 | 1232 | raise RuntimeError(f'Could not execute {func}, give up after {num_attempts} attempts...') 1233 | 1234 | def safe_save(self, state, filename): 1235 | return self.safe_filesystem_op(th.save, state, filename) 1236 | 1237 | def save_checkpoint(self, filename, state): 1238 | print("=> saving checkpoint '{}'".format(filename + '.pth')) 1239 | self.safe_save(state, filename + '.pth') 1240 | 1241 | def save(self, fn): 1242 | state = self.get_full_state_weights() 1243 | self.save_checkpoint(fn, state) --------------------------------------------------------------------------------
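For reference, the generalized advantage estimation (GAE) recursion implemented in `RLAlgorithm.grad_advantages` above is summarized by the standalone sketch below. The helper name `gae_advantages` and the toy inputs in the `__main__` block are illustrative assumptions; `gamma` and `tau` play the same roles as the config parameters of the same names.

```python
# Standalone sketch of the GAE recursion used in RLAlgorithm.grad_advantages.
# Shapes: values / next_values / rewards / fdones are lists of length T holding
# tensors of shape (num_actors, 1); last_fdones has shape (num_actors,).
# The toy inputs below are assumptions purely for illustration.
import torch as th

def gae_advantages(values, next_values, rewards, fdones, last_fdones,
                   gamma=0.99, tau=0.95):
    T = len(values)
    advs = []
    lastgaelam = 0.0
    for t in reversed(range(T)):
        if t == T - 1:
            nextnonterminal = 1.0 - last_fdones
        else:
            nextnonterminal = 1.0 - fdones[t + 1]
        nextnonterminal = nextnonterminal.unsqueeze(1)
        # next_values[t] is assumed to be zeroed out on early termination, so
        # the one-step TD error does not gate the bootstrap term on
        # nextnonterminal (mirroring the reasoning in grad_advantages).
        delta = rewards[t] + gamma * next_values[t] - values[t]
        lastgaelam = delta + gamma * tau * nextnonterminal * lastgaelam
        advs.append(lastgaelam)
    advs.reverse()
    return advs

if __name__ == "__main__":
    T, N = 4, 3  # toy horizon length and number of actors
    values = [th.zeros(N, 1) for _ in range(T)]
    next_values = [th.zeros(N, 1) for _ in range(T)]
    rewards = [th.ones(N, 1) for _ in range(T)]
    fdones = [th.zeros(N) for _ in range(T)]
    last_fdones = th.zeros(N)
    advs = gae_advantages(values, next_values, rewards, fdones, last_fdones)
    print([a.squeeze(1).tolist() for a in advs])
```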