├── envs ├── __init__.py ├── func_optim │ ├── base.py │ ├── dejong.py │ └── ackley.py └── base.py ├── src └── gippo │ ├── __init__.py │ ├── runner.py │ ├── dataset.py │ ├── rl_algorithm │ ├── rp.py │ ├── lr.py │ ├── lrp.py │ ├── ppo.py │ ├── gippo.py │ └── base.py │ ├── vecenv.py │ ├── network.py │ ├── utils.py │ └── experience.py ├── .gitignore ├── setup.py ├── README.md ├── config └── func_optim │ ├── ackley │ ├── lr.yaml │ ├── rp.yaml │ ├── lrp.yaml │ ├── ppo.yaml │ └── gippo.yaml │ ├── dejong │ ├── lr.yaml │ ├── rp.yaml │ ├── lrp.yaml │ ├── ppo.yaml │ └── gippo.yaml │ ├── ackley64 │ ├── lr.yaml │ ├── lrp.yaml │ ├── rp.yaml │ ├── ppo.yaml │ └── gippo.yaml │ └── dejong64 │ ├── lr.yaml │ ├── lrp.yaml │ ├── rp.yaml │ ├── ppo.yaml │ └── gippo.yaml ├── run_func_optim.sh └── train.py /envs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/gippo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | .vscode/ 3 | __pycache__/ 4 | logdir/ -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="gippo", 5 | version="0.0", 6 | description="Implementation of gradient informed proximal policy optimization (GI-PPO) algorithm", 7 | author="Sanghyun Son", 8 | author_email="shh1295@umd.edu", 9 | # the gippo package lives under src/, so point setuptools at it; 10 | packages=find_packages("src"), 11 | package_dir={"": "src"}, 12 | ) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This is the code repository for the paper ["Gradient Informed Proximal Policy Optimization"](https://arxiv.org/abs/2312.08710), which was presented at the NeurIPS 2023 conference. The code builds on [rl_games](https://github.com/Denys88/rl_games) and [SHAC](https://github.com/NVlabs/DiffRL). 4 | 5 | # Installation 6 | 7 | The following packages are required. 8 | 9 | * pytorch 1.13.1 (https://pytorch.org/get-started/previous-versions/) 10 | * pyyaml 6.0.1 (pip install pyyaml) 11 | * tensorboard (pip install tensorboard) 12 | * tensorboardx 2.6.2 (pip install tensorboardx) 13 | * urdfpy (pip install urdfpy) 14 | * usd-core 23.8 (pip install usd-core) 15 | * ray 2.6.2 (pip install ray) 16 | * ninja 1.10.2 (conda install -c conda-forge ninja) 17 | * cudatoolkit (conda install -c anaconda cudatoolkit) 18 | * cudatoolkit-dev (conda install -c conda-forge cudatoolkit-dev) 19 | * optuna 3.2.0 (pip install optuna) 20 | * optuna-dashboard 0.11.0 (pip install optuna-dashboard) 21 | * matplotlib (pip install matplotlib) 22 | * highway-env 1.8.2 (pip install highway-env) 23 | * seaborn (pip install seaborn) 24 | * gym (pip install gym) 25 | 26 | Then, run the following command to install this package. 27 | 28 | ```bash 29 | pip install -e . 30 | ``` 31 | 32 | # Usage 33 | 34 | The function optimization experiments can be launched one configuration at a time or as a full sweep.
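To launch a single configuration, call `train.py` directly. A minimal example (any YAML under `./config/func_optim/` works; the flags are the ones defined in `train.py`):

```bash
python ./train.py --cfg ./config/func_optim/dejong/gippo.yaml --logdir ./logdir/func_optim/dejong/gippo/ --seed 1 --device cpu
```

To run the full sweep over all benchmark functions, algorithms, and seeds, run the command below.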
35 | 36 | ```bash 37 | bash ./run_func_optim.sh 38 | ``` 39 | -------------------------------------------------------------------------------- /config/func_optim/ackley/lr.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: lr 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-4 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/ackley/rp.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: rp 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-3 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/dejong/lr.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: lr 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | 
critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-3 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/dejong/rp.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: rp 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-2 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/ackley/lrp.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: lrp 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | 
defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-4 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/ackley64/lr.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: lr 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 3e-4 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/ackley64/lrp.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: lrp 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 3e-4 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/ackley64/rp.yaml: 
-------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: rp 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-3 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/dejong/lrp.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: lrp 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-2 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/dejong64/lr.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: lr 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | 
normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-3 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/dejong64/lrp.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: lrp 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-3 63 | 64 | # critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/dejong64/rp.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: rp 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | minibatch_size: 64 50 | mini_epochs: 5 51 | 52 | critic_coef: 4 53 | clip_value: True 54 | 55 | defer_summaries_sec: 0.001 56 | summaries_interval_sec_min: 0.001 57 | summaries_interval_sec_max: 0.002 58 | 59 | 60 | 61 | # actor 62 | actor_learning_rate: 1e-2 63 | 64 | # 
critic 65 | critic_learning_rate: 1e-3 66 | critic_iterations: 16 67 | critic_num_batch: 4 68 | target_critic_alpha: 0.2 69 | 70 | # learning rate scheduler 71 | lr_schedule: linear # [constant, linear] 72 | 73 | # adam 74 | betas: [0.7, 0.95] -------------------------------------------------------------------------------- /config/func_optim/ackley/ppo.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: ppo 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | 50 | critic_coef: 4 51 | clip_value: True 52 | 53 | defer_summaries_sec: 0.001 54 | summaries_interval_sec_min: 0.001 55 | summaries_interval_sec_max: 0.002 56 | 57 | # actor 58 | actor_learning_rate: 1e-4 59 | 60 | # critic 61 | critic_learning_rate: 1e-3 62 | critic_iterations: 16 63 | critic_num_batch: 4 64 | target_critic_alpha: 0.2 65 | 66 | # learning rate scheduler 67 | lr_schedule: linear # [constant, linear] 68 | 69 | # adam 70 | betas: [0.7, 0.95] 71 | 72 | # ppo 73 | ppo: 74 | e_clip: 0.2 75 | minibatch_size: 64 76 | mini_epochs: 5 -------------------------------------------------------------------------------- /config/func_optim/ackley64/ppo.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: ppo 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | 50 | critic_coef: 4 51 | clip_value: True 52 | 53 | defer_summaries_sec: 0.001 54 | summaries_interval_sec_min: 0.001 55 | summaries_interval_sec_max: 0.002 56 | 57 | # actor 58 | actor_learning_rate: 1e-2 59 | 60 | # critic 61 | critic_learning_rate: 1e-3 62 | critic_iterations: 16 63 | critic_num_batch: 4 64 | target_critic_alpha: 0.2 65 | 66 | # learning rate scheduler 67 | lr_schedule: linear # [constant, linear] 68 | 69 | # adam 70 | betas: [0.7, 0.95] 71 | 72 | # ppo 73 | ppo: 74 | e_clip: 0.2 75 | minibatch_size: 64 76 | mini_epochs: 5 -------------------------------------------------------------------------------- /config/func_optim/dejong/ppo.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 
| name: DejongEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: ppo 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | 50 | critic_coef: 4 51 | clip_value: True 52 | 53 | defer_summaries_sec: 0.001 54 | summaries_interval_sec_min: 0.001 55 | summaries_interval_sec_max: 0.002 56 | 57 | # actor 58 | actor_learning_rate: 1e-4 59 | 60 | # critic 61 | critic_learning_rate: 1e-3 62 | critic_iterations: 16 63 | critic_num_batch: 4 64 | target_critic_alpha: 0.2 65 | 66 | # learning rate scheduler 67 | lr_schedule: linear # [constant, linear] 68 | 69 | # adam 70 | betas: [0.7, 0.95] 71 | 72 | # ppo 73 | ppo: 74 | e_clip: 0.2 75 | minibatch_size: 64 76 | mini_epochs: 5 -------------------------------------------------------------------------------- /config/func_optim/dejong64/ppo.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: ppo 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | 50 | critic_coef: 4 51 | clip_value: True 52 | 53 | defer_summaries_sec: 0.001 54 | summaries_interval_sec_min: 0.001 55 | summaries_interval_sec_max: 0.002 56 | 57 | # actor 58 | actor_learning_rate: 1e-2 59 | 60 | # critic 61 | critic_learning_rate: 1e-3 62 | critic_iterations: 16 63 | critic_num_batch: 4 64 | target_critic_alpha: 0.2 65 | 66 | # learning rate scheduler 67 | lr_schedule: linear # [constant, linear] 68 | 69 | # adam 70 | betas: [0.7, 0.95] 71 | 72 | # ppo 73 | ppo: 74 | e_clip: 0.2 75 | minibatch_size: 64 76 | mini_epochs: 5 -------------------------------------------------------------------------------- /config/func_optim/ackley/gippo.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: gippo 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | 
normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | 50 | critic_coef: 4 51 | clip_value: True 52 | 53 | # actor 54 | actor_learning_rate: 1e-4 # ppo 55 | actor_learning_rate_no_ppo: 1e-3 # analytical grads 56 | 57 | # critic 58 | critic_learning_rate: 1e-3 59 | critic_iterations: 16 60 | critic_num_batch: 4 61 | target_critic_alpha: 0.2 62 | 63 | # learning rate scheduler 64 | lr_schedule: linear # [constant, linear] 65 | 66 | # adam 67 | betas: [0.7, 0.95] 68 | 69 | # ppo 70 | ppo: 71 | e_clip: 0.2 72 | minibatch_size: 64 73 | mini_epochs: 5 74 | 75 | # gippo 76 | gi: 77 | alpha: 1e-5 78 | alpha_interval: 0.40 79 | alpha_update_factor: 1.1 80 | max_alpha: 1e-0 81 | num_iter: 16 82 | max_oorr: 0.5 -------------------------------------------------------------------------------- /config/func_optim/dejong/gippo.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 1 9 | 10 | algo: 11 | name: gippo 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | 50 | critic_coef: 4 51 | clip_value: True 52 | 53 | # actor 54 | actor_learning_rate: 1e-4 # ppo 55 | actor_learning_rate_no_ppo: 1e-3 # analytical grads 56 | 57 | # critic 58 | critic_learning_rate: 1e-3 59 | critic_iterations: 16 60 | critic_num_batch: 4 61 | target_critic_alpha: 0.2 62 | 63 | # learning rate scheduler 64 | lr_schedule: linear # [constant, linear] 65 | 66 | # adam 67 | betas: [0.7, 0.95] 68 | 69 | # ppo 70 | ppo: 71 | e_clip: 0.2 72 | minibatch_size: 64 73 | mini_epochs: 5 74 | 75 | # gippo 76 | gi: 77 | alpha: 1e-5 78 | alpha_interval: 0.40 79 | alpha_update_factor: 1.1 80 | max_alpha: 1e-0 81 | num_iter: 16 82 | max_oorr: 0.5 -------------------------------------------------------------------------------- /config/func_optim/ackley64/gippo.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: AckleyEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: gippo 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | 50 | critic_coef: 4 51 | clip_value: True 
52 | 53 | # actor 54 | actor_learning_rate: 1e-2 # ppo 55 | actor_learning_rate_no_ppo: 1e-3 # analytical grads 56 | 57 | # critic 58 | critic_learning_rate: 1e-3 59 | critic_iterations: 16 60 | critic_num_batch: 4 61 | target_critic_alpha: 0.2 62 | 63 | # learning rate scheduler 64 | lr_schedule: linear # [constant, linear] 65 | 66 | # adam 67 | betas: [0.7, 0.95] 68 | 69 | # ppo 70 | ppo: 71 | e_clip: 0.2 72 | minibatch_size: 64 73 | mini_epochs: 5 74 | 75 | # gippo 76 | gi: 77 | alpha: 1e-5 78 | alpha_interval: 0.40 79 | alpha_update_factor: 1.1 80 | max_alpha: 1e-0 81 | num_iter: 16 82 | max_oorr: 0.5 -------------------------------------------------------------------------------- /config/func_optim/dejong64/gippo.yaml: -------------------------------------------------------------------------------- 1 | params: 2 | seed: 1 3 | device: 'cuda:0' 4 | 5 | env: 6 | name: DejongEnv 7 | config: 8 | dim: 64 9 | 10 | algo: 11 | name: gippo 12 | 13 | # network 14 | network: 15 | actor: ActorStochasticMLP 16 | actor_mlp: 17 | units: [32, 32] 18 | activation: elu 19 | actor_logstd_init: 0.0 20 | fixed_sigma: False 21 | 22 | critic: CriticMLP 23 | critic_mlp: 24 | units: [32, 32] 25 | activation: elu 26 | 27 | # length 28 | horizon_length: 1 29 | max_epochs: 2000 30 | 31 | # normalize 32 | normalize_input: True 33 | normalize_value: True 34 | normalize_advantage: True 35 | 36 | # GAE 37 | gamma: 0.99 38 | tau: 0.95 39 | 40 | # save 41 | save_best_after: 50 42 | save_frequency: 100 43 | 44 | grad_norm: 1.0 45 | truncate_grads: True 46 | steps_num: 1 47 | 48 | num_actors: 64 49 | 50 | critic_coef: 4 51 | clip_value: True 52 | 53 | # actor 54 | actor_learning_rate: 1e-2 # ppo 55 | actor_learning_rate_no_ppo: 1e-3 # analytical grads 56 | 57 | # critic 58 | critic_learning_rate: 1e-3 59 | critic_iterations: 16 60 | critic_num_batch: 4 61 | target_critic_alpha: 0.2 62 | 63 | # learning rate scheduler 64 | lr_schedule: linear # [constant, linear] 65 | 66 | # adam 67 | betas: [0.7, 0.95] 68 | 69 | # ppo 70 | ppo: 71 | e_clip: 0.2 72 | minibatch_size: 64 73 | mini_epochs: 5 74 | 75 | # gippo 76 | gi: 77 | alpha: 1e-5 78 | alpha_interval: 0.40 79 | alpha_update_factor: 1.1 80 | max_alpha: 1e-0 81 | num_iter: 16 82 | max_oorr: 0.5 -------------------------------------------------------------------------------- /envs/func_optim/base.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | 3 | from envs.base import BaseEnv 4 | 5 | class FuncOptimEnv(BaseEnv): 6 | 7 | def __init__(self, 8 | num_envs, 9 | dim=1, 10 | seed=0, 11 | no_grad=True, 12 | render=False, 13 | device='cuda:0'): 14 | 15 | super(FuncOptimEnv, self).__init__( 16 | num_envs=num_envs, 17 | num_obs=1, 18 | num_act=dim, 19 | episode_length=1, 20 | seed=seed, 21 | no_grad=no_grad, 22 | render=render, 23 | device=device 24 | ) 25 | 26 | self.dim = dim 27 | self.render_resolution = 1e3 28 | 29 | def preprocess_actions(self, actions: th.Tensor): 30 | actions = actions.view((self.num_envs, self.num_actions)) 31 | actions = th.clip(actions, -1., 1.) 
32 | return actions 33 | 34 | def step(self, actions: th.Tensor): 35 | actions = self.preprocess_actions(actions) 36 | self.actions = actions 37 | 38 | self.reset_buf = th.zeros_like(self.reset_buf) 39 | 40 | self.progress_buf += 1 41 | self.num_frames += 1 42 | 43 | self.calculateObservations() 44 | self.calculateReward() 45 | 46 | if self.no_grad == False: 47 | self.obs_buf_before_reset = self.obs_buf.clone() 48 | self.extras = { 49 | 'obs_before_reset': self.obs_buf_before_reset, 50 | 'episode_end': self.termination_buf 51 | } 52 | 53 | self.reset() 54 | return self.obs_buf, self.rew_buf, self.reset_buf, self.extras 55 | 56 | def reset(self): 57 | 58 | self.calculateObservations() 59 | self.progress_buf[:] = 0 60 | 61 | return self.obs_buf 62 | 63 | def calculateObservations(self): 64 | 65 | self.obs_buf = th.zeros_like(self.obs_buf) 66 | 67 | def calculateReward(self): 68 | 69 | self.rew_buf = self.evaluate(self.actions) 70 | 71 | # reset agents 72 | self.reset_buf = th.where(self.progress_buf > self.episode_length - 1, th.ones_like(self.reset_buf), self.reset_buf) 73 | 74 | def evaluate(self, x: th.Tensor): 75 | 76 | raise NotImplementedError() -------------------------------------------------------------------------------- /src/gippo/runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import random 4 | from copy import deepcopy 5 | 6 | import numpy as np 7 | import torch as th 8 | 9 | from gippo.rl_algorithm.lr import LR 10 | from gippo.rl_algorithm.rp import RP 11 | from gippo.rl_algorithm.lrp import LRP 12 | from gippo.rl_algorithm.ppo import PPO 13 | from gippo.rl_algorithm.gippo import GIPPO 14 | from gippo.vecenv import create_vecenv 15 | 16 | class Runner: 17 | 18 | def __init__(self): 19 | th.backends.cudnn.benchmark = True 20 | 21 | def reset(self): 22 | pass 23 | 24 | def load_config(self, params): 25 | self.seed = params.get('seed', None) 26 | if self.seed is None: 27 | self.seed = int(time.time()) 28 | 29 | print(f"self.seed = {self.seed}") 30 | 31 | self.algo_params = params['algo'] 32 | self.algo_name = self.algo_params['name'] 33 | self.exp_config = None 34 | 35 | if self.seed: 36 | th.manual_seed(self.seed) 37 | th.cuda.manual_seed_all(self.seed) 38 | np.random.seed(self.seed) 39 | random.seed(self.seed) 40 | 41 | # deal with environment specific seed if applicable 42 | if 'config' in params['env']: 43 | params['env']['config']['seed'] = self.seed 44 | 45 | self.params = params 46 | 47 | def load(self, yaml_config): 48 | config = deepcopy(yaml_config) 49 | self.default_config = deepcopy(config['params']) 50 | self.load_config(params=self.default_config) 51 | 52 | def run_train(self, args): 53 | print('Started to train') 54 | 55 | algo_config = self.params['algo'] 56 | env_config = self.params['env'] 57 | device = self.params['device'] 58 | log_path = self.params['log_path'] 59 | 60 | if self.algo_name == 'lr': 61 | agent = LR(algo_config, env_config, device, log_path) 62 | elif self.algo_name == 'rp': 63 | agent = RP(algo_config, env_config, device, log_path) 64 | elif self.algo_name == 'lrp': 65 | agent = LRP(algo_config, env_config, device, log_path) 66 | elif self.algo_name == 'ppo': 67 | agent = PPO(algo_config, env_config, device, log_path) 68 | elif self.algo_name == 'gippo': 69 | agent = GIPPO(algo_config, env_config, device, log_path) 70 | else: 71 | raise NotImplementedError() 72 | # _restore(agent, args) 73 | # _override_sigma(agent, args) 74 | agent.train() 
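Below is a minimal sketch of driving `Runner` programmatically; it simply mirrors what `train.py` (the supported entry point) does. The config and log-directory paths are only examples, and the environment registration must happen before `run_train()`, because the algorithms build their vectorized environment through `gippo.vecenv.create_vecenv()`.

```python
import os
import yaml

from gippo import vecenv
from gippo.runner import Runner
from envs.func_optim.dejong import DejongEnv

# register the vectorized-environment wrapper and a concrete environment,
# exactly as train.py does;
vecenv.register_vecenv_config(
    'BASE',
    lambda env_name, num_actors, **kwargs: vecenv.BaseVecEnv(env_name, num_actors, **kwargs))
vecenv.register_env_config(
    'DejongEnv',
    {'vecenv_type': 'BASE', 'env_creator': lambda **kwargs: DejongEnv(**kwargs)})

# load a training config and fill in the two keys that train.py normally
# injects from the command line;
with open('./config/func_optim/dejong/gippo.yaml', 'r') as f:
    cfg = yaml.load(f, Loader=yaml.SafeLoader)
cfg['params']['log_path'] = './logdir/func_optim/dejong/gippo/manual'
cfg['params']['device'] = 'cpu'
os.makedirs(cfg['params']['log_path'], exist_ok=True)

runner = Runner()
runner.load(cfg)
runner.run_train({})  # the args dict is currently unused inside run_train()
```

Note that `train.py` additionally dumps the merged config to the log directory and injects the command-line seed, so it is the preferred way to launch reproducible runs.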
-------------------------------------------------------------------------------- /src/gippo/dataset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | From 3 | https://github.com/Denys88/rl_games/blob/master/rl_games/common/datasets.py 4 | https://github.com/NVlabs/DiffRL/blob/main/utils/dataset.py 5 | ''' 6 | import numpy as np 7 | from torch.utils.data import Dataset 8 | 9 | class PPODataset(Dataset): 10 | def __init__(self, batch_size, minibatch_size, device): 11 | self.batch_size = batch_size 12 | self.minibatch_size = minibatch_size 13 | self.device = device 14 | self.length = self.batch_size // self.minibatch_size 15 | self.special_names = [] 16 | 17 | def update_values_dict(self, values_dict): 18 | self.values_dict = values_dict 19 | 20 | def update_mu_sigma(self, mu, sigma): 21 | start = self.last_range[0] 22 | end = self.last_range[1] 23 | self.values_dict['mu'][start:end] = mu 24 | self.values_dict['sigma'][start:end] = sigma 25 | 26 | def __len__(self): 27 | return self.length 28 | 29 | def _get_item(self, idx): 30 | start = idx * self.minibatch_size 31 | end = (idx + 1) * self.minibatch_size 32 | self.last_range = (start, end) 33 | input_dict = {} 34 | for k,v in self.values_dict.items(): 35 | if k not in self.special_names and v is not None: 36 | if type(v) is dict: 37 | v_dict = { kd:vd[start:end] for kd, vd in v.items() } 38 | input_dict[k] = v_dict 39 | else: 40 | input_dict[k] = v[start:end] 41 | 42 | return input_dict 43 | 44 | def __getitem__(self, idx): 45 | sample = self._get_item(idx) 46 | return sample 47 | 48 | class CriticDataset: 49 | def __init__(self, batch_size, obs, target_values, shuffle = False, drop_last = False): 50 | self.obs = obs.view(-1, obs.shape[-1]) 51 | self.target_values = target_values.view(-1) 52 | self.batch_size = batch_size 53 | 54 | if shuffle: 55 | self.shuffle() 56 | 57 | if drop_last: 58 | self.length = self.obs.shape[0] // self.batch_size 59 | else: 60 | self.length = ((self.obs.shape[0] - 1) // self.batch_size) + 1 61 | 62 | def shuffle(self): 63 | index = np.random.permutation(self.obs.shape[0]) 64 | self.obs = self.obs[index, :] 65 | self.target_values = self.target_values[index] 66 | 67 | def __len__(self): 68 | return self.length 69 | 70 | def __getitem__(self, index): 71 | start_idx = index * self.batch_size 72 | end_idx = min((index + 1) * self.batch_size, self.obs.shape[0]) 73 | return {'obs': self.obs[start_idx:end_idx, :], 'target_values': self.target_values[start_idx:end_idx]} -------------------------------------------------------------------------------- /run_func_optim.sh: -------------------------------------------------------------------------------- 1 | ITER=5 2 | 3 | # Dejong 4 | for (( i=1; i<=${ITER}; i++ )) 5 | do 6 | python ./train.py --cfg ./config/func_optim/dejong/lr.yaml --logdir ./logdir/func_optim/dejong/lr/ --seed ${i} --device cpu 7 | python ./train.py --cfg ./config/func_optim/dejong/rp.yaml --logdir ./logdir/func_optim/dejong/rp/ --seed ${i} --device cpu 8 | python ./train.py --cfg ./config/func_optim/dejong/lrp.yaml --logdir ./logdir/func_optim/dejong/lrp/ --seed ${i} --device cpu 9 | python ./train.py --cfg ./config/func_optim/dejong/ppo.yaml --logdir ./logdir/func_optim/dejong/ppo/ --seed ${i} --device cpu 10 | python ./train.py --cfg ./config/func_optim/dejong/gippo.yaml --logdir ./logdir/func_optim/dejong/gippo/ --seed ${i} --device cpu 11 | done 12 | 13 | # Dejong 64 14 | for (( i=1; i<=${ITER}; i++ )) 15 | do 16 | python ./train.py --cfg 
./config/func_optim/dejong64/lr.yaml --logdir ./logdir/func_optim/dejong64/lr/ --seed ${i} --device cpu 17 | python ./train.py --cfg ./config/func_optim/dejong64/rp.yaml --logdir ./logdir/func_optim/dejong64/rp/ --seed ${i} --device cpu 18 | python ./train.py --cfg ./config/func_optim/dejong64/lrp.yaml --logdir ./logdir/func_optim/dejong64/lrp/ --seed ${i} --device cpu 19 | python ./train.py --cfg ./config/func_optim/dejong64/ppo.yaml --logdir ./logdir/func_optim/dejong64/ppo/ --seed ${i} --device cpu 20 | python ./train.py --cfg ./config/func_optim/dejong64/gippo.yaml --logdir ./logdir/func_optim/dejong64/gippo/ --seed ${i} --device cpu 21 | done 22 | 23 | # Ackley 24 | for (( i=1; i<=${ITER}; i++ )) 25 | do 26 | python ./train.py --cfg ./config/func_optim/ackley/lr.yaml --logdir ./logdir/func_optim/ackley/lr/ --seed ${i} --device cpu 27 | python ./train.py --cfg ./config/func_optim/ackley/rp.yaml --logdir ./logdir/func_optim/ackley/rp/ --seed ${i} --device cpu 28 | python ./train.py --cfg ./config/func_optim/ackley/lrp.yaml --logdir ./logdir/func_optim/ackley/lrp/ --seed ${i} --device cpu 29 | python ./train.py --cfg ./config/func_optim/ackley/ppo.yaml --logdir ./logdir/func_optim/ackley/ppo/ --seed ${i} --device cpu 30 | python ./train.py --cfg ./config/func_optim/ackley/gippo.yaml --logdir ./logdir/func_optim/ackley/gippo/ --seed ${i} --device cpu 31 | done 32 | 33 | # Ackley 64 34 | for (( i=1; i<=${ITER}; i++ )) 35 | do 36 | python ./train.py --cfg ./config/func_optim/ackley64/lr.yaml --logdir ./logdir/func_optim/ackley64/lr/ --seed ${i} --device cpu 37 | python ./train.py --cfg ./config/func_optim/ackley64/rp.yaml --logdir ./logdir/func_optim/ackley64/rp/ --seed ${i} --device cpu 38 | python ./train.py --cfg ./config/func_optim/ackley64/lrp.yaml --logdir ./logdir/func_optim/ackley64/lrp/ --seed ${i} --device cpu 39 | python ./train.py --cfg ./config/func_optim/ackley64/ppo.yaml --logdir ./logdir/func_optim/ackley64/ppo/ --seed ${i} --device cpu 40 | python ./train.py --cfg ./config/func_optim/ackley64/gippo.yaml --logdir ./logdir/func_optim/ackley64/gippo/ --seed ${i} --device cpu 41 | done -------------------------------------------------------------------------------- /envs/base.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Modified from 3 | https://github.com/NVlabs/DiffRL/blob/main/envs/dflex_env.py 4 | ''' 5 | import numpy as np 6 | import torch as th 7 | 8 | from gym import spaces 9 | 10 | class BaseEnv: 11 | 12 | def __init__(self, 13 | num_envs, 14 | num_obs, 15 | num_act, 16 | episode_length, 17 | seed=0, 18 | no_grad=True, 19 | render=False, 20 | device='cuda:0'): 21 | 22 | self.seed = seed 23 | 24 | self.no_grad = no_grad 25 | 26 | self.episode_length = episode_length 27 | 28 | self.device = device 29 | 30 | self.render = render 31 | 32 | self.sim_time = 0.0 33 | 34 | self.num_frames = 0 # record the number of frames for rendering 35 | 36 | self.num_environments = num_envs 37 | self.num_agents = 1 38 | 39 | # initialize observation and action space 40 | self.num_observations = num_obs 41 | self.num_actions = num_act 42 | 43 | self.obs_space = spaces.Box(np.ones(self.num_observations, dtype=np.float32) * -np.Inf, 44 | np.ones(self.num_observations, dtype=np.float32) * np.Inf) 45 | self.act_space = spaces.Box(np.ones(self.num_actions, dtype=np.float32) * np.float32(-1.), 46 | np.ones(self.num_actions, dtype=np.float32) * np.float32(1.)) 47 | 48 | # allocate buffers 49 | self.obs_buf = th.zeros( 50 | (self.num_envs, 
self.num_observations), device=self.device, dtype=th.float32, requires_grad=False) 51 | self.rew_buf = th.zeros( 52 | self.num_envs, device=self.device, dtype=th.float32, requires_grad=False) 53 | self.reset_buf = th.ones( 54 | self.num_envs, device=self.device, dtype=th.int64, requires_grad=False) 55 | 56 | # end of the episode 57 | self.termination_buf = th.zeros( 58 | self.num_envs, device=self.device, dtype=th.int64, requires_grad=False) 59 | self.progress_buf = th.zeros( 60 | self.num_envs, device=self.device, dtype=th.int64, requires_grad=False) 61 | self.actions = th.zeros( 62 | (self.num_envs, self.num_actions), device = self.device, dtype = th.float32, requires_grad = False) 63 | 64 | self.extras = {} 65 | 66 | def get_number_of_agents(self): 67 | return self.num_agents 68 | 69 | @property 70 | def observation_space(self): 71 | return self.obs_space 72 | 73 | @property 74 | def action_space(self): 75 | return self.act_space 76 | 77 | @property 78 | def num_envs(self): 79 | return self.num_environments 80 | 81 | @property 82 | def num_acts(self): 83 | return self.num_actions 84 | 85 | @property 86 | def num_obs(self): 87 | return self.num_observations 88 | 89 | def get_state(self): 90 | raise NotImplementedError() 91 | 92 | def reset_with_state(self, env_ids=None, force_reset=True): 93 | raise NotImplementedError() -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import os 5 | import yaml 6 | import time 7 | 8 | from gippo import vecenv 9 | from gippo.runner import Runner 10 | 11 | from envs.func_optim.dejong import DejongEnv 12 | from envs.func_optim.ackley import AckleyEnv 13 | 14 | vecenv.register_vecenv_config( 15 | 'BASE', 16 | lambda env_name, 17 | num_actors, 18 | **kwargs: vecenv.BaseVecEnv(env_name, num_actors, **kwargs)) 19 | 20 | vecenv.register_env_config( 21 | 'DejongEnv', 22 | { 23 | 'vecenv_type': 'BASE', 24 | 'env_creator': lambda **kwargs: DejongEnv(**kwargs), 25 | } 26 | ) 27 | vecenv.register_env_config( 28 | 'AckleyEnv', 29 | { 30 | 'vecenv_type': 'BASE', 31 | 'env_creator': lambda **kwargs: AckleyEnv(**kwargs) 32 | } 33 | ) 34 | 35 | def parse_arguments(description="Testing Args", custom_parameters=[]): 36 | parser = argparse.ArgumentParser() 37 | 38 | for argument in custom_parameters: 39 | if ("name" in argument) and ("type" in argument or "action" in argument): 40 | help_str = "" 41 | if "help" in argument: 42 | help_str = argument["help"] 43 | 44 | if "type" in argument: 45 | if "default" in argument: 46 | parser.add_argument(argument["name"], type=argument["type"], default=argument["default"], help=help_str) 47 | else: 48 | print("ERROR: default must be specified if using type") 49 | elif "action" in argument: 50 | parser.add_argument(argument["name"], action=argument["action"], help=help_str) 51 | else: 52 | print() 53 | print("ERROR: command line argument name, type/action must be defined, argument not added to parser") 54 | print("supported keys: name, type, default, action, help") 55 | print() 56 | 57 | args = parser.parse_args() 58 | return args 59 | 60 | def get_args(): 61 | custom_parameters = [ 62 | {"name": "--cfg", "type": str, "default": "./config/func_optim/dejong/lr.yaml", 63 | "help": "Configuration file for training"}, 64 | {"name": "--device", "type": str, "default": "cuda:0", 65 | "help": "Choose CPU or GPU device for inferencing policy network"}, 66 | {"name": "--render", 
"action": "store_true", "default": False, 67 | "help": "whether generate rendering file."}, 68 | {"name": "--logdir", "type": str, "default": "logdir/"}, 69 | {"name": "--seed", "type": int, "default": 1},] 70 | 71 | # parse arguments 72 | args = parse_arguments( 73 | description="Training args", 74 | custom_parameters=custom_parameters) 75 | 76 | return args 77 | 78 | if __name__ == '__main__': 79 | 80 | args = get_args() 81 | vargs = vars(args) 82 | 83 | with open(args.cfg, 'r') as f: 84 | cfg_train = yaml.load(f, Loader=yaml.SafeLoader) 85 | 86 | # save command line args to config; 87 | cfg_train["params"]["command_line_args"] = {} 88 | for key in vargs.keys(): 89 | cfg_train["params"]["command_line_args"][key] = vargs[key] 90 | 91 | # save config; 92 | log_dir = cfg_train["params"]["command_line_args"]["logdir"] 93 | log_dir = log_dir + time.strftime("%Y-%m-%d-%H-%M-%S") 94 | os.makedirs(log_dir, exist_ok = True) 95 | yaml.dump(cfg_train, open(os.path.join(log_dir, 'cfg.yaml'), 'w')) 96 | cfg_train["params"]["log_path"] = log_dir 97 | cfg_train["params"]["device"] = vargs["device"] 98 | cfg_train["params"]["seed"] = vargs["seed"] 99 | 100 | runner = Runner() 101 | runner.load(cfg_train) 102 | runner.run_train(vargs) -------------------------------------------------------------------------------- /src/gippo/rl_algorithm/rp.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import torch.utils as tu 3 | from typing import List 4 | 5 | from gippo.rl_algorithm.base import RLAlgorithm 6 | from gippo.utils import swap_and_flatten01 7 | 8 | class RP(RLAlgorithm): 9 | 10 | def __init__(self, config, env_config, device="cpu", log_path=None): 11 | 12 | super(RP, self).__init__(config, env_config, device, log_path) 13 | 14 | self.actor_lr = float(config["actor_learning_rate"]) 15 | self.actor_optimizer = th.optim.Adam( 16 | self.actor.parameters(), 17 | betas = config['betas'], 18 | lr = self.actor_lr 19 | ) 20 | 21 | def train_actor_critic_no_ppo(self): 22 | 23 | ''' 24 | Set learning rate. 25 | ''' 26 | # set learning rate; 27 | actor_lr = self.actor_lr 28 | critic_lr = self.critic_lr 29 | if self.lr_schedule == 'linear': 30 | actor_lr = (1e-5 - self.actor_lr) * float(self.epoch_num / self.max_epochs) + self.actor_lr 31 | critic_lr = (1e-5 - self.critic_lr) * float(self.epoch_num / self.max_epochs) + self.critic_lr 32 | 33 | for param_group in self.actor_optimizer.param_groups: 34 | param_group['lr'] = actor_lr 35 | for param_group in self.critic_optimizer.param_groups: 36 | param_group['lr'] = critic_lr 37 | 38 | self.writer.add_scalar("info/actor_lr", actor_lr, self.epoch_num) 39 | self.writer.add_scalar("info/critic_lr", critic_lr, self.epoch_num) 40 | 41 | return super().train_actor_critic_no_ppo() 42 | 43 | def use_analytic_grads(self): 44 | 45 | return True 46 | 47 | def use_ppo(self): 48 | 49 | return False 50 | 51 | def get_optimizers_state(self): 52 | state = super().get_optimizers_state() 53 | state['actor'] = self.actor_optimizer.state_dict() 54 | 55 | return state 56 | 57 | def train_actor_no_ppo(self, 58 | grad_start: th.Tensor, 59 | grad_obses: List[th.Tensor], 60 | grad_rp_eps: List[th.Tensor], 61 | grad_actions: List[th.Tensor], 62 | grad_values: List[th.Tensor], 63 | grad_next_values: List[th.Tensor], 64 | grad_rewards: List[th.Tensor], 65 | grad_fdones: List[th.Tensor], 66 | last_fdones: th.Tensor): 67 | ''' 68 | Train actor using Reparameterization-Trick (RP) techinque. 
69 | 70 | Follow variance reduction scheme of SHAC (https://arxiv.org/abs/2204.07137), 71 | such as truncated time horizon. 72 | ''' 73 | 74 | self.actor.train() 75 | 76 | # compute advantages; 77 | curr_grad_advs = self.grad_advantages(self.tau, 78 | grad_values, 79 | grad_next_values, 80 | grad_rewards, 81 | grad_fdones, 82 | last_fdones) 83 | 84 | # add value of the states; 85 | for i in range(len(grad_values)): 86 | curr_grad_advs[i] = curr_grad_advs[i] + grad_values[i] 87 | 88 | # compute loss; 89 | actor_loss: th.Tensor = -self.grad_advantages_first_terms_sum(curr_grad_advs, grad_start) 90 | 91 | # divide by number of trajectories; 92 | actor_loss = actor_loss / th.count_nonzero(grad_start) 93 | 94 | # update actor; 95 | self.actor_optimizer.zero_grad() 96 | actor_loss.backward() 97 | if self.truncate_grads: 98 | th.nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_norm) 99 | self.actor_optimizer.step() -------------------------------------------------------------------------------- /envs/func_optim/dejong.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as th 3 | import os 4 | 5 | import matplotlib.pyplot as plt 6 | 7 | from envs.func_optim.base import FuncOptimEnv 8 | 9 | class DejongEnv(FuncOptimEnv): 10 | 11 | def __init__(self, 12 | num_envs, 13 | dim=1, 14 | seed=0, 15 | no_grad=True, 16 | render=False, 17 | device='cuda:0'): 18 | 19 | super(DejongEnv, self).__init__( 20 | num_envs=num_envs, 21 | dim=dim, 22 | seed=seed, 23 | no_grad=no_grad, 24 | render=render, 25 | device=device) 26 | 27 | self.bound = 5.12 28 | 29 | def preprocess_actions(self, actions: th.Tensor): 30 | actions = super().preprocess_actions(actions) 31 | actions = actions * self.bound 32 | return actions 33 | 34 | def render(self, mode = 'human', actions = None, p_actions = None): 35 | 36 | if self.visualize: 37 | 38 | assert self.dim == 1, "" 39 | 40 | min_action = -self.bound 41 | max_action = self.bound 42 | step = (max_action - min_action) / self.render_resolution 43 | 44 | x = th.arange(min_action, max_action, step).unsqueeze(-1) 45 | y = self.evaluate(x) 46 | 47 | x = x[:, 0].cpu().numpy() 48 | y = y.cpu().numpy() 49 | 50 | f = plt.figure() 51 | f.set_figwidth(6.4 * 2) 52 | f.set_figheight(4.8 * 2) 53 | 54 | plt.plot(x, y, color='blue') 55 | 56 | with th.no_grad(): 57 | 58 | if actions == None: 59 | x = self.actions[:, 0].cpu().numpy() 60 | y = self.rew_buf.cpu().numpy() 61 | elif actions != None: 62 | x = th.clip(actions, -1, 1) * self.bound 63 | y = self.evaluate(x) 64 | 65 | x = x[:, 0].cpu().numpy() 66 | y = y.cpu().numpy() 67 | else: 68 | raise ValueError() 69 | 70 | plt.plot(x, y, 'x', color='black', markersize=5e-0) 71 | 72 | with th.no_grad(): 73 | 74 | if p_actions != None: 75 | x = th.clip(p_actions, -1, 1) * self.bound 76 | y = self.evaluate(x) 77 | 78 | x = x[:, 0].cpu().numpy() 79 | y = y.cpu().numpy() 80 | 81 | plt.plot(x, y, 'o', color='red', markersize=2e-0) 82 | 83 | plt.title("Dejong Function, Step {}".format(self.num_frames)) 84 | plt.xlabel("x") 85 | plt.ylabel("y") 86 | 87 | dir = './outputs/dejong/' 88 | 89 | if not os.path.exists(dir): 90 | os.makedirs(dir) 91 | 92 | plt.savefig("./outputs/dejong/dejong_{}.png".format(self.num_frames)) 93 | 94 | def reset(self, env_ids=None, force_reset=True): 95 | 96 | self.calculateObservations() 97 | 98 | return self.obs_buf 99 | 100 | ''' 101 | cut off the gradient from the current state to previous states 102 | ''' 103 | def clear_grad(self): 104 | 105 | 
pass 106 | 107 | ''' 108 | This function starts collecting a new trajectory from the current states but cut off the computation graph to the previous states. 109 | It has to be called every time the algorithm starts an episode and return the observation vectors 110 | ''' 111 | def initialize_trajectory(self): 112 | self.clear_grad() 113 | self.calculateObservations() 114 | return self.obs_buf 115 | 116 | def calculateObservations(self): 117 | 118 | self.obs_buf = th.zeros_like(self.obs_buf) 119 | 120 | def calculateReward(self): 121 | 122 | self.rew_buf = self.evaluate(self.actions) 123 | 124 | # reset agents 125 | self.reset_buf = th.where(self.progress_buf > self.episode_length - 1, th.ones_like(self.reset_buf), self.reset_buf) 126 | 127 | def evaluate(self, x: th.Tensor): 128 | 129 | y = th.sum(x * x, dim=1) 130 | 131 | return -y -------------------------------------------------------------------------------- /src/gippo/vecenv.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Modified from 3 | https://github.com/Denys88/rl_games/blob/master/rl_games/common/vecenv.py 4 | https://github.com/NVlabs/DiffRL/blob/main/examples/train_rl.py#L52 5 | ''' 6 | vecenv_config = {} # vectorized environment, which usually wraps around 7 | # a single environment and provides parallelized interface; 8 | env_config = {} # single environment config; 9 | 10 | def register_vecenv_config(config_name, func): 11 | vecenv_config[config_name] = func 12 | 13 | def register_env_config(env_name, config): 14 | env_config[env_name] = config 15 | 16 | def create_vecenv(env_name, num_actors, **kwargs): 17 | vecenv_name = env_config[env_name]['vecenv_type'] 18 | return vecenv_config[vecenv_name](env_name, num_actors, **kwargs) 19 | 20 | ''' 21 | Vectorized Environment 22 | ''' 23 | 24 | class IVecEnv: 25 | def step(self, actions): 26 | raise NotImplementedError 27 | 28 | def reset(self): 29 | raise NotImplementedError 30 | 31 | def has_action_masks(self): 32 | return False 33 | 34 | def get_number_of_agents(self): 35 | return 1 36 | 37 | def get_env_info(self): 38 | pass 39 | 40 | def seed(self, seed): 41 | pass 42 | 43 | def set_train_info(self, env_frames, *args, **kwargs): 44 | """ 45 | Send the information in the direction algo->environment. 46 | Most common use case: tell the environment how far along we are in the training process. This is useful 47 | for implementing curriculums and things such as that. 48 | """ 49 | pass 50 | 51 | def get_env_state(self): 52 | """ 53 | Return serializable environment state to be saved to checkpoint. 54 | Can be used for stateful training sessions, i.e. with adaptive curriculums. 
55 | """ 56 | return None 57 | 58 | def set_env_state(self, env_state): 59 | pass 60 | 61 | class BaseVecEnv(IVecEnv): 62 | def __init__(self, env_name, num_actors, **kwargs): 63 | kwargs['num_envs'] = num_actors 64 | self.env = env_config[env_name]['env_creator'](**kwargs) 65 | 66 | self.full_state = {} 67 | self.device = kwargs['device'] 68 | 69 | self.full_state["obs"] = self.env.reset(force_reset=True).to(self.device) 70 | 71 | def step(self, actions): 72 | self.full_state["obs"], reward, is_done, info = self.env.step(actions.to(self.device)) 73 | 74 | return self.full_state["obs"].to(self.device), \ 75 | reward.to(self.device), \ 76 | is_done.to(self.device), \ 77 | info 78 | 79 | def reset(self): 80 | self.full_state["obs"] = self.env.reset(force_reset=True) 81 | 82 | return self.full_state["obs"].to(self.device) 83 | 84 | def get_number_of_agents(self): 85 | return self.env.get_number_of_agents() 86 | 87 | def get_env_info(self): 88 | info = {} 89 | info['action_space'] = self.env.action_space 90 | info['observation_space'] = self.env.observation_space 91 | return info 92 | 93 | class RLGPUEnv(IVecEnv): 94 | def __init__(self, env_name, num_actors, **kwargs): 95 | self.env = env_config[env_name]['env_creator'](**kwargs) 96 | 97 | self.full_state = {} 98 | raise NotImplementedError() 99 | self.rl_device = "cuda:0" 100 | 101 | self.full_state["obs"] = self.env.reset(force_reset=True).to(self.rl_device) 102 | print(self.full_state["obs"].shape) 103 | 104 | def step(self, actions): 105 | self.full_state["obs"], reward, is_done, info = self.env.step(actions.to(self.env.device)) 106 | 107 | return self.full_state["obs"].to(self.rl_device), reward.to(self.rl_device), is_done.to(self.rl_device), info 108 | 109 | def reset(self): 110 | self.full_state["obs"] = self.env.reset(force_reset=True) 111 | 112 | return self.full_state["obs"].to(self.rl_device) 113 | 114 | def get_number_of_agents(self): 115 | return self.env.get_number_of_agents() 116 | 117 | def get_env_info(self): 118 | info = {} 119 | info['action_space'] = self.env.action_space 120 | info['observation_space'] = self.env.observation_space 121 | 122 | print(info['action_space'], info['observation_space']) 123 | 124 | return info -------------------------------------------------------------------------------- /src/gippo/rl_algorithm/lr.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import torch.utils as tu 3 | from typing import List 4 | 5 | from gippo.rl_algorithm.base import RLAlgorithm 6 | from gippo.utils import swap_and_flatten01 7 | 8 | class LR(RLAlgorithm): 9 | 10 | def __init__(self, config, env_config, device="cpu", log_path=None): 11 | 12 | super(LR, self).__init__(config, env_config, device, log_path) 13 | 14 | self.actor_lr = float(config["actor_learning_rate"]) 15 | self.actor_optimizer = th.optim.Adam( 16 | self.actor.parameters(), 17 | betas = config['betas'], 18 | lr = self.actor_lr 19 | ) 20 | 21 | def train_actor_critic_no_ppo(self): 22 | 23 | ''' 24 | Set learning rate. 
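        With the 'linear' schedule below, both the actor and critic learning
        rates are annealed from their configured initial values down to 1e-5
        over [max_epochs]; with 'constant' they are left at their initial
        values. A minimal sketch of the interpolation used below (lr0 is a
        stand-in name for the configured initial rate):

            def linear_lr(lr0: float, epoch: int, max_epochs: int,
                          final_lr: float = 1e-5) -> float:
                # epoch 0 -> lr0, epoch == max_epochs -> final_lr
                return (final_lr - lr0) * (epoch / max_epochs) + lr0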
25 | ''' 26 | # set learning rate; 27 | actor_lr = self.actor_lr 28 | critic_lr = self.critic_lr 29 | if self.lr_schedule == 'linear': 30 | actor_lr = (1e-5 - self.actor_lr) * float(self.epoch_num / self.max_epochs) + self.actor_lr 31 | critic_lr = (1e-5 - self.critic_lr) * float(self.epoch_num / self.max_epochs) + self.critic_lr 32 | 33 | for param_group in self.actor_optimizer.param_groups: 34 | param_group['lr'] = actor_lr 35 | for param_group in self.critic_optimizer.param_groups: 36 | param_group['lr'] = critic_lr 37 | 38 | self.writer.add_scalar("info/actor_lr", actor_lr, self.epoch_num) 39 | self.writer.add_scalar("info/critic_lr", critic_lr, self.epoch_num) 40 | 41 | return super().train_actor_critic_no_ppo() 42 | 43 | def use_analytic_grads(self): 44 | 45 | return False 46 | 47 | def use_ppo(self): 48 | 49 | return False 50 | 51 | def get_optimizers_state(self): 52 | state = super().get_optimizers_state() 53 | state['actor'] = self.actor_optimizer.state_dict() 54 | 55 | return state 56 | 57 | def train_actor_no_ppo(self, 58 | grad_start: th.Tensor, 59 | grad_obses: List[th.Tensor], 60 | grad_rp_eps: List[th.Tensor], 61 | grad_actions: List[th.Tensor], 62 | grad_values: List[th.Tensor], 63 | grad_next_values: List[th.Tensor], 64 | grad_rewards: List[th.Tensor], 65 | grad_fdones: List[th.Tensor], 66 | last_fdones: th.Tensor): 67 | ''' 68 | Train actor using Likelihood-Ratio (LR) techinque. 69 | 70 | There are two additional measures to reduce variance: 71 | 1. Use advantage term instead of total expected return. 72 | (Using total expected return resulted in hopless results in some problems...) 73 | 2. Normalize advantages (if [normalize_advantage] flag is set). 74 | ''' 75 | 76 | self.actor.train() 77 | 78 | with th.no_grad(): 79 | # compute advantages; 80 | curr_grad_advs = self.grad_advantages(self.tau, 81 | grad_values, 82 | grad_next_values, 83 | grad_rewards, 84 | grad_fdones, 85 | last_fdones) 86 | 87 | t_obses = swap_and_flatten01(th.stack(grad_obses, dim=0)) 88 | t_advantages = swap_and_flatten01(th.stack(curr_grad_advs, dim=0)) 89 | t_actions = swap_and_flatten01(th.stack(grad_actions, dim=0)) 90 | 91 | # to reduce variance, we admit normalizing advantages; 92 | if self.normalize_advantage: 93 | t_advantages = (t_advantages - t_advantages.mean()) / (t_advantages.std() + 1e-8) 94 | 95 | _, mu, std, _ = self.actor.forward_with_dist(t_obses) 96 | t_neglogpacs = self.neglogp(t_actions, mu, std, th.log(std)) 97 | 98 | actor_loss = t_advantages * t_neglogpacs.unsqueeze(-1) 99 | 100 | # divide by number of (s, a) pairs; 101 | actor_loss = th.mean(actor_loss) 102 | 103 | self.actor_optimizer.zero_grad() 104 | actor_loss.backward() 105 | if self.truncate_grads: 106 | th.nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_norm) 107 | self.actor_optimizer.step() -------------------------------------------------------------------------------- /envs/func_optim/ackley.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as th 3 | import os 4 | 5 | import matplotlib.pyplot as plt 6 | 7 | from envs.func_optim.base import FuncOptimEnv 8 | 9 | class AckleyEnv(FuncOptimEnv): 10 | 11 | def __init__(self, 12 | num_envs, 13 | dim=1, 14 | seed=0, 15 | no_grad=True, 16 | render=False, 17 | device='cuda:0'): 18 | 19 | super(AckleyEnv, self).__init__( 20 | num_envs=num_envs, 21 | dim=dim, 22 | seed=seed, 23 | no_grad=no_grad, 24 | render=render, 25 | device=device) 26 | 27 | self.a = 20 28 | self.b = 0.2 29 | self.c = 
2.0 * th.pi 30 | self.bound = 32.768 31 | 32 | def preprocess_actions(self, actions: th.Tensor): 33 | actions = super().preprocess_actions(actions) 34 | actions = actions * self.bound 35 | return actions 36 | 37 | def render(self, mode = 'human', actions = None, p_actions = None): 38 | 39 | if self.visualize: 40 | 41 | assert self.dim == 1, "" 42 | 43 | min_action = -self.bound 44 | max_action = self.bound 45 | step = (max_action - min_action) / self.render_resolution 46 | 47 | x = th.arange(min_action, max_action, step).unsqueeze(-1) 48 | y = self.evaluate(x) 49 | 50 | x = x[:, 0].cpu().numpy() 51 | y = y.cpu().numpy() 52 | 53 | f = plt.figure() 54 | f.set_figwidth(6.4 * 2) 55 | f.set_figheight(4.8 * 2) 56 | 57 | plt.plot(x, y, color='blue') 58 | 59 | with th.no_grad(): 60 | 61 | if actions == None: 62 | x = self.actions[:, 0].cpu().numpy() 63 | y = self.rew_buf.cpu().numpy() 64 | elif actions != None: 65 | x = th.clip(actions, -1, 1) * self.bound 66 | y = self.evaluate(x) 67 | 68 | x = x[:, 0].cpu().numpy() 69 | y = y.cpu().numpy() 70 | else: 71 | raise ValueError() 72 | 73 | plt.plot(x, y, 'x', color='black', markersize=5e-0) 74 | 75 | with th.no_grad(): 76 | 77 | if p_actions != None: 78 | x = th.clip(p_actions, -1, 1) * self.bound 79 | y = self.evaluate(x) 80 | 81 | x = x[:, 0].cpu().numpy() 82 | y = y.cpu().numpy() 83 | 84 | plt.plot(x, y, 'o', color='red', markersize=2e-0) 85 | 86 | plt.title("Ackley Function, Step {}".format(self.num_frames)) 87 | plt.xlabel("x") 88 | plt.ylabel("y") 89 | 90 | dir = './outputs/ackley/' 91 | 92 | if not os.path.exists(dir): 93 | os.makedirs(dir) 94 | 95 | plt.savefig("./outputs/ackley/ackley_{}.png".format(self.num_frames)) 96 | 97 | def reset(self, env_ids=None, force_reset=True): 98 | 99 | self.calculateObservations() 100 | 101 | return self.obs_buf 102 | 103 | ''' 104 | cut off the gradient from the current state to previous states 105 | ''' 106 | def clear_grad(self): 107 | 108 | pass 109 | 110 | ''' 111 | This function starts collecting a new trajectory from the current states but cut off the computation graph to the previous states. 
112 | It has to be called every time the algorithm starts an episode and return the observation vectors 113 | ''' 114 | def initialize_trajectory(self): 115 | self.clear_grad() 116 | self.calculateObservations() 117 | return self.obs_buf 118 | 119 | def calculateObservations(self): 120 | 121 | self.obs_buf = th.zeros_like(self.obs_buf) 122 | 123 | def calculateReward(self): 124 | 125 | self.rew_buf = self.evaluate(self.actions) 126 | 127 | # reset agents 128 | self.reset_buf = th.where(self.progress_buf > self.episode_length - 1, th.ones_like(self.reset_buf), self.reset_buf) 129 | 130 | def evaluate(self, x: th.Tensor): 131 | 132 | t0 = th.zeros((len(x),), device=x.device, dtype=x.dtype) 133 | t1 = th.zeros((len(x),), device=x.device, dtype=x.dtype) 134 | one = th.ones((len(x),), device=x.device, dtype=x.dtype) 135 | 136 | for i in range(self.dim): 137 | 138 | xi = x[:, i] 139 | t0 = t0 + th.pow(xi, 2.0) 140 | t1 = t1 + th.cos(self.c * xi) 141 | 142 | t0 = t0 / self.dim 143 | t1 = t1 / self.dim 144 | 145 | y = -self.a * th.exp(-self.b * th.sqrt(t0)) - th.exp(t1) + self.a + th.exp(one) 146 | 147 | return -y -------------------------------------------------------------------------------- /src/gippo/network.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Modified from 3 | https://github.com/NVlabs/DiffRL/blob/main/models/actor.py 4 | https://github.com/NVlabs/DiffRL/blob/main/models/critic.py 5 | https://github.com/NVlabs/DiffRL/blob/main/models/model_utils.py 6 | ''' 7 | import numpy as np 8 | import torch as th 9 | from torch import nn 10 | 11 | from gippo.utils import Normal 12 | 13 | ''' 14 | Initialize the parameters of module using the given weight and bias initialization functions. 15 | ''' 16 | def init(module, weight_init, bias_init, gain=1): 17 | weight_init(module.weight.data) #, gain=gain) 18 | bias_init(module.bias.data) 19 | return module 20 | 21 | def get_activation_func(activation_name): 22 | if activation_name.lower() == 'tanh': 23 | return nn.Tanh() 24 | elif activation_name.lower() == 'relu': 25 | return nn.ReLU() 26 | elif activation_name.lower() == 'elu': 27 | return nn.ELU() 28 | elif activation_name.lower() == 'identity': 29 | return nn.Identity() 30 | else: 31 | raise NotImplementedError('Activation func {} not defined'.format(activation_name)) 32 | 33 | ''' 34 | Actor 35 | ''' 36 | class ActorStochasticMLP(nn.Module): 37 | def __init__(self, 38 | obs_dim, 39 | action_dim, 40 | cfg_network, 41 | device='cuda:0'): 42 | super(ActorStochasticMLP, self).__init__() 43 | 44 | self.device = device 45 | self.layer_dims = [obs_dim] + cfg_network['actor_mlp']['units'] 46 | 47 | modules = [] 48 | for i in range(len(self.layer_dims) - 1): 49 | modules.append(nn.Linear(self.layer_dims[i], self.layer_dims[i + 1])) 50 | modules.append(get_activation_func(cfg_network['actor_mlp']['activation'])) 51 | modules.append(th.nn.LayerNorm(self.layer_dims[i + 1])) 52 | self.actor_mlp = nn.Sequential(*modules).to(device) 53 | 54 | # mu; 55 | out_size = self.layer_dims[-1] 56 | self.mu = [nn.Linear(out_size, action_dim), get_activation_func('identity')] 57 | self.mu = nn.Sequential(*self.mu).to(device) 58 | 59 | # logstd; 60 | self.fixed_sigma = cfg_network['fixed_sigma'] 61 | if cfg_network['fixed_sigma']: 62 | logstd = cfg_network.get('actor_logstd_init', -1.0) 63 | self.logstd = nn.Parameter(th.ones(action_dim, dtype=th.float32, device=device) * logstd) 64 | else: 65 | self.logstd = nn.Linear(out_size, action_dim).to(device) 66 | 67 | 
self.action_dim = action_dim 68 | self.obs_dim = obs_dim 69 | 70 | # print(self.actor_mlp) 71 | # print(self.mu) 72 | # print(self.logstd) 73 | 74 | def forward(self, obs, deterministic = False): 75 | out = self.actor_mlp(obs) 76 | mu = self.mu(out) 77 | 78 | if deterministic: 79 | return mu 80 | else: 81 | if self.fixed_sigma: 82 | std = self.logstd.exp() # (num_actions) 83 | else: 84 | std = th.exp(self.logstd(out)) 85 | dist = Normal(mu, std) 86 | sample = dist.rsample() 87 | return sample 88 | 89 | def forward_with_dist(self, obs, deterministic = False): 90 | mu, std = self.forward_dist(obs) 91 | 92 | dist = Normal(mu, std) 93 | eps = dist.sample_eps() 94 | 95 | if deterministic: 96 | eps = eps.zero_() 97 | sample = dist.eps_to_action(eps) 98 | 99 | return sample, mu, std, eps 100 | 101 | def evaluate_actions_log_probs(self, obs, actions): 102 | mu, std = self.forward_dist(obs) 103 | dist = Normal(mu, std) 104 | return dist.log_prob(actions) 105 | 106 | def forward_dist(self, obs): 107 | out = self.actor_mlp(obs) 108 | mu = self.mu(out) 109 | if self.fixed_sigma: 110 | std = self.logstd.exp() # (num_actions) 111 | else: 112 | std = th.exp(self.logstd(out)) 113 | 114 | return mu, std 115 | 116 | ''' 117 | Critic 118 | ''' 119 | class CriticMLP(nn.Module): 120 | def __init__(self, obs_dim, cfg_network, device='cuda:0'): 121 | super(CriticMLP, self).__init__() 122 | 123 | self.device = device 124 | 125 | self.layer_dims = [obs_dim] + cfg_network['critic_mlp']['units'] + [1] 126 | 127 | init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init. 128 | constant_(x, 0), np.sqrt(2)) 129 | 130 | modules = [] 131 | for i in range(len(self.layer_dims) - 1): 132 | modules.append(init_(nn.Linear(self.layer_dims[i], self.layer_dims[i + 1]))) 133 | if i < len(self.layer_dims) - 2: 134 | modules.append(get_activation_func(cfg_network['critic_mlp']['activation'])) 135 | modules.append(nn.LayerNorm(self.layer_dims[i + 1])) 136 | 137 | self.critic = nn.Sequential(*modules).to(device) 138 | 139 | self.obs_dim = obs_dim 140 | 141 | # print(self.critic) 142 | 143 | def forward(self, observations): 144 | return self.critic(observations) 145 | -------------------------------------------------------------------------------- /src/gippo/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as th 3 | import random 4 | import os 5 | 6 | ''' 7 | From 8 | https://github.com/NVlabs/DiffRL/blob/a4c0dd1696d3c3b885ce85a3cb64370b580cb913/utils/common.py#L72 9 | ''' 10 | def seeding(seed=0, torch_deterministic=False): 11 | print("Setting seed: {}".format(seed)) 12 | 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | th.manual_seed(seed) 16 | os.environ['PYTHONHASHSEED'] = str(seed) 17 | th.cuda.manual_seed(seed) 18 | th.cuda.manual_seed_all(seed) 19 | 20 | if torch_deterministic: 21 | # refer to https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility 22 | os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' 23 | th.backends.cudnn.benchmark = False 24 | th.backends.cudnn.deterministic = True 25 | th.use_deterministic_algorithms(True) 26 | else: 27 | th.backends.cudnn.benchmark = True 28 | th.backends.cudnn.deterministic = False 29 | 30 | return seed 31 | 32 | from torch.distributions.utils import _standard_normal 33 | class Normal(th.distributions.Normal): 34 | 35 | def __init__(self, loc, scale, validate_args=None): 36 | super().__init__(loc, scale, validate_args) 37 | 38 | def sample_eps(self, sample_shape=th.Size()): 39 
| shape = self._extended_shape(sample_shape) 40 | eps = _standard_normal(shape, dtype=self.loc.dtype, device=self.loc.device) 41 | return eps 42 | 43 | def eps_to_action(self, eps): 44 | return self.loc + eps * self.scale 45 | 46 | ''' 47 | From 48 | https://github.com/NVlabs/DiffRL/blob/main/utils/running_mean_std.py 49 | ''' 50 | from typing import Tuple 51 | class RunningMeanStd(object): 52 | def __init__(self, epsilon: float = 1e-4, shape: Tuple[int, ...] = (), device = 'cuda:0'): 53 | """ 54 | Calulates the running mean and std of a data stream 55 | https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 56 | :param epsilon: helps with arithmetic issues 57 | :param shape: the shape of the data stream's output 58 | """ 59 | self.mean = th.zeros(shape, dtype = th.float32, device = device) 60 | self.var = th.ones(shape, dtype = th.float32, device = device) 61 | self.count = epsilon 62 | 63 | def to(self, device): 64 | rms = RunningMeanStd(device = device) 65 | rms.mean = self.mean.to(device).clone() 66 | rms.var = self.var.to(device).clone() 67 | rms.count = self.count 68 | return rms 69 | 70 | @th.no_grad() 71 | def update(self, arr: th.tensor) -> None: 72 | batch_mean = th.mean(arr, dim = 0) 73 | batch_var = th.var(arr, dim = 0, unbiased = False) 74 | batch_count = arr.shape[0] 75 | self.update_from_moments(batch_mean, batch_var, batch_count) 76 | 77 | def update_from_moments(self, batch_mean: th.tensor, batch_var: th.tensor, batch_count: int) -> None: 78 | delta = batch_mean - self.mean 79 | tot_count = self.count + batch_count 80 | 81 | new_mean = self.mean + delta * batch_count / tot_count 82 | m_a = self.var * self.count 83 | m_b = batch_var * batch_count 84 | m_2 = m_a + m_b + th.square(delta) * self.count * batch_count / (self.count + batch_count) 85 | new_var = m_2 / (self.count + batch_count) 86 | 87 | new_count = batch_count + self.count 88 | 89 | self.mean = new_mean 90 | self.var = new_var 91 | self.count = new_count 92 | 93 | def normalize(self, arr:th.tensor, un_norm = False) -> th.tensor: 94 | if not un_norm: 95 | result = (arr - self.mean) / th.sqrt(self.var + 1e-5) 96 | else: 97 | result = arr * th.sqrt(self.var + 1e-5) + self.mean 98 | return result 99 | 100 | ''' 101 | From 102 | https://github.com/SonSang/DiffRL/blob/stable/externals/rl_games/rl_games/algos_torch/torch_ext.py#L275 103 | ''' 104 | class AverageMeter(th.nn.Module): 105 | def __init__(self, in_shape, max_size): 106 | super(AverageMeter, self).__init__() 107 | self.max_size = max_size 108 | self.current_size = 0 109 | self.register_buffer("mean", th.zeros(in_shape, dtype = th.float32)) 110 | 111 | def update(self, values): 112 | size = values.size()[0] 113 | if size == 0: 114 | return 115 | new_mean = th.mean(values.float(), dim=0) 116 | size = np.clip(size, 0, self.max_size) 117 | old_size = min(self.max_size - size, self.current_size) 118 | size_sum = old_size + size 119 | self.current_size = size_sum 120 | self.mean = (self.mean * old_size + new_mean * size) / size_sum 121 | 122 | def clear(self): 123 | self.current_size = 0 124 | self.mean.fill_(0) 125 | 126 | def __len__(self): 127 | return self.current_size 128 | 129 | def get_mean(self): 130 | return self.mean.squeeze(0).cpu().numpy() 131 | 132 | ''' 133 | From 134 | https://github.com/Denys88/rl_games/blob/master/rl_games/common/a2c_common.py#L30 135 | ''' 136 | def swap_and_flatten01(arr): 137 | """ 138 | swap and then flatten axes 0 and 1 139 | """ 140 | if arr is None: 141 | return arr 142 | s = arr.size() 143 | 
return arr.transpose(0, 1).reshape(s[0] * s[1], *s[2:]) 144 | 145 | ''' 146 | From 147 | https://github.com/Denys88/rl_games/blob/master/rl_games/algos_torch/torch_ext.py#L10 148 | ''' 149 | numpy_to_torch_dtype_dict = { 150 | np.dtype('bool') : th.bool, 151 | np.dtype('uint8') : th.uint8, 152 | np.dtype('int8') : th.int8, 153 | np.dtype('int16') : th.int16, 154 | np.dtype('int32') : th.int32, 155 | np.dtype('int64') : th.int64, 156 | np.dtype('float16') : th.float16, 157 | np.dtype('float32') : th.float32, 158 | np.dtype('float64') : th.float64, 159 | np.dtype('complex64') : th.complex64, 160 | np.dtype('complex128') : th.complex128, 161 | } -------------------------------------------------------------------------------- /src/gippo/experience.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Modified from 3 | https://github.com/Denys88/rl_games/blob/master/rl_games/common/experience.py#L285 4 | ''' 5 | 6 | import numpy as np 7 | import torch as th 8 | import gym 9 | 10 | from gippo.utils import numpy_to_torch_dtype_dict 11 | 12 | class ExperienceBuffer: 13 | def __init__(self, env_info, algo_info, device, aux_tensor_dict=None): 14 | self.env_info = env_info 15 | self.algo_info = algo_info 16 | self.device = device 17 | 18 | self.num_agents = env_info.get('agents', 1) 19 | self.action_space = env_info['action_space'] 20 | 21 | self.num_actors = algo_info['num_actors'] 22 | self.horizon_length = algo_info['horizon_length'] 23 | batch_size = self.num_actors * self.num_agents 24 | self.obs_base_shape = (self.horizon_length, self.num_agents * self.num_actors) 25 | self.state_base_shape = (self.horizon_length, self.num_actors) 26 | if type(self.action_space) is gym.spaces.Discrete: 27 | raise ValueError() 28 | if type(self.action_space) is gym.spaces.Tuple: 29 | raise ValueError() 30 | if type(self.action_space) is gym.spaces.Box: 31 | self.actions_shape = (self.action_space.shape[0],) 32 | self.actions_num = self.action_space.shape[0] 33 | self.is_continuous = True 34 | self.tensor_dict = {} 35 | self._init_from_env_info(self.env_info) 36 | 37 | self.aux_tensor_dict = aux_tensor_dict 38 | if self.aux_tensor_dict is not None: 39 | self._init_from_aux_dict(self.aux_tensor_dict) 40 | 41 | def _init_from_env_info(self, env_info): 42 | obs_base_shape = self.obs_base_shape 43 | state_base_shape = self.state_base_shape 44 | 45 | self.tensor_dict['obses'] = self._create_tensor_from_space(env_info['observation_space'], obs_base_shape) 46 | 47 | val_space = gym.spaces.Box(low=0, high=1,shape=(env_info.get('value_size',1),)) 48 | self.tensor_dict['rewards'] = self._create_tensor_from_space(val_space, obs_base_shape) 49 | self.tensor_dict['values'] = self._create_tensor_from_space(val_space, obs_base_shape) 50 | self.tensor_dict['neglogpacs'] = self._create_tensor_from_space(gym.spaces.Box(low=0, high=1,shape=(), dtype=np.float32), obs_base_shape) 51 | self.tensor_dict['dones'] = self._create_tensor_from_space(gym.spaces.Box(low=0, high=1,shape=(), dtype=np.uint8), obs_base_shape) 52 | 53 | assert self.is_continuous, "Only continuous action space is supported" 54 | self.tensor_dict['actions'] = self._create_tensor_from_space(gym.spaces.Box(low=0, high=1,shape=self.actions_shape, dtype=np.float32), obs_base_shape) 55 | self.tensor_dict['mus'] = self._create_tensor_from_space(gym.spaces.Box(low=0, high=1,shape=self.actions_shape, dtype=np.float32), obs_base_shape) 56 | self.tensor_dict['sigmas'] = self._create_tensor_from_space(gym.spaces.Box(low=0, 
high=1,shape=self.actions_shape, dtype=np.float32), obs_base_shape) 57 | 58 | ''' 59 | Gradient info 60 | ''' 61 | # store first and second order analytical gradients of advantage w.r.t. actions; 62 | 63 | base_shape = self.obs_base_shape 64 | action_shape = self.actions_shape 65 | dtype = th.float32 66 | device = self.device 67 | 68 | self.tensor_dict['adv_gradient'] = th.zeros(base_shape + action_shape, dtype=dtype, device=device) 69 | self.tensor_dict['adv_hessian'] = th.zeros(base_shape + action_shape + action_shape, dtype=dtype, device=device) 70 | 71 | def _init_from_aux_dict(self, tensor_dict): 72 | obs_base_shape = self.obs_base_shape 73 | for k,v in tensor_dict.items(): 74 | self.tensor_dict[k] = self._create_tensor_from_space(gym.spaces.Box(low=0, high=1,shape=(v), dtype=np.float32), obs_base_shape) 75 | 76 | def _create_tensor_from_space(self, space, base_shape): 77 | if type(space) is gym.spaces.Box: 78 | dtype = numpy_to_torch_dtype_dict[space.dtype] 79 | return th.zeros(base_shape + space.shape, dtype= dtype, device = self.device) 80 | if type(space) is gym.spaces.Discrete: 81 | dtype = numpy_to_torch_dtype_dict[space.dtype] 82 | return th.zeros(base_shape, dtype= dtype, device = self.device) 83 | if type(space) is gym.spaces.Tuple: 84 | ''' 85 | assuming that tuple is only Discrete tuple 86 | ''' 87 | dtype = numpy_to_torch_dtype_dict[space.dtype] 88 | tuple_len = len(space) 89 | return th.zeros(base_shape +(tuple_len,), dtype= dtype, device = self.device) 90 | if type(space) is gym.spaces.Dict: 91 | t_dict = {} 92 | for k,v in space.spaces.items(): 93 | t_dict[k] = self._create_tensor_from_space(v, base_shape) 94 | return t_dict 95 | 96 | def update_data(self, name, index, val): 97 | if type(val) is dict: 98 | for k,v in val.items(): 99 | self.tensor_dict[name][k][index,:] = v 100 | else: 101 | self.tensor_dict[name][index,:] = val 102 | 103 | def get_transformed(self, transform_op): 104 | res_dict = {} 105 | for k, v in self.tensor_dict.items(): 106 | if type(v) is dict: 107 | transformed_dict = {} 108 | for kd,vd in v.items(): 109 | transformed_dict[kd] = transform_op(vd) 110 | res_dict[k] = transformed_dict 111 | else: 112 | res_dict[k] = transform_op(v) 113 | 114 | return res_dict 115 | 116 | def get_transformed_list(self, transform_op, tensor_list): 117 | res_dict = {} 118 | for k in tensor_list: 119 | v = self.tensor_dict.get(k) 120 | if v is None: 121 | continue 122 | if type(v) is dict: 123 | transformed_dict = {} 124 | for kd,vd in v.items(): 125 | transformed_dict[kd] = transform_op(vd) 126 | res_dict[k] = transformed_dict 127 | else: 128 | res_dict[k] = transform_op(v) 129 | 130 | return res_dict -------------------------------------------------------------------------------- /src/gippo/rl_algorithm/lrp.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import numpy as np 3 | import torch.utils as tu 4 | from typing import List 5 | 6 | from gippo.rl_algorithm.base import RLAlgorithm 7 | from gippo.utils import swap_and_flatten01 8 | 9 | class LRP(RLAlgorithm): 10 | 11 | def __init__(self, config, env_config, device="cpu", log_path=None): 12 | 13 | super(LRP, self).__init__(config, env_config, device, log_path) 14 | 15 | self.actor_lr = float(config["actor_learning_rate"]) 16 | self.actor_optimizer = th.optim.Adam( 17 | self.actor.parameters(), 18 | betas = config['betas'], 19 | lr = self.actor_lr 20 | ) 21 | 22 | ''' 23 | Parameters for sample variance estimation of policy gradients. 
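        Later in this file, the variance of each gradient estimator is
        measured as the trace of the sample covariance of per-sample parameter
        gradients, each flattened and truncated to the first
        [var_est_max_grad_len] entries, using at most [var_est_num_sample]
        samples. A minimal sketch of that statistic, assuming grads is a
        (num_samples, grad_len) tensor of per-sample gradients:

            import torch as th

            def grad_variance(grads: th.Tensor) -> th.Tensor:
                cov = th.cov(grads.transpose(0, 1))  # (grad_len, grad_len) sample covariance
                if cov.ndim == 0:                    # handle the degenerate 0-dim case, as in the code below
                    cov = cov.reshape(1, 1)
                return cov.diagonal(0).sum()         # trace = summed per-coordinate variance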
24 | ''' 25 | # [var_est_num_sample]: Number of samples to use for sample variance estimation; 26 | self.var_est_num_sample = config.get("var_est_num_sample", 16) 27 | 28 | # [var_est_max_grad_len]: Length of the first N values (in policy gradients) to use 29 | # for sample variance estimation; 30 | self.var_est_max_grad_len = config.get("var_est_max_grad_len", 512) 31 | 32 | def train_actor_critic_no_ppo(self): 33 | 34 | ''' 35 | Set learning rate. 36 | ''' 37 | # set learning rate; 38 | actor_lr = self.actor_lr 39 | critic_lr = self.critic_lr 40 | if self.lr_schedule == 'linear': 41 | actor_lr = (1e-5 - self.actor_lr) * float(self.epoch_num / self.max_epochs) + self.actor_lr 42 | critic_lr = (1e-5 - self.critic_lr) * float(self.epoch_num / self.max_epochs) + self.critic_lr 43 | 44 | for param_group in self.actor_optimizer.param_groups: 45 | param_group['lr'] = actor_lr 46 | for param_group in self.critic_optimizer.param_groups: 47 | param_group['lr'] = critic_lr 48 | 49 | self.writer.add_scalar("info/actor_lr", actor_lr, self.epoch_num) 50 | self.writer.add_scalar("info/critic_lr", critic_lr, self.epoch_num) 51 | 52 | return super().train_actor_critic_no_ppo() 53 | 54 | def use_analytic_grads(self): 55 | 56 | return True 57 | 58 | def use_ppo(self): 59 | 60 | return False 61 | 62 | def get_optimizers_state(self): 63 | state = super().get_optimizers_state() 64 | state['actor'] = self.actor_optimizer.state_dict() 65 | 66 | return state 67 | 68 | def train_actor_no_ppo(self, 69 | grad_start: th.Tensor, 70 | grad_obses: List[th.Tensor], 71 | grad_rp_eps: List[th.Tensor], 72 | grad_actions: List[th.Tensor], 73 | grad_values: List[th.Tensor], 74 | grad_next_values: List[th.Tensor], 75 | grad_rewards: List[th.Tensor], 76 | grad_fdones: List[th.Tensor], 77 | last_fdones: th.Tensor): 78 | ''' 79 | Combine policy gradients obtained through LR and RP techinques. 80 | 81 | Use sample variance of the policy gradients to combine them. 82 | ''' 83 | 84 | self.actor.train() 85 | 86 | lr_gradient_var = None 87 | rp_gradient_var = None 88 | 89 | ''' 90 | Preliminaries 91 | ''' 92 | # compute advantages; 93 | curr_grad_advs = self.grad_advantages(self.tau, 94 | grad_values, 95 | grad_next_values, 96 | grad_rewards, 97 | grad_fdones, 98 | last_fdones) 99 | 100 | ''' 101 | Estimate LR gradients and their variances. 
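        Each sampled (s, a) pair below contributes one surrogate loss
        A(s, a) * (-log pi(a | s)); backpropagating a single such loss gives
        one sample of the LR (REINFORCE) policy gradient. To bound the cost,
        only a random subset of [var_est_num_sample] pairs is differentiated,
        each gradient is flattened and truncated to [var_est_max_grad_len]
        entries, and the trace of the sample covariance of those vectors
        serves as the LR variance estimate. A small sketch of the flattening
        step, with params standing in for the actor parameters right after a
        per-sample backward pass:

            import torch as th

            def flat_grad(params, max_len: int = 512) -> th.Tensor:
                # concatenate parameter gradients into one vector, then truncate
                flat = th.cat([p.grad.reshape(-1) for p in params])
                return flat[:max_len]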
102 | ''' 103 | 104 | with th.no_grad(): 105 | t_obses = swap_and_flatten01(th.stack(grad_obses, dim=0)) 106 | t_advantages = swap_and_flatten01(th.stack(curr_grad_advs, dim=0)) 107 | t_actions = swap_and_flatten01(th.stack(grad_actions, dim=0)) 108 | 109 | # to reduce variance, we admit normalizing advantages; 110 | if self.normalize_advantage: 111 | t_advantages = (t_advantages - t_advantages.mean()) / (t_advantages.std() + 1e-8) 112 | 113 | _, mu, std, _ = self.actor.forward_with_dist(t_obses) 114 | t_neglogpacs = self.neglogp(t_actions, mu, std, th.log(std)) 115 | 116 | actor_loss = t_advantages * t_neglogpacs.unsqueeze(-1) 117 | 118 | # randomly select subset to compute sample variance; 119 | sample_num = np.min([self.var_est_num_sample, len(actor_loss)]) #if len(actor_loss) > 64 else len(actor_loss) 120 | actor_loss_num = len(actor_loss) 121 | actor_loss_indices = th.randperm(actor_loss_num)[:sample_num] 122 | lr_gradients = [] 123 | for ai in actor_loss_indices: 124 | al = actor_loss[ai].sum() 125 | 126 | self.actor_optimizer.zero_grad() 127 | al.backward(retain_graph=True) 128 | assert len(self.actor_optimizer.param_groups) == 1, "" 129 | grad_list = [] 130 | for param in self.actor_optimizer.param_groups[0]['params']: 131 | grad_list.append(param.grad.reshape([-1])) 132 | grad = th.cat(grad_list) 133 | 134 | # if length of the gradient is too long, we truncate it 135 | # because it is too time consuming to use all of the gradients; 136 | if len(grad) > self.var_est_max_grad_len: 137 | grad = grad[:self.var_est_max_grad_len] 138 | lr_gradients.append(grad) 139 | 140 | lr_gradients = th.stack(lr_gradients, dim=0) 141 | 142 | lr_gradient_cov = th.cov(lr_gradients.transpose(0, 1)) 143 | if lr_gradient_cov.ndim == 0: 144 | lr_gradient_cov = lr_gradient_cov.unsqueeze(0).unsqueeze(0) 145 | lr_gradient_var = lr_gradient_cov.diagonal(0).sum() 146 | 147 | ''' 148 | Estimate RP gradients and their variances. 149 | ''' 150 | 151 | # add value of the states; 152 | for i in range(len(grad_values)): 153 | curr_grad_advs[i] = curr_grad_advs[i] + grad_values[i] 154 | 155 | rp_gradients = [] 156 | for i in range(grad_start.shape[0]): 157 | for j in range(grad_start.shape[1]): 158 | if not grad_start[i, j]: 159 | continue 160 | 161 | al: th.Tensor = -curr_grad_advs[i][j].sum() 162 | 163 | self.actor_optimizer.zero_grad() 164 | al.backward(retain_graph=True) 165 | assert len(self.actor_optimizer.param_groups) == 1, "" 166 | grad_list = [] 167 | for param in self.actor_optimizer.param_groups[0]['params']: 168 | grad_list.append(param.grad.reshape([-1])) 169 | grad = th.cat(grad_list) 170 | 171 | # if length of the gradient is too long, we truncate it 172 | # because it is too time consuming to use all of the gradients; 173 | if len(grad) > self.var_est_max_grad_len: 174 | grad = grad[:self.var_est_max_grad_len] 175 | rp_gradients.append(grad) 176 | 177 | if len(rp_gradients) >= self.var_est_num_sample: 178 | break 179 | 180 | if len(rp_gradients) >= self.var_est_num_sample: 181 | break 182 | 183 | rp_gradients = th.stack(rp_gradients, dim=0) 184 | 185 | rp_gradient_cov = th.cov(rp_gradients.transpose(0, 1)) 186 | if rp_gradient_cov.ndim == 0: 187 | rp_gradient_cov = rp_gradient_cov.unsqueeze(0).unsqueeze(0) 188 | rp_gradient_var = rp_gradient_cov.diagonal(0).sum() 189 | 190 | ''' 191 | Interpolate LR and RP gradients using sample variances. 192 | ''' 193 | k_lr = (rp_gradient_var) / (lr_gradient_var + rp_gradient_var + 1e-8) 194 | k_rp = 1. 
- k_lr 195 | 196 | # self.writer.add_scalar("info/basic_k_lr", k_lr, self.epoch_num) 197 | 198 | lr_actor_loss = t_advantages * t_neglogpacs.unsqueeze(-1) 199 | lr_actor_loss = th.mean(lr_actor_loss) 200 | 201 | rp_actor_loss = -self.grad_advantages_first_terms_sum(curr_grad_advs, grad_start) 202 | rp_actor_loss = rp_actor_loss / th.count_nonzero(grad_start) 203 | 204 | actor_loss = (lr_actor_loss * k_lr) + (rp_actor_loss * k_rp) 205 | 206 | # update actor; 207 | self.actor_optimizer.zero_grad() 208 | actor_loss.backward() 209 | if self.truncate_grads: 210 | th.nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_norm) 211 | self.actor_optimizer.step() -------------------------------------------------------------------------------- /src/gippo/rl_algorithm/ppo.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | from typing import List 3 | 4 | from gippo.rl_algorithm.base import RLAlgorithm 5 | from gippo.dataset import PPODataset 6 | from copy import deepcopy 7 | 8 | class PPO(RLAlgorithm): 9 | 10 | def __init__(self, config, env_config, device="cpu", log_path=None): 11 | 12 | super(PPO, self).__init__(config, env_config, device, log_path) 13 | 14 | self.actor_lr = float(config["actor_learning_rate"]) 15 | self.actor_optimizer = th.optim.Adam( 16 | self.actor.parameters(), 17 | lr = self.actor_lr, 18 | eps = 1e-8, 19 | ) 20 | 21 | ppo_config = config.get("ppo", {}) 22 | 23 | # clipping parameter for PPO updates; 24 | self.e_clip = float(ppo_config.get("e_clip", 0.2)) 25 | 26 | # minibatch settings for PPO updates; 27 | self.mini_epochs = int(ppo_config.get("mini_epochs", 5)) 28 | self.minibatch_size = int(ppo_config.get("minibatch_size", 29 | self.horizon_length * self.num_actors)) 30 | self.dataset = PPODataset(self.batch_size, 31 | self.minibatch_size, 32 | device) 33 | 34 | ''' 35 | Measures to prevent false optimization. 36 | 37 | Theoretically, we optimize surrogate loss function for learning 38 | better policy. However, if learning rate is too large, the optimization 39 | result could be worse than the previous one. If such case is detected, 40 | we decrease the learning rate and try again. [max_optim_iter] denotes 41 | the maximum number of such cycles. 
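        In outline (see train_actor_ppo below): the current actor is backed
        up, the PPO mini-epochs are run, and the mean surrogate loss over the
        last mini-epoch is compared against that of the first. If the loss
        increased, the backup is restored, the learning rate is divided by
        [learning_rate_multiplier], and the cycle is retried, up to
        [max_optim_iter] times. A minimal sketch of the accept/reject test,
        assuming losses is the flat list of per-minibatch losses and n is the
        number of minibatches per mini-epoch:

            import torch as th

            def update_improved(losses, n: int) -> bool:
                first = th.stack(losses[:n]).mean()   # mean loss of first mini-epoch
                last = th.stack(losses[-n:]).mean()   # mean loss of last mini-epoch
                worse = bool(last > first)            # loss increased: roll back
                return not worse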
42 | ''' 43 | # use backup actor to restore the previous policy when optimization fails; 44 | self.b_actor = deepcopy(self.actor) 45 | 46 | # maximum number of iterations for actor optimization; 47 | self.max_optim_iter = int(ppo_config.get("max_optim_iter", 8)) 48 | 49 | # multiplier to decrease learning rate; 50 | self.learning_rate_multiplier = float(ppo_config.get("learning_rate_multiplier", 1.5)) 51 | 52 | def train_actor_critic_no_ppo(self): 53 | 54 | return super().train_actor_critic_no_ppo() 55 | 56 | def use_analytic_grads(self): 57 | 58 | return False 59 | 60 | def use_ppo(self): 61 | 62 | return True 63 | 64 | def get_optimizers_state(self): 65 | state = super().get_optimizers_state() 66 | state['actor'] = self.actor_optimizer.state_dict() 67 | 68 | return state 69 | 70 | def train_actor_no_ppo(self, 71 | grad_start: th.Tensor, 72 | grad_obses: List[th.Tensor], 73 | grad_rp_eps: List[th.Tensor], 74 | grad_actions: List[th.Tensor], 75 | grad_values: List[th.Tensor], 76 | grad_next_values: List[th.Tensor], 77 | grad_rewards: List[th.Tensor], 78 | grad_fdones: List[th.Tensor], 79 | last_fdones: th.Tensor): 80 | 81 | pass 82 | 83 | def train_actor_ppo(self, batch_dict): 84 | 85 | self.prepare_dataset(batch_dict) 86 | 87 | # backup actor and optimizer to prevent policy degradation; 88 | self.backup_actor() 89 | 90 | initial_actor_lr = self.actor_lr 91 | 92 | for iter in range(self.max_optim_iter): 93 | 94 | a_losses = [] 95 | 96 | for _ in range(0, self.mini_epochs): 97 | 98 | for i in range(len(self.dataset)): 99 | 100 | a_loss, cmu, csigma = self.calc_gradients(self.dataset[i]) 101 | a_losses.append(a_loss) 102 | self.dataset.update_mu_sigma(cmu, csigma) 103 | 104 | # this is erroneous code in original implementation, 105 | # put here for fair reproducibility; 106 | for param in self.actor_optimizer.param_groups: 107 | param['lr'] = self.actor_lr 108 | 109 | first_mini_epoch_loss = th.stack(a_losses[:len(self.dataset)]).mean() 110 | last_mini_epoch_loss = th.stack(a_losses[-len(self.dataset):]).mean() 111 | 112 | if last_mini_epoch_loss > first_mini_epoch_loss: 113 | 114 | with th.no_grad(): 115 | 116 | # optimization failed, restore the previous policy; 117 | self.restore_actor() 118 | 119 | # decrease learning rate; 120 | # @TODO: this is also an error in original implementation, 121 | # put here for fair reproducibility; 122 | for param in self.actor_optimizer.param_groups: 123 | param['lr'] = initial_actor_lr / self.learning_rate_multiplier 124 | self.actor_lr = initial_actor_lr / self.learning_rate_multiplier 125 | else: 126 | # @TODO: this is also an error in original implementation, 127 | # put here for fair reproducibility; 128 | self.actor_lr = initial_actor_lr 129 | break 130 | 131 | self.writer.add_scalar("info/actor_lr", self.actor_lr, self.epoch_num) 132 | 133 | return a_losses 134 | 135 | def prepare_dataset(self, batch_dict): 136 | 137 | obses = batch_dict['obses'] 138 | advantages = batch_dict['advantages'] 139 | dones = batch_dict['dones'] 140 | values = batch_dict['values'] 141 | actions = batch_dict['actions'] 142 | neglogpacs = batch_dict['neglogpacs'] 143 | mus = batch_dict['mus'] 144 | sigmas = batch_dict['sigmas'] 145 | 146 | advantages = th.sum(advantages, axis=1) 147 | 148 | if self.normalize_advantage: 149 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) 150 | 151 | dataset_dict = {} 152 | dataset_dict['old_values'] = values 153 | dataset_dict['advantages'] = advantages 154 | dataset_dict['actions'] = actions 155 | 
dataset_dict['obs'] = obses 156 | 157 | dataset_dict['old_mu'] = mus 158 | dataset_dict['old_sigma'] = sigmas 159 | dataset_dict['old_logp_actions'] = neglogpacs 160 | 161 | dataset_dict['mu'] = mus 162 | dataset_dict['sigma'] = sigmas 163 | dataset_dict['logp_actions'] = neglogpacs 164 | 165 | self.dataset.update_values_dict(dataset_dict) 166 | 167 | def backup_actor(self): 168 | 169 | with th.no_grad(): 170 | for param, param_targ in zip(self.actor.parameters(), self.b_actor.parameters()): 171 | param_targ.data.mul_(0.) 172 | param_targ.data.add_(param.data) 173 | 174 | def restore_actor(self): 175 | 176 | with th.no_grad(): 177 | for param, param_targ in zip(self.b_actor.parameters(), self.actor.parameters()): 178 | param_targ.data.mul_(0.) 179 | param_targ.data.add_(param.data) 180 | 181 | def calc_gradients(self, input_dict): 182 | 183 | advantage = input_dict['advantages'] 184 | actions_batch = input_dict['actions'] 185 | obs_batch = input_dict['obs'] 186 | old_action_log_probs_batch = input_dict['old_logp_actions'] # original action log probs; 187 | curr_e_clip = self.e_clip 188 | 189 | # get current policy's actions; 190 | curr_mu, curr_std = self.actor.forward_dist(obs_batch) 191 | if curr_std.ndim == 1: 192 | curr_std = curr_std.unsqueeze(0) 193 | curr_std = curr_std.expand(curr_mu.shape[0], -1).clone() 194 | neglogp = self.neglogp(actions_batch, curr_mu, curr_std, th.log(curr_std)) 195 | 196 | a_loss = self.actor_loss(old_action_log_probs_batch, 197 | neglogp, 198 | advantage, 199 | curr_e_clip).mean() 200 | 201 | # we only use actor loss here for fair comparison; 202 | loss = a_loss 203 | 204 | self.actor_optimizer.zero_grad() 205 | loss.backward() 206 | if self.truncate_grads: 207 | th.nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_norm) 208 | self.actor_optimizer.step() 209 | 210 | self.train_result = (a_loss, curr_mu.detach(), curr_std.detach()) 211 | 212 | return self.train_result 213 | 214 | def actor_loss(self, old_action_log_probs_batch, action_log_probs, advantage, curr_e_clip): 215 | ratio = old_action_log_probs_batch - action_log_probs 216 | ratio = th.clamp(ratio, max=64.0) # prevent ratio becoming [inf]; 217 | ratio = th.exp(ratio) 218 | 219 | surr1 = advantage * ratio 220 | surr2 = advantage * th.clamp(ratio, 1.0 - curr_e_clip, 221 | 1.0 + curr_e_clip) 222 | a_loss = th.max(-surr1, -surr2) 223 | 224 | return a_loss -------------------------------------------------------------------------------- /src/gippo/rl_algorithm/gippo.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import numpy as np 3 | from typing import List 4 | 5 | from gippo.rl_algorithm.ppo import PPO 6 | from gippo.utils import swap_and_flatten01, Normal, RunningMeanStd 7 | 8 | class GIPPO(PPO): 9 | 10 | def __init__(self, config, env_config, device="cpu", log_path=None): 11 | 12 | super(GIPPO, self).__init__(config, env_config, device, log_path) 13 | 14 | ''' 15 | Use different optimizers for analytical gradient-based 16 | actor update and PPO-based actor update 17 | @TODO: Merge two optimizers? 18 | ''' 19 | self.actor_lr_no_ppo = float(config["actor_learning_rate_no_ppo"]) 20 | self.actor_optimizer_no_ppo = th.optim.Adam( 21 | self.actor.parameters(), 22 | betas = config['betas'], 23 | lr = self.actor_lr_no_ppo 24 | ) 25 | 26 | ''' 27 | Parameters for alpha-policy updates. 
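        The alpha-policy is the current policy shifted along the analytical
        advantage gradient: each sampled action a is mapped to
        a + alpha * dA/da, and the actor is regressed onto these shifted
        actions through the reparameterization trick (see train_actor_no_ppo
        below). [alpha] is the step size along the gradient, [alpha_interval]
        bounds how far the estimated det(I + alpha * advantage Hessian) may
        drift from 1, [alpha_update_factor] scales alpha up or down between
        epochs, [max_alpha] caps it, [num_iter] is the number of regression
        steps per attempt, and [max_oorr] is the out-of-range-ratio threshold
        above which alpha is reduced so that room is left for the PPO update.
        A minimal sketch of the shifted-action target, using the names adopted
        later in this class:

            import torch as th

            def alpha_actions(actions: th.Tensor, adv_grad: th.Tensor,
                              alpha: float) -> th.Tensor:
                # shift each sampled action along d(advantage)/d(action)
                return actions + alpha * adv_grad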
28 | ''' 29 | gi_config = config.get("gi", {}) 30 | self.gi_alpha = float(gi_config.get("alpha", 1e-3)) 31 | self.gi_alpha_interval = float(gi_config.get("alpha_interval", 0.2)) 32 | self.gi_alpha_update_factor = float(gi_config.get("alpha_update_factor", 1.1)) 33 | self.gi_max_alpha = float(gi_config.get("max_alpha", 1.0)) 34 | self.gi_num_iter = int(gi_config.get("num_iter", 16)) 35 | self.gi_max_oorr = float(gi_config.get("max_oorr", 0.5)) 36 | 37 | # rms for estimated alpha-policy performance; 38 | self.est_alpha_performace_rms = RunningMeanStd() 39 | 40 | def use_analytic_grads(self): 41 | 42 | return True 43 | 44 | def use_ppo(self): 45 | 46 | return True 47 | 48 | def get_optimizers_state(self): 49 | state = super().get_optimizers_state() 50 | state['actor_no_ppo'] = self.actor_optimizer_no_ppo.state_dict() 51 | 52 | return state 53 | 54 | def train_actor_critic_no_ppo(self): 55 | 56 | ''' 57 | Set learning rate. 58 | ''' 59 | # set learning rate; 60 | # do not change actor learning rate; 61 | # @TODO: too messy code, and errorneous; 62 | actor_lr_no_ppo = self.actor_lr_no_ppo 63 | critic_lr = self.critic_lr 64 | if self.lr_schedule == 'linear': 65 | critic_lr = (1e-5 - self.critic_lr) * float(self.epoch_num / self.max_epochs) + self.critic_lr 66 | 67 | for param_group in self.actor_optimizer_no_ppo.param_groups: 68 | param_group['lr'] = actor_lr_no_ppo 69 | for param_group in self.critic_optimizer.param_groups: 70 | param_group['lr'] = critic_lr 71 | 72 | self.writer.add_scalar("info/actor_lr_no_ppo", self.actor_lr_no_ppo, self.epoch_num) 73 | self.writer.add_scalar("info/critic_lr", critic_lr, self.epoch_num) 74 | self.writer.add_scalar("gi_info/alpha", self.gi_alpha, self.epoch_num) 75 | 76 | return super().train_actor_critic_no_ppo() 77 | 78 | def train_actor_no_ppo(self, 79 | grad_start: th.Tensor, 80 | grad_obses: List[th.Tensor], 81 | grad_rp_eps: List[th.Tensor], 82 | grad_actions: List[th.Tensor], 83 | grad_values: List[th.Tensor], 84 | grad_next_values: List[th.Tensor], 85 | grad_rewards: List[th.Tensor], 86 | grad_fdones: List[th.Tensor], 87 | last_fdones: th.Tensor): 88 | 89 | # compute advantages; 90 | curr_grad_advs = self.grad_advantages(self.tau, 91 | grad_values, 92 | grad_next_values, 93 | grad_rewards, 94 | grad_fdones, 95 | last_fdones) 96 | 97 | # compute gradients of advantages w.r.t. 
actions; 98 | t_adv_gradient = self.differentiate_grad_advantages(grad_actions, 99 | curr_grad_advs, 100 | grad_start, 101 | False) 102 | 103 | with th.no_grad(): 104 | 105 | t_obses = swap_and_flatten01(th.stack(grad_obses, dim=0)) 106 | t_rp_eps = swap_and_flatten01(th.stack(grad_rp_eps, dim=0)) 107 | 108 | t_advantages = swap_and_flatten01(th.stack(curr_grad_advs, dim=0)) 109 | t_actions = swap_and_flatten01(th.stack(grad_actions, dim=0)) 110 | t_adv_gradient = swap_and_flatten01(t_adv_gradient) 111 | t_alpha_actions = t_actions + self.gi_alpha * t_adv_gradient 112 | 113 | # write log about variance; 114 | # advantage variance; 115 | t_advantages_var = th.var(t_advantages, dim=0) 116 | t_adv_gradient_cov = th.cov(t_adv_gradient.transpose(0, 1)) 117 | if t_adv_gradient_cov.ndim == 0: 118 | t_adv_gradient_cov = t_adv_gradient_cov.unsqueeze(0).unsqueeze(0) 119 | t_adv_gradient_var = t_adv_gradient_cov.diagonal(0).sum() 120 | 121 | self.writer.add_scalar("gi_info/advantage_variance", t_advantages_var, self.epoch_num) 122 | self.writer.add_scalar("gi_info/advantage_gradient_variance", t_adv_gradient_var, self.epoch_num) 123 | 124 | # backup actor before actor update; 125 | self.backup_actor() 126 | 127 | ''' 128 | Update policy to alpha-policy. 129 | ''' 130 | for i in range(self.max_optim_iter): 131 | 132 | actor_loss_0 = None 133 | actor_loss_1 = None 134 | 135 | for j in range(self.gi_num_iter): 136 | 137 | _, mu, std, _ = self.actor.forward_with_dist(t_obses) 138 | 139 | distr = Normal(mu, std) 140 | rpeps_actions = distr.eps_to_action(t_rp_eps) 141 | 142 | actor_loss = (rpeps_actions - t_alpha_actions) * (rpeps_actions - t_alpha_actions) 143 | actor_loss = th.sum(actor_loss, dim=-1) 144 | actor_loss = actor_loss.mean() 145 | 146 | # update actor; 147 | self.actor_optimizer_no_ppo.zero_grad() 148 | actor_loss.backward() 149 | if self.truncate_grads: 150 | th.nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_norm) 151 | self.actor_optimizer_no_ppo.step() 152 | 153 | if j == 0: 154 | actor_loss_0 = actor_loss.detach().cpu().item() 155 | elif j == self.gi_num_iter - 1: 156 | actor_loss_1 = actor_loss.detach().cpu().item() 157 | 158 | log_actor_loss_0 = np.log(actor_loss_0) 159 | log_actor_loss_1 = np.log(actor_loss_1) 160 | actor_loss_ratio = np.exp(log_actor_loss_1 - log_actor_loss_0) 161 | 162 | if actor_loss_0 < actor_loss_1: 163 | 164 | with th.no_grad(): 165 | 166 | # if optimization did not work well, restore original 167 | # policy, decrease learning rate and try again; 168 | self.restore_actor() 169 | 170 | for param in self.actor_optimizer_no_ppo.param_groups: 171 | param['lr'] /= self.learning_rate_multiplier 172 | 173 | continue 174 | 175 | else: 176 | 177 | # @TODO: errorneous code, put here for fair reproducibility; 178 | for param in self.actor_optimizer_no_ppo.param_groups: 179 | param['lr'] = self.actor_lr_no_ppo 180 | 181 | break 182 | 183 | self.writer.add_scalar("gi_info/actor_loss_ratio", actor_loss_ratio, self.epoch_num) 184 | 185 | did_converge = actor_loss_0 > actor_loss_1 186 | 187 | ''' 188 | Estimate determinant of (I + alpha * advantage Hessian) 189 | and use it to safely bound alpha. 
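        The estimate relies on the reparameterized action map
        a(eps) = mu + sigma * eps, whose Jacobian with respect to eps is
        diag(sigma). If the regression above converged, the updated map is
        approximately eps -> a_old(eps) + alpha * dA/da evaluated at
        a_old(eps), so by the chain rule

            J_post ~= (I + alpha * H) @ J_pre,
            det(I + alpha * H) ~= exp(logdet(J_post) - logdet(J_pre)),

        where H is the Hessian of the advantage with respect to the action.
        This is the quantity computed below and kept inside
        [1 - alpha_interval, 1 + alpha_interval] when adapting alpha.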
190 | ''' 191 | with th.no_grad(): 192 | 193 | old_mu, old_std = self.experience_buffer.tensor_dict['mus'], \ 194 | self.experience_buffer.tensor_dict['sigmas'] 195 | 196 | old_mu, old_std = swap_and_flatten01(old_mu), swap_and_flatten01(old_std) 197 | 198 | _, new_mu, new_std, _ = self.actor.forward_with_dist(t_obses) 199 | 200 | preupdate_action_eps_jac = self.action_eps_jacobian(old_mu, old_std, t_rp_eps) 201 | postupdate_action_eps_jac = self.action_eps_jacobian(new_mu, new_std, t_rp_eps) 202 | 203 | preupdate_action_eps_jacdet = th.logdet(preupdate_action_eps_jac) 204 | postupdate_action_eps_jacdet = th.logdet(postupdate_action_eps_jac) 205 | 206 | est_hessian_logdet = postupdate_action_eps_jacdet - preupdate_action_eps_jacdet 207 | est_hessian_det = th.exp(est_hessian_logdet) 208 | 209 | mean_est_hessian_det = th.mean(est_hessian_det) 210 | min_est_hessian_det = th.min(est_hessian_det) 211 | max_est_hessian_det = th.max(est_hessian_det) 212 | 213 | self.writer.add_scalar("gi_info/mean_est_hessian_det", mean_est_hessian_det, self.epoch_num) 214 | self.writer.add_scalar("gi_info/min_est_hessian_det", min_est_hessian_det, self.epoch_num) 215 | self.writer.add_scalar("gi_info/max_est_hessian_det", max_est_hessian_det, self.epoch_num) 216 | 217 | ''' 218 | Update alpha and actor learning rate for next iteration. 219 | ''' 220 | curr_alpha = self.gi_alpha 221 | curr_actor_lr_no_ppo = self.actor_lr_no_ppo 222 | 223 | next_alpha = curr_alpha 224 | next_actor_lr_no_ppo = curr_actor_lr_no_ppo 225 | 226 | # we have to keep [est_hessian_det] in this range; 227 | min_safe_interval = (1. - self.gi_alpha_interval) 228 | max_safe_interval = (1. + self.gi_alpha_interval) 229 | 230 | if not did_converge: 231 | # alpha does not change, only decrease actor learning rate; 232 | next_actor_lr_no_ppo = curr_actor_lr_no_ppo / self.learning_rate_multiplier 233 | else: 234 | # actor_lr does not change, only change alpha; 235 | if min_est_hessian_det < min_safe_interval or \ 236 | max_est_hessian_det > max_safe_interval: 237 | next_alpha = curr_alpha / self.gi_alpha_update_factor 238 | else: 239 | next_alpha = curr_alpha * self.gi_alpha_update_factor 240 | 241 | next_alpha = np.clip(next_alpha, None, self.gi_max_alpha) 242 | next_actor_lr_no_ppo = np.clip(next_actor_lr_no_ppo, 1e-5, None) 243 | 244 | ''' 245 | Observe how much alpha-policy is different from the original 246 | policy, and then adjust [next_alpha] accordingly. 247 | ''' 248 | next_alpha = self.adjust_next_alpha_by_policy_diff(next_alpha, curr_grad_advs) 249 | 250 | self.gi_alpha = next_alpha 251 | self.actor_lr_no_ppo = next_actor_lr_no_ppo 252 | 253 | return 254 | 255 | def differentiate_grad_advantages(self, 256 | grad_actions: th.Tensor, 257 | grad_advs: th.Tensor, 258 | grad_start: th.Tensor, 259 | debug=False): 260 | 261 | ''' 262 | Compute first-order gradients of [grad_advs] w.r.t. 263 | [grad_actions] using automatic differentiation. 264 | ''' 265 | 266 | num_timestep = grad_start.shape[0] 267 | num_actor = grad_start.shape[1] 268 | 269 | ''' 270 | Using GAE, we can compute gradient of [grad_advs] at each 271 | time step by only backpropagating once for the first time 272 | step of a trajectory. 
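        Concretely, GAE gives A_0 = sum_t (gamma * tau)^t * delta_t and
        A_t = sum_k (gamma * tau)^k * delta_(t+k), so
        A_0 = (terms that do not depend on a_t) + (gamma * tau)^t * A_t.
        Backpropagating only the first-step advantages therefore yields
        dA_0/da_t = (gamma * tau)^t * dA_t/da_t, and the per-step gradient is
        recovered below by rescaling with the running factor
        [cv] = (1 / (gamma * tau))^t, reset to 1 whenever a new trajectory
        starts.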
273 | ''' 274 | adv_sum: th.Tensor = self.grad_advantages_first_terms_sum(grad_advs, grad_start) 275 | for ga in grad_actions: 276 | ga.retain_grad() 277 | adv_sum.backward(retain_graph=debug) 278 | adv_gradient = [] 279 | for ga in grad_actions: 280 | adv_gradient.append(ga.grad) 281 | adv_gradient = th.stack(adv_gradient) 282 | 283 | # reweight gradients, so that we get correct gradients 284 | # for each time step; 285 | with th.no_grad(): 286 | 287 | c = (1.0 / (self.gamma * self.tau)) 288 | cv = th.ones((num_actor, 1), device=adv_gradient.device) 289 | 290 | for nt in range(num_timestep): 291 | 292 | # if new episode has been started, set [cv] to 1; 293 | for na in range(num_actor): 294 | if grad_start[nt, na]: 295 | cv[na, 0] = 1.0 296 | 297 | adv_gradient[nt] = adv_gradient[nt] * cv 298 | cv = cv * c 299 | 300 | if debug: 301 | 302 | ''' 303 | Compute gradients of [grad_advs] at each time step 304 | in brute force and compare it with the above computation 305 | results, which is more efficient than this. 306 | ''' 307 | for i in range(num_timestep): 308 | 309 | debug_adv_sum = grad_advs[i].sum() 310 | 311 | debug_grad_adv_gradient = th.autograd.grad(debug_adv_sum, grad_actions[i], retain_graph=True)[0] 312 | debug_grad_adv_gradient_norm = th.norm(debug_grad_adv_gradient, p=2, dim=-1) 313 | 314 | debug_grad_error = th.norm(debug_grad_adv_gradient - adv_gradient[i], p=2, dim=-1) 315 | debug_grad_error_ratio = debug_grad_error / debug_grad_adv_gradient_norm 316 | 317 | assert th.all(debug_grad_error_ratio < 0.01), \ 318 | "Gradient of advantage possibly wrong" 319 | 320 | adv_gradient = adv_gradient.detach() 321 | 322 | return adv_gradient 323 | 324 | def action_eps_jacobian(self, mu, sigma, eps): 325 | 326 | ''' 327 | Assume action is computed as: 328 | a = mu + sigma * eps, 329 | where mu, sigma, and eps are all one-dim tensors. 330 | ''' 331 | 332 | jacobian = th.zeros((eps.shape[0], eps.shape[1], eps.shape[1])) 333 | 334 | for d in range(eps.shape[1]): 335 | 336 | if sigma.ndim == 1: 337 | jacobian[:, d, d] = sigma[d].detach() 338 | elif sigma.ndim == 2: 339 | jacobian[:, d, d] = sigma[:, d].detach() 340 | 341 | return jacobian 342 | 343 | @th.no_grad() 344 | def adjust_next_alpha_by_policy_diff(self, next_alpha, grad_advs): 345 | ''' 346 | Observe how much alpha-policy is different from the original 347 | policy, and then adjust [next_alpha] accordingly. 348 | 349 | If alpha-policy is too far away from the original policy, 350 | decrease [next_alpha], so that there is some room for PPO 351 | optimization. 352 | ''' 353 | obses = swap_and_flatten01(self.experience_buffer.tensor_dict['obses'].detach()) 354 | neglogpacs = swap_and_flatten01(self.experience_buffer.tensor_dict['neglogpacs'].detach()) 355 | actions = swap_and_flatten01(self.experience_buffer.tensor_dict['actions'].detach()) 356 | advantages = swap_and_flatten01(th.cat(grad_advs, dim=0).detach()) 357 | 358 | n_mus, n_sigmas = self.actor.forward_dist(obses) 359 | if n_sigmas.ndim == 1: 360 | n_sigmas = n_sigmas.unsqueeze(0) 361 | n_sigmas = n_sigmas.expand(n_mus.shape[0], -1).clone() 362 | 363 | n_neglogpacs = self.neglogp(actions, n_mus, n_sigmas, th.log(n_sigmas)) 364 | 365 | ''' 366 | Estimate difference between alpha-policy and original policy 367 | using out-of-range-ratio. 368 | ''' 369 | pac_ratio = th.exp(th.clamp(neglogpacs - n_neglogpacs, max=16.)) # prevent [inf]; 370 | out_of_range_pac_ratio = th.logical_or(pac_ratio < (1. - self.e_clip), 371 | pac_ratio > (1. 
+ self.e_clip)) 372 | out_of_range_pac_ratio = th.count_nonzero(out_of_range_pac_ratio) / actions.shape[0] 373 | 374 | self.writer.add_scalar("gi_info/out_of_range_ratio", out_of_range_pac_ratio, self.epoch_num) 375 | 376 | ''' 377 | Evaluate the bias of analytical gradients by estimating the 378 | performance of alpha-policy in terms of PPO. 379 | ''' 380 | est_alpha_performance = \ 381 | th.sum(advantages * pac_ratio) - \ 382 | th.sum(advantages) 383 | 384 | # @TODO: Ugly approach to prevent overly noisy [est_alpha_performance]; 385 | n_est_alpha_performance = self.est_alpha_performace_rms.normalize(est_alpha_performance) 386 | self.est_alpha_performace_rms.update(est_alpha_performance.unsqueeze(0)) 387 | 388 | self.writer.add_scalar("gi_info/est_alpha_performance", est_alpha_performance, self.epoch_num) 389 | self.writer.add_scalar("gi_info/est_alpha_performance_normalized", n_est_alpha_performance, self.epoch_num) 390 | 391 | ''' 392 | In following conditions, decrease [next_alpha]: 393 | 1. [out_of_range_pac_ratio] is too high (guarantee PPO update); 394 | 2. [est_alpha_performance] is negative (biased analytical grads); 395 | ''' 396 | if out_of_range_pac_ratio > self.gi_max_oorr or \ 397 | (est_alpha_performance < 0 and n_est_alpha_performance < -1.): 398 | 399 | next_alpha = self.gi_alpha / self.gi_alpha_update_factor 400 | 401 | next_alpha = np.clip(next_alpha, None, self.gi_max_alpha) 402 | return next_alpha 403 | 404 | def prepare_dataset(self, batch_dict): 405 | 406 | super().prepare_dataset(batch_dict) 407 | 408 | ''' 409 | Since policy could have been updated to alpha policy, 410 | change [mu], [sigma], and [logp_actions] accordingly. 411 | ''' 412 | obses = batch_dict['obses'] 413 | actions = batch_dict['actions'] 414 | 415 | with th.no_grad(): 416 | n_mus, n_sigmas = self.actor.forward_dist(obses) 417 | if n_sigmas.ndim == 1: 418 | n_sigmas = n_sigmas.unsqueeze(0) 419 | n_sigmas = n_sigmas.expand(n_mus.shape[0], -1).clone() 420 | n_neglogpacs = self.neglogp(actions, n_mus, n_sigmas, th.log(n_sigmas)) 421 | 422 | self.dataset.values_dict['mu'] = n_mus 423 | self.dataset.values_dict['sigma'] = n_sigmas 424 | self.dataset.values_dict['logp_actions'] = n_neglogpacs 425 | 426 | def calc_gradients(self, input_dict): 427 | 428 | advantage = input_dict['advantages'] 429 | actions_batch = input_dict['actions'] 430 | obs_batch = input_dict['obs'] 431 | 432 | old_action_log_probs_batch_before_alpha = input_dict['old_logp_actions'] # action log probs before alpha update; 433 | old_action_log_probs_batch_after_alpha = input_dict['logp_actions'] # action log probs after alpha update; 434 | 435 | curr_e_clip = self.e_clip 436 | 437 | # get current policy's actions; 438 | curr_mu, curr_std = self.actor.forward_dist(obs_batch) 439 | if curr_std.ndim == 1: 440 | curr_std = curr_std.unsqueeze(0) 441 | curr_std = curr_std.expand(curr_mu.shape[0], -1).clone() 442 | neglogp = self.neglogp(actions_batch, curr_mu, curr_std, th.log(curr_std)) 443 | 444 | a_loss = self.actor_loss(old_action_log_probs_batch_before_alpha, 445 | old_action_log_probs_batch_after_alpha, 446 | neglogp, 447 | advantage, 448 | curr_e_clip).mean() 449 | 450 | # we only use actor loss here for fair comparison; 451 | loss = a_loss 452 | 453 | self.actor_optimizer.zero_grad() 454 | loss.backward() 455 | if self.truncate_grads: 456 | th.nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_norm) 457 | self.actor_optimizer.step() 458 | 459 | self.train_result = (a_loss, curr_mu.detach(), curr_std.detach()) 460 | 461 | 
return self.train_result 462 | 463 | def actor_loss(self, 464 | old_action_log_probs_batch_before_alpha, 465 | old_action_log_probs_batch_after_alpha, 466 | action_log_probs, 467 | advantage, 468 | curr_e_clip): 469 | 470 | t_ratio = old_action_log_probs_batch_before_alpha - \ 471 | old_action_log_probs_batch_after_alpha 472 | 473 | if th.any(th.abs(t_ratio) > 4.): 474 | # ratio can be numerically unstable, just use original ppo; 475 | # but use policy after RP update as importance sampling distribution; 476 | ratio = old_action_log_probs_batch_after_alpha - action_log_probs 477 | else: 478 | t_ratio = th.exp(t_ratio) 479 | tmp0 = th.log(t_ratio + 1.) 480 | tmp1 = tmp0 - old_action_log_probs_batch_before_alpha 481 | action_log_probs_batch_mid = np.log(2.) - tmp1 482 | 483 | ratio = action_log_probs_batch_mid - action_log_probs 484 | 485 | ratio = th.clamp(ratio, min=-16., max=16.) # prevent ratio becoming [inf]; 486 | ratio = th.exp(ratio) 487 | 488 | surr1 = advantage * ratio 489 | surr2 = advantage * th.clamp(ratio, 1.0 - curr_e_clip, 490 | 1.0 + curr_e_clip) 491 | a_loss = th.max(-surr1, -surr2) 492 | 493 | return a_loss -------------------------------------------------------------------------------- /src/gippo/rl_algorithm/base.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Modified from 3 | https://github.com/Denys88/rl_games/blob/master/rl_games/algos_torch/a2c_continuous.py 4 | ''' 5 | 6 | import numpy as np 7 | import torch as th 8 | from torch import nn 9 | from typing import List 10 | 11 | import time 12 | import gym 13 | import copy 14 | import os 15 | 16 | from gippo.utils import RunningMeanStd, AverageMeter, swap_and_flatten01 17 | from gippo.vecenv import create_vecenv 18 | from gippo.network import ActorStochasticMLP, CriticMLP 19 | from gippo.experience import ExperienceBuffer 20 | from gippo.dataset import CriticDataset 21 | 22 | from torch.utils.tensorboard import SummaryWriter 23 | 24 | save_distribution = False 25 | 26 | class RLAlgorithm: 27 | def __init__(self, 28 | config, 29 | env_config, 30 | device="cpu", 31 | log_path=None): 32 | 33 | ''' 34 | Basic configs 35 | ''' 36 | self.config = config 37 | self.device = device 38 | 39 | # logging; 40 | self.log_path = log_path 41 | if self.log_path is None: 42 | self.log_path = f'./logdir/{time.strftime("%Y-%m-%d_%H-%M-%S")}' 43 | self.nn_dir = os.path.join(self.log_path, 'nn') 44 | self.summaries_dir = os.path.join(self.log_path, 'runs') 45 | 46 | os.makedirs(self.log_path, exist_ok=True) 47 | os.makedirs(self.nn_dir, exist_ok=True) 48 | os.makedirs(self.summaries_dir, exist_ok=True) 49 | 50 | self.writer = SummaryWriter(self.summaries_dir) 51 | self.save_freq = config.get('save_frequency', 0) 52 | self.save_best_after = config.get('save_best_after', 100) 53 | self.print_stats = config.get('print_stats', True) 54 | 55 | # experience buffer size that we are going to use for training; 56 | self.horizon_length = config.get('horizon_length', 32) 57 | self.num_actors = config.get('num_actors', 1) 58 | self.num_agents = config.get('num_agents', 1) 59 | self.batch_size = self.horizon_length * self.num_actors * self.num_agents 60 | self.batch_size_envs = self.horizon_length * self.num_actors 61 | 62 | # env configs; 63 | self.env_name = env_config['name'] 64 | self.env_config = env_config.get('config', {}) 65 | self.env_config['device'] = self.device 66 | self.env_config['no_grad'] = not self.use_analytic_grads() 67 | self.vec_env = create_vecenv( 68 | self.env_name, 69 | 
self.num_actors, 70 | **self.env_config) 71 | self.env_info = self.vec_env.get_env_info() 72 | 73 | self.value_size = self.env_info.get('value_size', 1) 74 | 75 | # reshaper and normalization; 76 | self.rewards_shaper = config.get("rewards_shaper", None) 77 | self.normalize_input = config.get("normalize_input", False) 78 | self.normalize_value = config.get("normalize_value", False) 79 | if self.normalize_value: 80 | self.value_mean_std = RunningMeanStd((1,)).to(self.device) 81 | self.normalize_advantage = config.get("normalize_advantage", False) 82 | 83 | # observation; 84 | self.observation_space = self.env_info['observation_space'] 85 | self.obs_shape = self.observation_space.shape 86 | self.obs = None 87 | 88 | # running stats; 89 | self.frame = 0 90 | self.update_time = 0 91 | self.mean_rewards = self.last_mean_rewards = -100500 92 | self.play_time = 0 93 | self.epoch_num = 0 94 | 95 | # training; 96 | self.max_epochs = self.config.get('max_epochs', 1e6) 97 | self.network = config.get("network", None) 98 | 99 | ''' 100 | Our work solves stochastic optimization problem in differentiable environment. 101 | ''' 102 | num_obs = self.obs_shape[0] 103 | num_actions = self.env_info['action_space'].shape[0] 104 | 105 | self.actor = ActorStochasticMLP(num_obs, 106 | num_actions, 107 | config['network'], 108 | device=self.device) 109 | 110 | self.critic = CriticMLP(num_obs, 111 | config['network'], 112 | device=self.device) 113 | 114 | self.target_critic = copy.deepcopy(self.critic) 115 | self.target_critic_alpha = config.get('target_critic_alpha', 0.4) 116 | 117 | self.all_params = list(self.actor.parameters()) + list(self.critic.parameters()) 118 | 119 | ''' 120 | Optimizers 121 | ''' 122 | 123 | # critic; 124 | self.critic_lr = float(config["critic_learning_rate"]) 125 | self.critic_optimizer = th.optim.Adam( 126 | self.critic.parameters(), 127 | betas = config['betas'], 128 | lr = self.critic_lr 129 | ) 130 | self.critic_iterations = config["critic_iterations"] 131 | self.critic_num_batch = config["critic_num_batch"] 132 | 133 | # misc; 134 | self.truncate_grads = config["truncate_grads"] 135 | self.grad_norm = config["grad_norm"] 136 | 137 | # learning rate scheduler; 138 | self.lr_schedule = config['lr_schedule'] 139 | 140 | # change to proper running mean std for backpropagation; 141 | if self.normalize_input: 142 | if isinstance(self.observation_space, gym.spaces.Dict): 143 | raise NotImplementedError() 144 | else: 145 | self.obs_rms = RunningMeanStd(shape=self.obs_shape, device=self.device) 146 | 147 | if self.normalize_value: 148 | self.val_rms = RunningMeanStd(shape=(1,), device=self.device) 149 | 150 | # episode length; 151 | self.episode_max_length = self.vec_env.env.episode_length 152 | 153 | # statistics; 154 | self.games_to_track = 100 155 | self.game_rewards = AverageMeter(self.value_size, self.games_to_track).to(self.device) 156 | self.game_lengths = AverageMeter(1, self.games_to_track).to(self.device) 157 | 158 | # GAE params; 159 | self.gamma = config['gamma'] 160 | self.tau = config['tau'] 161 | 162 | def train(self): 163 | self.init_tensors() 164 | self.last_mean_rewards = -100500 165 | start_time = time.time() 166 | total_time = 0 167 | rep_count = 0 168 | self.obs = self.env_reset() 169 | self.curr_frames = self.batch_size_envs 170 | 171 | while True: 172 | epoch_num = self.update_epoch() 173 | 174 | step_time, no_ppo_time, ppo_time, sum_time, ppo_loss = \ 175 | self.train_epoch() 176 | 177 | total_time += sum_time 178 | frame = self.frame 179 | 180 | # cleaning memory to 
optimize space 181 | if self.use_ppo(): 182 | self.dataset.update_values_dict(None) 183 | 184 | print(f"Num steps: {frame + self.curr_frames}") 185 | 186 | # do we need scaled_time? 187 | scaled_time = sum_time #self.num_agents * sum_time 188 | scaled_no_ppo_time = no_ppo_time #self.num_agents * play_time 189 | curr_frames = self.curr_frames 190 | self.frame += curr_frames 191 | 192 | self.write_stats(total_time, 193 | epoch_num, 194 | step_time, 195 | no_ppo_time, 196 | ppo_time, 197 | ppo_loss, 198 | frame, 199 | scaled_time, 200 | scaled_no_ppo_time, 201 | curr_frames) 202 | 203 | mean_rewards = [0] 204 | mean_lengths = 0 205 | 206 | if self.game_rewards.current_size > 0: 207 | mean_rewards = self.game_rewards.get_mean() 208 | mean_lengths = self.game_lengths.get_mean() 209 | self.mean_rewards = mean_rewards[0] 210 | 211 | for i in range(self.value_size): 212 | rewards_name = 'rewards' if i == 0 else 'rewards{0}'.format(i) 213 | self.writer.add_scalar(rewards_name + '/step'.format(i), mean_rewards[i], frame) 214 | self.writer.add_scalar(rewards_name + '/iter'.format(i), mean_rewards[i], epoch_num) 215 | self.writer.add_scalar(rewards_name + '/time'.format(i), mean_rewards[i], total_time) 216 | 217 | self.writer.add_scalar('episode_lengths/step', mean_lengths, frame) 218 | self.writer.add_scalar('episode_lengths/iter', mean_lengths, epoch_num) 219 | self.writer.add_scalar('episode_lengths/time', mean_lengths, total_time) 220 | 221 | checkpoint_name = self.config['name'] + 'ep' + str(epoch_num) + 'rew' + str(mean_rewards) 222 | 223 | if self.save_freq > 0: 224 | if (epoch_num % self.save_freq == 0) and (mean_rewards[0] <= self.last_mean_rewards): 225 | self.save(os.path.join(self.nn_dir, 'last_' + checkpoint_name)) 226 | 227 | if mean_rewards[0] > self.last_mean_rewards and epoch_num >= self.save_best_after: 228 | print('saving next best rewards: ', mean_rewards) 229 | self.last_mean_rewards = mean_rewards[0] 230 | self.save(os.path.join(self.nn_dir, self.config['name'])) 231 | 232 | if epoch_num > self.max_epochs: 233 | self.save(os.path.join(self.nn_dir, 'last_' + self.config['name'] + 'ep' + str(epoch_num) + 'rew' + str(mean_rewards))) 234 | print('MAX EPOCHS NUM!') 235 | return self.last_mean_rewards, epoch_num 236 | 237 | update_time = 0 238 | if self.print_stats: 239 | fps_step = curr_frames / step_time 240 | # fps_step_inference = curr_frames / scaled_play_time 241 | fps_total = curr_frames / scaled_time 242 | # print(f'fps step: {fps_step:.1f} fps step and policy inference: {fps_step_inference:.1f} fps total: {fps_total:.1f} mean reward: {mean_rewards[0]:.2f} mean lengths: {mean_lengths:.1f}') 243 | print(f'epoch: {epoch_num} fps step: {fps_step:.1f} fps total: {fps_total:.1f} mean reward: {mean_rewards[0]:.2f} mean lengths: {mean_lengths:.1f}') 244 | 245 | def init_tensors(self): 246 | 247 | # use specialized experience buffer; 248 | batch_size = self.num_agents * self.num_actors 249 | 250 | algo_info = { 251 | 'num_actors' : self.num_actors, 252 | 'horizon_length' : self.horizon_length, 253 | } 254 | 255 | self.experience_buffer = ExperienceBuffer( 256 | self.env_info, 257 | algo_info, 258 | self.device 259 | ) 260 | 261 | current_rewards_shape = (batch_size, self.value_size) 262 | self.current_rewards = th.zeros(current_rewards_shape, dtype=th.float32, device=self.device) 263 | self.current_lengths = th.zeros(batch_size, dtype=th.float32, device=self.device) 264 | self.dones = th.ones((batch_size,), dtype=th.uint8, device=self.device) 265 | 266 | self.update_list = ['actions', 
'neglogpacs', 'values', 'mus', 'sigmas'] 267 | self.tensor_list = self.update_list + ['obses', 'states', 'dones', 'adv_grads'] 268 | 269 | def cast_obs(self, obs): 270 | if isinstance(obs, th.Tensor): 271 | self.is_tensor_obses = True 272 | elif isinstance(obs, np.ndarray): 273 | assert(obs.dtype != np.int8) 274 | if obs.dtype == np.uint8: 275 | obs = th.ByteTensor(obs).to(self.device) 276 | else: 277 | obs = th.FloatTensor(obs).to(self.device) 278 | return obs 279 | 280 | def obs_to_tensors(self, obs): 281 | obs_is_dict = isinstance(obs, dict) 282 | if obs_is_dict: 283 | raise NotImplementedError() 284 | else: 285 | upd_obs = self.cast_obs(obs) 286 | if not obs_is_dict or 'obs' not in obs: 287 | upd_obs = {'obs' : upd_obs} 288 | return upd_obs 289 | 290 | def env_reset(self): 291 | obs = self.vec_env.reset() 292 | obs = self.obs_to_tensors(obs) 293 | return obs 294 | 295 | def update_epoch(self): 296 | self.epoch_num += 1 297 | return self.epoch_num 298 | 299 | def train_epoch(self): 300 | 301 | self.vec_env.set_train_info(self.frame, self) 302 | 303 | no_ppo_time_start = time.time() 304 | 305 | # set learning rate; 306 | # if self.gi_lr_schedule == 'linear': 307 | # if self.gi_algorithm in ['shac-only', 'grad-ppo-shac', 'basic-lr', 'basic-rp', 'basic-combination']: 308 | # actor_lr = (1e-5 - self.actor_lr) * float(self.epoch_num / self.max_epochs) + self.actor_lr 309 | # else: 310 | # actor_lr = self.actor_lr 311 | # critic_lr = (1e-5 - self.critic_lr) * float(self.epoch_num / self.max_epochs) + self.critic_lr 312 | # else: 313 | # actor_lr = self.actor_lr 314 | # critic_lr = self.critic_lr 315 | 316 | # for param_group in self.actor_optimizer.param_groups: 317 | # param_group['lr'] = actor_lr 318 | # for param_group in self.critic_optimizer.param_groups: 319 | # param_group['lr'] = critic_lr 320 | 321 | # self.writer.add_scalar("info/gi_actor_lr", actor_lr, self.epoch_num) 322 | # self.writer.add_scalar("info/gi_critic_lr", critic_lr, self.epoch_num) 323 | 324 | # # rp actor lr and alpha; 325 | 326 | # self.writer.add_scalar("info_alpha/actor_lr", self.actor_lr, self.epoch_num) 327 | # self.writer.add_scalar("info_alpha/alpha", self.gi_curr_alpha, self.epoch_num) 328 | 329 | ''' 330 | Train actor critic using methods other than PPO. 331 | When we use PPO-based methods (PPO, GI-PPO), 332 | we additionally collect experience to use in PPO 333 | updates afterwards. 334 | ''' 335 | batch_dict = self.train_actor_critic_no_ppo() 336 | no_ppo_time_end = time.time() 337 | 338 | self.curr_frames = batch_dict.pop('played_frames') 339 | 340 | ''' 341 | Train actor using PPO-based algorithms using 342 | collected experience above. 
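When use_ppo() returns False, the PPO stage below is skipped and a
zero-valued placeholder loss is recorded instead.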
343 | ''' 344 | ppo_time_start = time.time() 345 | if self.use_ppo(): 346 | ppo_loss = self.train_actor_ppo(batch_dict) 347 | else: 348 | # placeholders; 349 | ppo_loss = [th.zeros((1,), dtype=th.float32, device=self.device)] 350 | ppo_time_end = time.time() 351 | 352 | no_ppo_time = no_ppo_time_end - no_ppo_time_start 353 | ppo_time = ppo_time_end - ppo_time_start 354 | total_time = ppo_time_end - no_ppo_time_start 355 | 356 | # update (rp) alpha and actor lr; 357 | 358 | # self.gi_curr_alpha = self.next_alpha 359 | # self.actor_lr = self.next_actor_lr 360 | 361 | return batch_dict['step_time'], \ 362 | no_ppo_time, \ 363 | ppo_time, \ 364 | total_time, \ 365 | ppo_loss 366 | 367 | def train_actor_critic_no_ppo(self): 368 | 369 | epinfos = [] 370 | update_list = self.update_list 371 | 372 | step_time = 0.0 373 | 374 | # indicator for steps that grad computation starts; 375 | grad_start = th.zeros_like(self.experience_buffer.tensor_dict['dones']) 376 | 377 | grad_obses = [] 378 | grad_values = [] 379 | grad_next_values = [] 380 | grad_actions = [] 381 | grad_rewards = [] 382 | grad_fdones = [] 383 | grad_rp_eps = [] 384 | 385 | # use frozen [obs_rms] and [value_rms] during this one function call; 386 | curr_obs_rms = None 387 | curr_val_rms = None 388 | if self.normalize_input: 389 | with th.no_grad(): 390 | curr_obs_rms = copy.deepcopy(self.obs_rms) 391 | if self.normalize_value: 392 | with th.no_grad(): 393 | curr_val_rms = copy.deepcopy(self.val_rms) 394 | 395 | # start with clean grads; 396 | self.obs = self.vec_env.env.initialize_trajectory() 397 | self.obs = self.obs_to_tensors(self.obs) 398 | grad_start[0, :] = 1.0 399 | 400 | for n in range(self.horizon_length): 401 | 402 | if n > 0: 403 | grad_start[n, :] = self.dones 404 | 405 | # get action for current observation; 406 | if self.use_analytic_grads(): 407 | res_dict = self.get_action_values( 408 | self.obs, 409 | curr_obs_rms, 410 | curr_val_rms 411 | ) 412 | else: 413 | with th.no_grad(): 414 | res_dict = self.get_action_values( 415 | self.obs, 416 | curr_obs_rms, 417 | curr_val_rms 418 | ) 419 | 420 | # we store tensor objects with gradients; 421 | grad_obses.append(res_dict['obs']) 422 | grad_values.append(res_dict['values']) 423 | grad_actions.append(res_dict['actions']) 424 | grad_fdones.append(self.dones.float()) 425 | grad_rp_eps.append(res_dict['rp_eps']) 426 | 427 | # [obs] is an observation of the current time step; 428 | # store processed obs, which might have been normalized already; 429 | self.experience_buffer.update_data('obses', n, res_dict['obs']) 430 | 431 | # [dones] indicate if this step is the start of a new episode; 432 | self.experience_buffer.update_data('dones', n, self.dones) 433 | 434 | for k in update_list: 435 | self.experience_buffer.update_data(k, n, res_dict[k]) 436 | 437 | # take action; 438 | step_time_start = time.time() 439 | actions = th.tanh(grad_actions[-1]) 440 | 441 | if self.use_analytic_grads(): 442 | self.obs, rewards, self.dones, infos = self.vec_env.step(actions) 443 | else: 444 | with th.no_grad(): 445 | self.obs, rewards, self.dones, infos = self.vec_env.step(actions) 446 | 447 | self.obs = self.obs_to_tensors(self.obs) 448 | rewards = rewards.unsqueeze(-1) 449 | step_time_end = time.time() 450 | step_time += (step_time_end - step_time_start) 451 | 452 | # compute value of next state; 453 | if self.use_analytic_grads(): 454 | next_obs = infos['obs_before_reset'] 455 | else: 456 | next_obs = self.obs['obs'] 457 | 458 | if self.normalize_input: 459 | # do not update rms here; 460 | 
next_obs = curr_obs_rms.normalize(next_obs) 461 | next_value = self.target_critic(next_obs) 462 | if self.normalize_value: 463 | next_value = curr_val_rms.normalize(next_value, True) 464 | 465 | # even though [next_value] can wrong when it is based on 466 | # a [next_obs] that is at the start of new episode, 467 | # we deal with it by making it zero when it was an early termination; 468 | grad_next_values.append(next_value) 469 | 470 | done_env_ids = self.dones.nonzero(as_tuple = False).squeeze(-1) 471 | for id in done_env_ids: 472 | if th.isnan(next_obs[id]).sum() > 0 \ 473 | or th.isinf(next_obs[id]).sum() > 0 \ 474 | or (th.abs(next_obs[id]) > 1e6).sum() > 0: # ugly fix for nan values 475 | grad_next_values[-1][id] = 0. 476 | elif self.current_lengths[id] < self.episode_max_length - 1: # early termination 477 | grad_next_values[-1][id] = 0. 478 | 479 | # add default reward; 480 | grad_rewards.append(rewards) 481 | 482 | # @TODO: do not use reward shaper for now; 483 | self.experience_buffer.update_data('rewards', n, rewards) 484 | 485 | self.current_rewards += rewards.detach() 486 | self.current_lengths += 1 487 | all_done_indices = self.dones.nonzero(as_tuple=False) 488 | done_indices = all_done_indices[::self.num_agents] 489 | 490 | self.game_rewards.update(self.current_rewards[done_indices]) 491 | self.game_lengths.update(self.current_lengths[done_indices]) 492 | 493 | not_dones = 1.0 - self.dones.float() 494 | 495 | self.current_rewards = self.current_rewards * not_dones.unsqueeze(1) 496 | self.current_lengths = self.current_lengths * not_dones 497 | 498 | ''' 499 | Update actor and critic networks (but no PPO yet). 500 | 501 | Actor update differs between different algorithms, 502 | but critic update is shared between all algorithms. 503 | ''' 504 | 505 | # start and end of current subsequence; 506 | last_fdones = self.dones.float() 507 | 508 | self.train_actor_no_ppo(grad_start, 509 | grad_obses, 510 | grad_rp_eps, 511 | grad_actions, 512 | grad_values, 513 | grad_next_values, 514 | grad_rewards, 515 | grad_fdones, 516 | last_fdones) 517 | 518 | grad_advs = \ 519 | self.train_critic(grad_obses, 520 | grad_actions, 521 | grad_values, 522 | grad_next_values, 523 | grad_rewards, 524 | grad_fdones, 525 | last_fdones) 526 | 527 | self.update_target_critic() 528 | self.clear_experience_buffer_grads() 529 | 530 | # sort out [batch_dict]; 531 | with th.no_grad(): 532 | 533 | batch_dict = self.experience_buffer.get_transformed_list(swap_and_flatten01, self.tensor_list) 534 | 535 | for i in range(len(grad_advs)): 536 | grad_advs[i] = grad_advs[i].unsqueeze(0) 537 | batch_dict['advantages'] = swap_and_flatten01(th.cat(grad_advs, dim=0).detach()) 538 | batch_dict['played_frames'] = self.batch_size 539 | batch_dict['step_time'] = step_time 540 | 541 | return batch_dict 542 | 543 | def use_analytic_grads(self): 544 | ''' 545 | Whether current RL algorithm requires analytic gradients 546 | from differentiable environment. 547 | ''' 548 | raise NotImplementedError() 549 | 550 | def neglogp(self, x, mean, std, logstd): 551 | ''' 552 | Negative log probability of a batch of actions under a Gaussian policy. 
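Concretely, for a diagonal Gaussian with mean [mean], standard deviation
[std] = exp([logstd]) and action dimension d, the returned value is
    0.5 * sum(((x - mean) / std) ** 2) + 0.5 * d * log(2 * pi) + sum(logstd),
with the sums taken over the action dimension, so the output has shape
(batch_size,).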
553 | ''' 554 | 555 | assert x.ndim == 2 and mean.ndim == 2 and std.ndim == 2 and logstd.ndim == 2, "" 556 | # assert x.shape[0] == mean.shape[0] and x.shape[0] == std.shape[0] and x.shape[0] == logstd.shape[0], "" 557 | 558 | return 0.5 * (((x - mean) / std)**2).sum(dim=-1) \ 559 | + 0.5 * np.log(2.0 * np.pi) * x.size()[-1] \ 560 | + logstd.sum(dim=-1) 561 | 562 | def get_action_values(self, 563 | obs, 564 | obs_rms: RunningMeanStd, 565 | val_rms: RunningMeanStd): 566 | 567 | # normalize input if needed, we update rms only here; 568 | processed_obs = obs['obs'] 569 | if self.normalize_input: 570 | # update rms; 571 | with th.no_grad(): 572 | self.obs_rms.update(processed_obs) 573 | processed_obs = obs_rms.normalize(processed_obs) 574 | 575 | # [std] is a vector of length [action_dim], which is shared by all the envs; 576 | actions, mu, std, eps = self.actor.forward_with_dist(processed_obs, deterministic=False) 577 | if std.ndim == 1: 578 | std = std.unsqueeze(0) 579 | std = std.expand(mu.shape[0], -1).clone() # make size of [std] same as [actions] and [mu]; 580 | neglogp = self.neglogp(actions, mu, std, th.log(std)) 581 | 582 | # self.target_critic.eval() 583 | values = self.target_critic(processed_obs) 584 | 585 | # if using normalize value, target_critic learns to give normalized state values; 586 | # therefore, unnormalize the resulting value; 587 | if self.normalize_value: 588 | values = val_rms.normalize(values, True) 589 | 590 | res_dict = { 591 | "obs": processed_obs, 592 | "actions": actions, 593 | "mus": mu, 594 | "sigmas": std, 595 | "neglogpacs": neglogp, 596 | "values": values, 597 | "rnn_states": None, 598 | 'rp_eps': eps, 599 | } 600 | 601 | return res_dict 602 | 603 | def train_actor_no_ppo(self, 604 | grad_start: th.Tensor, 605 | grad_obses: List[th.Tensor], 606 | grad_rp_eps: List[th.Tensor], 607 | grad_actions: List[th.Tensor], 608 | grad_values: List[th.Tensor], 609 | grad_next_values: List[th.Tensor], 610 | grad_rewards: List[th.Tensor], 611 | grad_fdones: List[th.Tensor], 612 | last_fdones: th.Tensor): 613 | 614 | ''' 615 | Train actor based on other methods than PPO with current experience. 
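The [grad_*] arguments hold the rollout tensors collected above; when
use_analytic_grads() is True they still carry the computation graph of the
differentiable environment. The base class only defines the interface and
raises NotImplementedError, so each concrete algorithm provides its own
implementation.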
616 | ''' 617 | 618 | raise NotImplementedError() 619 | 620 | def train_critic(self, 621 | grad_obses: List[th.Tensor], 622 | grad_actions: List[th.Tensor], 623 | grad_values: List[th.Tensor], 624 | grad_next_values: List[th.Tensor], 625 | grad_rewards: List[th.Tensor], 626 | grad_fdones: List[th.Tensor], 627 | last_fdones: th.Tensor): 628 | 629 | with th.no_grad(): 630 | 631 | # compute advantage and add it to state value to get target values; 632 | curr_grad_advs = self.grad_advantages(self.tau, 633 | grad_values, 634 | grad_next_values, 635 | grad_rewards, 636 | grad_fdones, 637 | last_fdones) 638 | grad_advs = curr_grad_advs 639 | 640 | target_values = [] 641 | for i in range(len(curr_grad_advs)): 642 | target_values.append(curr_grad_advs[i] + grad_values[i]) 643 | 644 | th_obs = th.cat(grad_obses, dim=0) 645 | th_target_values = th.cat(target_values, dim=0) 646 | 647 | # update value rms here once; 648 | if self.normalize_value: 649 | self.val_rms.update(th_target_values) 650 | 651 | batch_size = len(th_target_values) // self.critic_num_batch 652 | critic_dataset = CriticDataset(batch_size, th_obs, th_target_values) 653 | 654 | self.critic.train() 655 | critic_loss = 0 656 | for j in range(self.critic_iterations): 657 | 658 | total_critic_loss = 0 659 | batch_cnt = 0 660 | 661 | for i in range(len(critic_dataset)): 662 | 663 | batch_sample = critic_dataset[i] 664 | self.critic_optimizer.zero_grad() 665 | 666 | predicted_values = self.critic(batch_sample['obs']).squeeze(-1) 667 | if self.normalize_value: 668 | # predicted_values = curr_val_rms.normalize(predicted_values, True) 669 | predicted_values = self.val_rms.normalize(predicted_values, True) 670 | 671 | target_values = batch_sample['target_values'] 672 | training_critic_loss = th.mean((predicted_values - target_values) ** 2, dim=0) 673 | training_critic_loss.backward() 674 | 675 | # ugly fix for simulation nan problem 676 | for params in self.critic.parameters(): 677 | params.grad.nan_to_num_(0.0, 0.0, 0.0) 678 | 679 | if self.truncate_grads: 680 | nn.utils.clip_grad_norm_(self.critic.parameters(), self.grad_norm) 681 | 682 | self.critic_optimizer.step() 683 | 684 | total_critic_loss += training_critic_loss 685 | batch_cnt += 1 686 | 687 | # critic_loss = (total_critic_loss / batch_cnt).detach().cpu().item() 688 | # if self.print_stats: 689 | # print('value iter {}/{}, loss = {:7.6f}'.format(j + 1, self.critic_iterations, critic_loss), end='\r') 690 | 691 | return grad_advs 692 | 693 | def update_target_critic(self): 694 | with th.no_grad(): 695 | alpha = self.target_critic_alpha 696 | for param, param_targ in zip(self.critic.parameters(), self.target_critic.parameters()): 697 | param_targ.data.mul_(alpha) 698 | param_targ.data.add_((1. 
- alpha) * param.data) 699 | 700 | # def get_critic_values(self, obs, use_target_critic: bool, obs_rms_train: bool): 701 | 702 | # if use_target_critic: 703 | # critic = self.target_critic 704 | # # critic.eval() 705 | # else: 706 | # critic = self.critic 707 | 708 | # if self.normalize_input: 709 | 710 | # if obs_rms_train: 711 | # self.running_mean_std.train() 712 | # else: 713 | # self.running_mean_std.eval() 714 | 715 | # processed_obs = self._preproc_obs(obs) 716 | # values = critic(processed_obs) 717 | 718 | # if self.normalize_value: 719 | # values = self.value_mean_std(values, True) 720 | 721 | # return values 722 | 723 | def grad_advantages(self, gae_tau, mb_extrinsic_values, mb_next_extrinsic_values, mb_rewards, mb_fdones, last_fdones): 724 | 725 | num_step = len(mb_extrinsic_values) 726 | mb_advs = [] 727 | 728 | # GAE; 729 | lastgaelam = 0 730 | for t in reversed(range(num_step)): 731 | if t == num_step - 1: 732 | nextnonterminal = 1.0 - last_fdones 733 | else: 734 | nextnonterminal = 1.0 - mb_fdones[t+1] 735 | nextnonterminal = nextnonterminal.unsqueeze(1) 736 | 737 | nextvalues = mb_next_extrinsic_values[t] 738 | 739 | ''' 740 | In computing delta, we do not use [nextnonterinal] because 741 | [nextvalues] should be zero if the episode was finished 742 | before the maximum episode length. 743 | 744 | If the episode was finished by going over horizon, we have 745 | to deal with the [nextvalues] that is not zero, but 746 | [nextnonterminal] is still 0. 747 | 748 | Therefore, we do not consider [nextnonterminal] here. 749 | ''' 750 | delta = mb_rewards[t] + self.gamma * nextvalues - mb_extrinsic_values[t] 751 | mb_adv = lastgaelam = delta + self.gamma * gae_tau * nextnonterminal * lastgaelam 752 | mb_advs.append(mb_adv) 753 | 754 | mb_advs.reverse() 755 | return mb_advs 756 | 757 | def grad_advantages_first_terms_sum(self, grad_advs, grad_start): 758 | 759 | num_timestep = grad_start.shape[0] 760 | num_actors = grad_start.shape[1] 761 | 762 | adv_sum = 0 763 | 764 | for i in range(num_timestep): 765 | for j in range(num_actors): 766 | if grad_start[i, j]: 767 | adv_sum = adv_sum + grad_advs[i][j] 768 | 769 | return adv_sum 770 | 771 | def clear_experience_buffer_grads(self): 772 | 773 | ''' 774 | Clear computation graph attached to the tensors in the experience buffer. 
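Every tensor stored in the buffer is replaced by its detached counterpart,
so the subsequent PPO update cannot backpropagate through the rollout.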
775 | ''' 776 | 777 | with th.no_grad(): 778 | 779 | for k in self.experience_buffer.tensor_dict.keys(): 780 | 781 | if not isinstance(self.experience_buffer.tensor_dict[k], th.Tensor): 782 | 783 | continue 784 | 785 | self.experience_buffer.tensor_dict[k] = self.experience_buffer.tensor_dict[k].detach() 786 | 787 | def train_actor_ppo(self, batch_dict): 788 | 789 | raise NotImplementedError() 790 | 791 | def prepare_dataset(self, batch_dict): 792 | 793 | obses = batch_dict['obses'] 794 | advantages = batch_dict['advantages'] 795 | dones = batch_dict['dones'] 796 | values = batch_dict['values'] 797 | actions = batch_dict['actions'] 798 | neglogpacs = batch_dict['neglogpacs'] 799 | mus = batch_dict['mus'] 800 | sigmas = batch_dict['sigmas'] 801 | 802 | advantages = th.sum(advantages, axis=1) 803 | unnormalized_advantages = advantages 804 | 805 | if self.normalize_advantage: 806 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) 807 | 808 | dataset_dict = {} 809 | dataset_dict['old_values'] = values 810 | dataset_dict['advantages'] = advantages 811 | dataset_dict['actions'] = actions 812 | dataset_dict['obs'] = obses 813 | 814 | dataset_dict['old_mu'] = mus 815 | dataset_dict['old_sigma'] = sigmas 816 | dataset_dict['old_logp_actions'] = neglogpacs 817 | 818 | return dataset_dict 819 | 820 | if self.gi_algorithm == "ppo-only": 821 | 822 | dataset_dict['mu'] = mus 823 | dataset_dict['sigma'] = sigmas 824 | dataset_dict['logp_actions'] = neglogpacs 825 | 826 | elif self.gi_algorithm == "grad-ppo-shac": 827 | 828 | with torch.no_grad(): 829 | n_mus, n_sigmas = self.actor.forward_dist(obses) 830 | if n_sigmas.ndim == 1: 831 | n_sigmas = n_sigmas.unsqueeze(0) 832 | n_sigmas = n_sigmas.expand(mus.shape[0], -1).clone() 833 | 834 | n_neglogpacs = self.neglogp(actions, n_mus, n_sigmas, torch.log(n_sigmas)) 835 | 836 | dataset_dict['mu'] = n_mus 837 | dataset_dict['sigma'] = n_sigmas 838 | dataset_dict['logp_actions'] = n_neglogpacs 839 | 840 | # compute [mus] and [sigmas] again here because we could have 841 | # updated policy in [play_steps] using RP gradients; 842 | # find out if the updated policy is still close enough to the 843 | # original policy, because PPO assumes it; 844 | # if it is not close enough, we decrease [alpha]; 845 | 846 | elif self.gi_algorithm == "grad-ppo-alpha": 847 | 848 | with torch.no_grad(): 849 | n_mus, n_sigmas = self.actor.forward_dist(obses) 850 | if n_sigmas.ndim == 1: 851 | n_sigmas = n_sigmas.unsqueeze(0) 852 | n_sigmas = n_sigmas.expand(mus.shape[0], -1).clone() 853 | 854 | n_neglogpacs = self.neglogp(actions, n_mus, n_sigmas, torch.log(n_sigmas)) 855 | 856 | # find out distance between current policy and old policy; 857 | 858 | pac_ratio = torch.exp(torch.clamp(neglogpacs - n_neglogpacs, max=16.)) # prevent [inf]; 859 | out_of_range_pac_ratio = torch.logical_or(pac_ratio < (1. - self.e_clip), 860 | pac_ratio > (1. 
+ self.e_clip)) 861 | out_of_range_pac_ratio = torch.count_nonzero(out_of_range_pac_ratio) / actions.shape[0] 862 | 863 | self.writer.add_scalar("info_alpha/oor_pac_ratio", out_of_range_pac_ratio, self.epoch_num) 864 | 865 | # find out if current policy is better than old policy in terms of lr gradients; 866 | 867 | est_curr_performance = torch.sum(unnormalized_advantages * pac_ratio) - torch.sum(unnormalized_advantages) 868 | # est_curr_performance = torch.sum(advantages * pac_ratio) - torch.sum(advantages) 869 | 870 | n_est_curr_performance = self.est_curr_performace_rms.normalize(est_curr_performance) 871 | self.est_curr_performace_rms.update(est_curr_performance.unsqueeze(0)) 872 | 873 | self.writer.add_scalar("info_alpha/est_curr_performance", est_curr_performance, self.epoch_num) 874 | self.writer.add_scalar("info_alpha/est_curr_performance_n", n_est_curr_performance, self.epoch_num) 875 | 876 | # if current policy is too far from old policy or is worse than old policy, 877 | # decrease alpha; 878 | 879 | if out_of_range_pac_ratio > self.gi_max_dist_rp_lr or \ 880 | (est_curr_performance < 0 and n_est_curr_performance < -1.): 881 | 882 | self.next_alpha = self.gi_curr_alpha / self.gi_update_factor 883 | if self.gi_dynamic_alpha_scheduler in ['dynamic0', 'dynamic2']: 884 | self.next_actor_lr = self.actor_lr / self.gi_update_factor 885 | self.next_alpha = np.clip(self.next_alpha, self.gi_min_alpha, self.gi_max_alpha) 886 | 887 | dataset_dict['mu'] = n_mus 888 | dataset_dict['sigma'] = n_sigmas 889 | dataset_dict['logp_actions'] = n_neglogpacs 890 | 891 | self.dataset.update_values_dict(dataset_dict) 892 | 893 | if self.has_central_value: 894 | raise NotImplementedError() 895 | 896 | # def get_full_state_weights(self): 897 | 898 | # state = super().get_full_state_weights() 899 | 900 | # state['gi_actor'] = self.actor.state_dict() 901 | # state['gi_critic'] = self.critic.state_dict() 902 | # state['gi_target_critic'] = self.target_critic.state_dict() 903 | # if self.normalize_input: 904 | # state['gi_obs_rms'] = self.obs_rms 905 | # return state 906 | 907 | # def set_full_state_weights(self, weights): 908 | 909 | # super().set_full_state_weights(weights) 910 | 911 | # self.actor.load_state_dict(weights['gi_actor']) 912 | # self.critic.load_state_dict(weights['gi_critic']) 913 | # self.target_critic.load_state_dict(weights['gi_target_critic']) 914 | # if self.normalize_input: 915 | # self.obs_rms = weights['gi_obs_rms'].to(self.ppo_device) 916 | 917 | # def calc_gradients(self, input_dict): 918 | 919 | # # ================================================= 920 | 921 | # value_preds_batch = input_dict['old_values'] 922 | # advantage = input_dict['advantages'] 923 | # actions_batch = input_dict['actions'] 924 | # obs_batch = input_dict['obs'] 925 | 926 | # # these old mu and sigma are used to compute new policy's KL div from 927 | # # the old policy, which could be used to update learning rate later; 928 | # # it is not directly involved in policy updates; 929 | # old_mu_batch = input_dict['mu'] 930 | # old_sigma_batch = input_dict['sigma'] 931 | 932 | # if self.gi_algorithm == "grad-ppo-alpha": 933 | # old_action_log_probs_batch_0 = input_dict['old_logp_actions'] # action log probs before alpha update; 934 | # old_action_log_probs_batch_1 = input_dict['logp_actions'] # action log probs after alpha update; 935 | # else: 936 | # old_action_log_probs_batch = input_dict['old_logp_actions'] # original action log probs; 937 | 938 | # lr_mul = 1.0 939 | # curr_e_clip = lr_mul * self.e_clip 940 | 
941 | # if self.is_rnn: 942 | # raise NotImplementedError() 943 | 944 | # for param in self.actor.parameters(): 945 | # if torch.any(torch.isnan(param.data)) or torch.any(torch.isinf(param.data)): 946 | # print("Invalid param 1") 947 | # exit(-1) 948 | 949 | # # get current policy's actions; 950 | # curr_mu, curr_std = self.actor.forward_dist(obs_batch) 951 | # if curr_std.ndim == 1: 952 | # curr_std = curr_std.unsqueeze(0) 953 | # curr_std = curr_std.expand(curr_mu.shape[0], -1).clone() 954 | # neglogp = self.neglogp(actions_batch, curr_mu, curr_std, torch.log(curr_std)) 955 | 956 | # # min_std = float(1e-5) 957 | # # tmp_curr_std = curr_std 958 | # # while True: 959 | # # neglogp = self.neglogp(actions_batch, curr_mu, tmp_curr_std, torch.log(tmp_curr_std)) 960 | # # if torch.any(torch.isnan(neglogp)) or torch.any(torch.isinf(neglogp)): 961 | 962 | # # # isnan_ind = torch.isnan(neglogp) 963 | # # # isinf_ind = torch.isinf(neglogp) 964 | # # # # print(actions_batch[isnan_ind]) 965 | # # # # print(curr_mu[isnan_ind]) 966 | # # # # print(tmp_curr_std[isnan_ind]) 967 | 968 | # # # # print(actions_batch[isinf_ind]) 969 | # # # # print(curr_mu[isinf_ind]) 970 | # # # # print(tmp_curr_std[isinf_ind]) 971 | 972 | # # print(min_std) 973 | # # tmp_curr_std = torch.clamp(curr_std, min=min_std) 974 | # # min_std *= 2. 975 | # # exit(-1) 976 | # # else: 977 | # # break 978 | 979 | # if self.gi_algorithm == "grad-ppo-alpha": 980 | # a_loss = _grad_common_losses.actor_loss_alpha(old_action_log_probs_batch_0, 981 | # old_action_log_probs_batch_1, 982 | # neglogp, 983 | # advantage, 984 | # self.ppo, 985 | # curr_e_clip) 986 | # else: 987 | # a_loss = _grad_common_losses.actor_loss(old_action_log_probs_batch, 988 | # neglogp, 989 | # advantage, 990 | # self.ppo, 991 | # curr_e_clip) 992 | 993 | # c_loss = torch.zeros((1,), device=self.ppo_device) 994 | # b_loss = self.bound_loss(curr_mu) 995 | 996 | # # do not have entropy coef for now; 997 | # losses, sum_mask = torch_ext.apply_masks([a_loss.unsqueeze(1), b_loss.unsqueeze(1)], None) 998 | # a_loss, b_loss = losses[0], losses[1] 999 | 1000 | # entropy = torch.zeros((1,), device=self.ppo_device) 1001 | # assert self.entropy_coef == 0., "" 1002 | 1003 | # # we only use actor loss here for fair comparison; 1004 | # loss = a_loss 1005 | 1006 | # self.ppo_optimizer.zero_grad() 1007 | # if self.multi_gpu: 1008 | # raise NotImplementedError() 1009 | # else: 1010 | # for param in self.actor.parameters(): 1011 | # param.grad = None 1012 | 1013 | # loss.backward() 1014 | 1015 | # #TODO: Refactor this ugliest code of they year 1016 | # if self.truncate_grads: 1017 | # if self.multi_gpu: 1018 | # raise NotImplementedError() 1019 | # else: 1020 | # nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_norm) 1021 | # self.ppo_optimizer.step() 1022 | # else: 1023 | # self.ppo_optimizer.step() 1024 | 1025 | # for param in self.actor.parameters(): 1026 | # if torch.any(torch.isnan(param.data)) or torch.any(torch.isinf(param.data)): 1027 | 1028 | # print("Invalid param 2") 1029 | # print(loss) 1030 | # print(a_loss) 1031 | 1032 | # # print(_grad_common_losses.actor_loss_alpha(old_action_log_probs_batch_0, 1033 | # # old_action_log_probs_batch_1, 1034 | # # neglogp, 1035 | # # advantage, 1036 | # # self.ppo, 1037 | # # curr_e_clip)) 1038 | # exit(-1) 1039 | 1040 | # with torch.no_grad(): 1041 | # reduce_kl = not self.is_rnn 1042 | # kl_dist = torch_ext.policy_kl(curr_mu.detach(), curr_std.detach(), old_mu_batch, old_sigma_batch, reduce_kl) 1043 | # if self.is_rnn: 
1044 | # raise NotImplementedError() 1045 | 1046 | # self.train_result = (a_loss, c_loss, entropy, \ 1047 | # kl_dist, self.last_lr, lr_mul, \ 1048 | # curr_mu.detach(), curr_std.detach(), b_loss) 1049 | 1050 | # def update_lr(self, lr): 1051 | # if self.multi_gpu: 1052 | # lr_tensor = torch.tensor([lr]) 1053 | # self.hvd.broadcast_value(lr_tensor, 'learning_rate') 1054 | # lr = lr_tensor.item() 1055 | 1056 | # for param_group in self.ppo_optimizer.param_groups: 1057 | # param_group['lr'] = lr 1058 | 1059 | # def differentiate_grad_advantages(self, 1060 | # grad_actions: torch.Tensor, 1061 | # grad_advs: torch.Tensor, 1062 | # grad_start: torch.Tensor, 1063 | # debug: bool=False): 1064 | 1065 | # ''' 1066 | # Compute first-order gradients of [grad_advs] w.r.t. [grad_actions] using automatic differentiation. 1067 | # ''' 1068 | 1069 | # num_timestep = grad_start.shape[0] 1070 | # num_actor = grad_start.shape[1] 1071 | 1072 | # adv_sum: torch.Tensor = self.grad_advantages_first_terms_sum(grad_advs, grad_start) 1073 | 1074 | # # compute gradients; 1075 | 1076 | # # first-order gradient; 1077 | 1078 | # # adv_gradient = torch.autograd.grad(adv_sum, grad_actions, retain_graph=debug) 1079 | # # adv_gradient = torch.stack(adv_gradient) 1080 | 1081 | # for ga in grad_actions: 1082 | # ga.retain_grad() 1083 | # adv_sum.backward(retain_graph=debug) 1084 | # adv_gradient = [] 1085 | # for ga in grad_actions: 1086 | # adv_gradient.append(ga.grad) 1087 | # adv_gradient = torch.stack(adv_gradient) 1088 | 1089 | # # reweight grads; 1090 | 1091 | # with torch.no_grad(): 1092 | 1093 | # c = (1.0 / (self.gamma * self.tau)) 1094 | # cv = torch.ones((num_actor, 1), device=adv_gradient.device) 1095 | 1096 | # for nt in range(num_timestep): 1097 | 1098 | # # if new episode has been started, set [cv] to 1; 1099 | # for na in range(num_actor): 1100 | # if grad_start[nt, na]: 1101 | # cv[na, 0] = 1.0 1102 | 1103 | # adv_gradient[nt] = adv_gradient[nt] * cv 1104 | # cv = cv * c 1105 | 1106 | # if debug: 1107 | 1108 | # # compute gradients in brute force and compare; 1109 | # # this is to prove correctness of efficient computation of GAE-based advantage w.r.t. 
actions; 1110 | 1111 | # for i in range(num_timestep): 1112 | 1113 | # debug_adv_sum = grad_advs[i].sum() 1114 | 1115 | # debug_grad_adv_gradient = torch.autograd.grad(debug_adv_sum, grad_actions[i], retain_graph=True)[0] 1116 | # debug_grad_adv_gradient_norm = torch.norm(debug_grad_adv_gradient, p=2, dim=-1) 1117 | 1118 | # debug_grad_error = torch.norm(debug_grad_adv_gradient - adv_gradient[i], p=2, dim=-1) 1119 | # debug_grad_error_ratio = debug_grad_error / debug_grad_adv_gradient_norm 1120 | 1121 | # assert torch.all(debug_grad_error_ratio < 0.01), \ 1122 | # "Gradient of advantage possibly wrong" 1123 | 1124 | # adv_gradient = adv_gradient.detach() 1125 | 1126 | # return adv_gradient 1127 | 1128 | # def action_eps_jacobian(self, mu, sigma, eps): 1129 | 1130 | # jacobian = torch.zeros((eps.shape[0], eps.shape[1], eps.shape[1])) 1131 | 1132 | # for d in range(eps.shape[1]): 1133 | 1134 | # if sigma.ndim == 1: 1135 | # jacobian[:, d, d] = sigma[d].detach() 1136 | # elif sigma.ndim == 2: 1137 | # jacobian[:, d, d] = sigma[:, d].detach() 1138 | 1139 | # return jacobian 1140 | 1141 | # ''' 1142 | # distr = GradNormal(mu, sigma) 1143 | # eps.requires_grad = True 1144 | # actions = distr.eps_to_action(eps) 1145 | 1146 | # jacobian = torch.zeros((eps.shape[0], actions.shape[1], eps.shape[1])) 1147 | 1148 | # for d in range(actions.shape[1]): 1149 | # target = torch.sum(actions[:, d]) 1150 | # grad = torch.autograd.grad(target, eps, retain_graph=True) 1151 | # grad = torch.stack(grad) 1152 | # jacobian[:, d, :] = grad 1153 | 1154 | # return jacobian 1155 | # ''' 1156 | 1157 | def use_ppo(self): 1158 | ''' 1159 | Whether or not to use PPO. 1160 | ''' 1161 | raise NotImplementedError() 1162 | 1163 | ''' 1164 | Logging 1165 | ''' 1166 | def write_stats(self, total_time, epoch_num, step_time, no_ppo_time, ppo_time, ppo_loss, frame, scaled_time, scaled_play_time, curr_frames): 1167 | 1168 | mean_ppo_loss = th.tensor(ppo_loss).mean().item() 1169 | 1170 | self.writer.add_scalar('performance/step_inference_rl_update_fps', curr_frames / scaled_time, frame) 1171 | self.writer.add_scalar('performance/step_inference_fps', curr_frames / scaled_play_time, frame) 1172 | self.writer.add_scalar('performance/step_fps', curr_frames / step_time, frame) 1173 | self.writer.add_scalar('performance/no_ppo_time', no_ppo_time, frame) 1174 | self.writer.add_scalar('performance/ppo_time', ppo_time, frame) 1175 | self.writer.add_scalar('performance/step_time', step_time, frame) 1176 | self.writer.add_scalar('losses/ppo_loss', mean_ppo_loss, frame) 1177 | self.writer.add_scalar('info/epochs', epoch_num, frame) 1178 | 1179 | if self.use_ppo(): 1180 | self.writer.add_scalar('info/e_clip', self.e_clip, frame) 1181 | # self.algo_observer.after_print_stats(frame, epoch_num, total_time) 1182 | 1183 | def get_weights(self): 1184 | state = self.get_stats_weights() 1185 | state['actor'] = self.actor.state_dict() 1186 | state['critic'] = self.critic.state_dict() 1187 | return state 1188 | 1189 | def get_stats_weights(self): 1190 | state = {} 1191 | # if self.normalize_input: 1192 | # state['running_mean_std'] = self.running_mean_std.state_dict() 1193 | # if self.normalize_value: 1194 | # state['reward_mean_std'] = self.value_mean_std.state_dict() 1195 | return state 1196 | 1197 | def get_optimizers_state(self): 1198 | state = {} 1199 | state['critic'] = self.critic_optimizer.state_dict() 1200 | return state 1201 | 1202 | def get_full_state_weights(self): 1203 | state = self.get_weights() 1204 | state['epoch'] = self.epoch_num 1205 | 
state['optimizers'] = self.get_optimizers_state() 1206 | state['frame'] = self.frame 1207 | 1208 | # This is actually the best reward ever achieved. last_mean_rewards is perhaps not the best variable name 1209 | # We save it to the checkpoint to prevent overriding the "best ever" checkpoint upon experiment restart 1210 | state['last_mean_rewards'] = self.last_mean_rewards 1211 | 1212 | env_state = self.vec_env.get_env_state() 1213 | state['env_state'] = env_state 1214 | 1215 | return state 1216 | 1217 | def safe_filesystem_op(self, func, *args, **kwargs): 1218 | """ 1219 | This is to prevent spurious crashes related to saving checkpoints or restoring from checkpoints in a Network 1220 | Filesystem environment (i.e. NGC cloud or SLURM) 1221 | """ 1222 | num_attempts = 5 1223 | for attempt in range(num_attempts): 1224 | try: 1225 | return func(*args, **kwargs) 1226 | except Exception as exc: 1227 | print(f'Exception {exc} when trying to execute {func} with args:{args} and kwargs:{kwargs}...') 1228 | wait_sec = 2 ** attempt 1229 | print(f'Waiting {wait_sec} before trying again...') 1230 | time.sleep(wait_sec) 1231 | 1232 | raise RuntimeError(f'Could not execute {func}, give up after {num_attempts} attempts...') 1233 | 1234 | def safe_save(self, state, filename): 1235 | return self.safe_filesystem_op(th.save, state, filename) 1236 | 1237 | def save_checkpoint(self, filename, state): 1238 | print("=> saving checkpoint '{}'".format(filename + '.pth')) 1239 | self.safe_save(state, filename + '.pth') 1240 | 1241 | def save(self, fn): 1242 | state = self.get_full_state_weights() 1243 | self.save_checkpoint(fn, state) --------------------------------------------------------------------------------
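For reference, the generalized advantage estimation (GAE) recursion implemented in `RLAlgorithm.grad_advantages` above is summarized by the standalone sketch below. The helper name `gae_advantages` and the toy inputs in the `__main__` block are illustrative assumptions; `gamma` and `tau` play the same roles as the config parameters of the same names.

```python
# Standalone sketch of the GAE recursion used in RLAlgorithm.grad_advantages.
# Shapes: values / next_values / rewards / fdones are lists of length T holding
# tensors of shape (num_actors, 1); last_fdones has shape (num_actors,).
# The toy inputs below are assumptions purely for illustration.
import torch as th

def gae_advantages(values, next_values, rewards, fdones, last_fdones,
                   gamma=0.99, tau=0.95):
    T = len(values)
    advs = []
    lastgaelam = 0.0
    for t in reversed(range(T)):
        if t == T - 1:
            nextnonterminal = 1.0 - last_fdones
        else:
            nextnonterminal = 1.0 - fdones[t + 1]
        nextnonterminal = nextnonterminal.unsqueeze(1)
        # next_values[t] is assumed to be zeroed out on early termination, so
        # the one-step TD error does not gate the bootstrap term on
        # nextnonterminal (mirroring the reasoning in grad_advantages).
        delta = rewards[t] + gamma * next_values[t] - values[t]
        lastgaelam = delta + gamma * tau * nextnonterminal * lastgaelam
        advs.append(lastgaelam)
    advs.reverse()
    return advs

if __name__ == "__main__":
    T, N = 4, 3  # toy horizon length and number of actors
    values = [th.zeros(N, 1) for _ in range(T)]
    next_values = [th.zeros(N, 1) for _ in range(T)]
    rewards = [th.ones(N, 1) for _ in range(T)]
    fdones = [th.zeros(N) for _ in range(T)]
    last_fdones = th.zeros(N)
    advs = gae_advantages(values, next_values, rewards, fdones, last_fdones)
    print([a.squeeze(1).tolist() for a in advs])
```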