├── core ├── __init__.py ├── agents │ ├── heuristic │ │ ├── __init__.py │ │ └── load_balance │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── random_allocate.py │ │ │ ├── least_work.py │ │ │ ├── earliest_completion_time.py │ │ │ └── shortest_processing_time.py │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ ├── cnn_base.py │ │ ├── mlp_base.py │ │ ├── bncnn.py │ │ └── base.py │ └── pg.py ├── storage │ ├── __init__.py │ ├── lacie_storage.py │ └── base_storage.py ├── algorithms │ ├── lacie │ │ ├── __init__.py │ │ ├── lacie_a2c.py │ │ ├── base_lacie.py │ │ └── lacie_ppo.py │ ├── __init__.py │ ├── input_dependent_baseline │ │ ├── __init__.py │ │ ├── mib_a2c.py │ │ ├── mib_ppo.py │ │ └── base_meta_critic.py │ ├── base_algo.py │ ├── a2c_acktr.py │ ├── ppo.py │ └── kfac.py ├── envs │ ├── __init__.py │ ├── load_balance_wrappers.py │ └── park_envs.py ├── utils.py ├── distributions.py └── arguments.py ├── utils ├── __init__.py └── plot.py ├── _config.yml ├── assets └── logo.png ├── scripts ├── grid_search │ ├── ppo │ │ ├── lacie │ │ │ ├── run.sh │ │ │ ├── config_1.sh │ │ │ ├── config_2.sh │ │ │ ├── config_3.sh │ │ │ ├── config_6.sh │ │ │ ├── config_7.sh │ │ │ ├── config_4.sh │ │ │ └── config_5.sh │ │ └── vanilla │ │ │ ├── run.sh │ │ │ ├── config_7.sh │ │ │ ├── config_3.sh │ │ │ ├── config_4.sh │ │ │ ├── config_8.sh │ │ │ ├── config_2.sh │ │ │ ├── config_5.sh │ │ │ ├── config_1.sh │ │ │ └── config_6.sh │ └── a2c │ │ ├── vanilla │ │ ├── config_2.sh │ │ ├── config_1.sh │ │ ├── config_3.sh │ │ └── config_4.sh │ │ └── lacie │ │ ├── config_1.sh │ │ ├── config_2.sh │ │ ├── config_3.sh │ │ ├── config_4.sh │ │ ├── config_5.sh │ │ ├── config_6.sh │ │ └── config_7.sh ├── lacie_a2c_load_balance.sh └── lacie_ppo_load_balance.sh ├── README.md ├── .gitignore ├── evaluation.py ├── LICENSE └── main.py /core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /core/agents/heuristic/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /core/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from .pg import Policy 2 | -------------------------------------------------------------------------------- /core/storage/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_storage import RolloutStorage 2 | from .lacie_storage import LacieStorage 3 | -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lehduong/Job-Scheduling-with-Reinforcement-Learning/HEAD/assets/logo.png -------------------------------------------------------------------------------- /core/algorithms/lacie/__init__.py: -------------------------------------------------------------------------------- 1 | from .lacie_a2c import LACIE_A2C, LACIE_A2C_Memory 2 
| from .lacie_ppo import LACIE_PPO, LACIE_PPO_Memory 3 | -------------------------------------------------------------------------------- /core/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from .park_envs import make_env 2 | from .park_envs import make_vec_envs 3 | from .park_envs import PARK_ENV_LIST 4 | -------------------------------------------------------------------------------- /core/agents/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import NNBase 2 | from .cnn_base import CNNBase 3 | from .mlp_base import MLPBase 4 | from .bncnn import BNCNN 5 | -------------------------------------------------------------------------------- /core/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | from .a2c_acktr import A2C_ACKTR 2 | from .ppo import PPO 3 | from .input_dependent_baseline import MIB_A2C, MIB_PPO 4 | from .lacie import LACIE_A2C, LACIE_PPO, LACIE_A2C_Memory, LACIE_PPO_Memory 5 | -------------------------------------------------------------------------------- /core/agents/heuristic/load_balance/__init__.py: -------------------------------------------------------------------------------- 1 | from .least_work import LeastWorkAgent 2 | from .shortest_processing_time import ShortestProcessingTimeAgent 3 | from .random_allocate import RandomAllocateAgent 4 | from .earliest_completion_time import EarliestCompletionTimeAgent 5 | -------------------------------------------------------------------------------- /core/algorithms/input_dependent_baseline/__init__.py: -------------------------------------------------------------------------------- 1 | from core.algorithms.input_dependent_baseline.mib_a2c import MIB_A2C 2 | from core.algorithms.input_dependent_baseline.mib_ppo import MIB_PPO 3 | from core.algorithms.input_dependent_baseline.base_meta_critic import ActorMetaCriticAlgo 4 | -------------------------------------------------------------------------------- /core/agents/heuristic/load_balance/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | 4 | class HeuristicAgent(ABC): 5 | def __init__(self): 6 | pass 7 | 8 | def act(self, states): 9 | """ 10 | Give actions for given states 11 | :param states: torch tensor of shape num_envs x (num_servers+1) 12 | """ 13 | pass 14 | -------------------------------------------------------------------------------- /core/agents/heuristic/load_balance/random_allocate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .base import HeuristicAgent 4 | 5 | 6 | class RandomAllocateAgent(HeuristicAgent): 7 | def act(self, states): 8 | num_env = states.shape[0] 9 | num_servers = states.shape[1] - 2 10 | 11 | return torch.randint(0, num_servers, (num_env, 1)).to(states.device) 12 | -------------------------------------------------------------------------------- /scripts/grid_search/ppo/lacie/run.sh: -------------------------------------------------------------------------------- 1 | bash scripts/grid_search/ppo/lacie/config_1.sh && \ 2 | bash scripts/grid_search/ppo/lacie/config_2.sh && \ 3 | bash scripts/grid_search/ppo/lacie/config_3.sh && \ 4 | bash scripts/grid_search/ppo/lacie/config_4.sh && \ 5 | bash scripts/grid_search/ppo/lacie/config_5.sh && \ 6 | bash scripts/grid_search/ppo/lacie/config_6.sh && \ 7 | bash 
scripts/grid_search/ppo/lacie/config_7.sh -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/run.sh: -------------------------------------------------------------------------------- 1 | bash scripts/grid_search/ppo/vanilla/config_1.sh && \ 2 | bash scripts/grid_search/ppo/vanilla/config_2.sh && \ 3 | bash scripts/grid_search/ppo/vanilla/config_3.sh && \ 4 | bash scripts/grid_search/ppo/vanilla/config_4.sh && \ 5 | bash scripts/grid_search/ppo/vanilla/config_5.sh && \ 6 | bash scripts/grid_search/ppo/vanilla/config_6.sh && \ 7 | bash scripts/grid_search/ppo/vanilla/config_7.sh && \ 8 | bash scripts/grid_search/ppo/vanilla/config_8.sh -------------------------------------------------------------------------------- /core/agents/heuristic/load_balance/least_work.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from .base import HeuristicAgent 5 | 6 | 7 | class LeastWorkAgent(HeuristicAgent): 8 | def act(self, states): 9 | """ 10 | Give actions for given states 11 | :param states: torch tensor of shape num_envs x (num_servers+1) 12 | :return: np.array of shape num_env x 1 13 | """ 14 | return torch.argmin(states[:, :-2], dim=1, keepdims=True).to(states.device) 15 | -------------------------------------------------------------------------------- /core/agents/heuristic/load_balance/earliest_completion_time.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .shortest_processing_time import ShortestProcessingTimeAgent 4 | 5 | 6 | class EarliestCompletionTimeAgent(ShortestProcessingTimeAgent): 7 | def act(self, states): 8 | """ 9 | Give actions for given states 10 | :param states: torch tensor of shape num_envs x (num_servers+1) 11 | :return: np.array of shape num_env x 1 12 | """ 13 | processing_time = states[:, - 14 | 2].reshape(-1, 1) / self.service_rates.to(states.device) 15 | completion_time = states[:, :-2] + processing_time 16 | 17 | return torch.argmin(completion_time, dim=1, keepdims=True) 18 | -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/config_7.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo ppo \ 4 | --clip-param 0.3\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.00025\ 13 | --num-mini-batch 4\ 14 | --num-process 16 --num-steps 1000 --log-interval 5 \ 15 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 16 | --log-dir ppo_7 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/config_3.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo ppo \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0005\ 13 | --num-mini-batch 4\ 14 | 
--num-process 16 --num-steps 1000 --log-interval 5 \ 15 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 16 | --log-dir ppo_3 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/config_4.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo ppo \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0001\ 13 | --num-mini-batch 4\ 14 | --num-process 16 --num-steps 1000 --log-interval 5 \ 15 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 16 | --log-dir ppo_4 -------------------------------------------------------------------------------- /core/agents/heuristic/load_balance/shortest_processing_time.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from .base import HeuristicAgent 5 | 6 | 7 | class ShortestProcessingTimeAgent(HeuristicAgent): 8 | def __init__(self, service_rates=[0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95, 1.05]): 9 | self.service_rates = torch.tensor(service_rates).reshape(1, -1) 10 | 11 | def act(self, states): 12 | """ 13 | Give actions for given states 14 | :param states: torch tensor of shape num_envs x (num_servers+1) 15 | :return: np.array of shape num_env x 1 16 | """ 17 | processing_time = states[:, - 18 | 2].reshape(-1, 1) / self.service_rates.to(states.device) 19 | 20 | return torch.argmin(processing_time, dim=1, keepdims=True) 21 | -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/config_8.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo ppo \ 4 | --clip-param 0.2\ 5 | --ppo-epoch 8\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.000125\ 13 | --num-mini-batch 4\ 14 | --num-process 16 --num-steps 1000 --log-interval 5 \ 15 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 16 | --log-dir ppo_8 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/config_2.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo ppo \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 8\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.000125\ 13 | --num-mini-batch 4\ 14 | --num-process 16 --num-steps 1000 --log-interval 5 \ 15 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 16 | --log-dir ppo_2 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/config_5.sh: 
-------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo ppo \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.001\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.00025\ 13 | --num-mini-batch 4\ 14 | --num-process 16 --num-steps 1000 --log-interval 5 \ 15 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 16 | --log-dir ppo_5 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/config_1.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo ppo \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.00025\ 13 | --num-mini-batch 4\ 14 | --num-process 16 --num-steps 1000 --log-interval 5 \ 15 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 16 | --log-dir ppo_1 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/config_6.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo ppo \ 4 | --clip-param 0.2\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.00025\ 13 | --num-mini-batch 4\ 14 | --num-process 16 --num-steps 1000 --log-interval 5 \ 15 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 16 | --log-dir ppo_6 -------------------------------------------------------------------------------- /scripts/grid_search/a2c/vanilla/config_2.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo a2c \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.00075\ 11 | --num-mini-batch 32\ 12 | --adapt-lr 1e-3\ 13 | --num-inner-steps 5\ 14 | --lacie-batch-size 64\ 15 | --lacie-buffer-size 400\ 16 | --lacie-num-iter 40\ 17 | --num-process 16 --num-steps 1000 --log-interval 5 \ 18 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 19 | --log-dir a2c/vanilla/config_2 -------------------------------------------------------------------------------- /scripts/grid_search/a2c/vanilla/config_1.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo a2c \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 
0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.001\ 11 | --num-mini-batch 32\ 12 | --adapt-lr 1e-3\ 13 | --num-inner-steps 5\ 14 | --lacie-batch-size 64\ 15 | --lacie-buffer-size 400\ 16 | --lacie-num-iter 40\ 17 | --num-process 16 --num-steps 1000 --log-interval 5 \ 18 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 19 | --log-dir a2c/vanilla/config_1 20 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/vanilla/config_3.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo a2c \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.0005\ 11 | --num-mini-batch 32\ 12 | --adapt-lr 1e-3\ 13 | --num-inner-steps 5\ 14 | --lacie-batch-size 64\ 15 | --lacie-buffer-size 400\ 16 | --lacie-num-iter 40\ 17 | --num-process 16 --num-steps 1000 --log-interval 5 \ 18 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 19 | --log-dir a2c/vanilla/config_3 20 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/vanilla/config_4.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo a2c \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.002\ 11 | --num-mini-batch 32\ 12 | --adapt-lr 1e-3\ 13 | --num-inner-steps 5\ 14 | --lacie-batch-size 64\ 15 | --lacie-buffer-size 400\ 16 | --lacie-num-iter 40\ 17 | --num-process 16 --num-steps 1000 --log-interval 5 \ 18 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 19 | --log-dir a2c/vanilla/config_4 20 | -------------------------------------------------------------------------------- /scripts/lacie_a2c_load_balance.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_a2c_memory \ 4 | --num-env-steps 200000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.001\ 11 | --num-mini-batch 32\ 12 | --eval-interval 100\ 13 | --adapt-lr 1e-3\ 14 | --num-inner-steps 5\ 15 | --lacie-batch-size 64\ 16 | --lacie-buffer-size 400\ 17 | --lacie-num-iter 40\ 18 | --num-process 16 --num-steps 1000 --log-interval 5 \ 19 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 20 | --log-dir lacie_a2c 21 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/lacie/config_1.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_a2c_memory \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 
0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.001\ 11 | --cpc-lr 0.001\ 12 | --num-mini-batch 32\ 13 | --adapt-lr 1e-3\ 14 | --num-inner-steps 5\ 15 | --lacie-batch-size 64\ 16 | --lacie-buffer-size 400\ 17 | --lacie-num-iter 40\ 18 | --num-process 16 --num-steps 1000 --log-interval 5 \ 19 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 20 | --log-dir lacie_a2c_1 21 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/lacie/config_2.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_a2c_memory \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.00075\ 11 | --cpc-lr 0.001\ 12 | --num-mini-batch 32\ 13 | --adapt-lr 1e-3\ 14 | --num-inner-steps 5\ 15 | --lacie-batch-size 64\ 16 | --lacie-buffer-size 400\ 17 | --lacie-num-iter 40\ 18 | --num-process 16 --num-steps 1000 --log-interval 5 \ 19 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 20 | --log-dir lacie_a2c_2 21 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/lacie/config_3.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_a2c_memory \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.0005\ 11 | --cpc-lr 0.001\ 12 | --num-mini-batch 32\ 13 | --adapt-lr 1e-3\ 14 | --num-inner-steps 5\ 15 | --lacie-batch-size 64\ 16 | --lacie-buffer-size 400\ 17 | --lacie-num-iter 40\ 18 | --num-process 16 --num-steps 1000 --log-interval 5 \ 19 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 20 | --log-dir lacie_a2c_3 21 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/lacie/config_4.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_a2c_memory \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.002\ 11 | --cpc-lr 0.001\ 12 | --num-mini-batch 32\ 13 | --adapt-lr 1e-3\ 14 | --num-inner-steps 5\ 15 | --lacie-batch-size 64\ 16 | --lacie-buffer-size 400\ 17 | --lacie-num-iter 40\ 18 | --num-process 16 --num-steps 1000 --log-interval 5 \ 19 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 20 | --log-dir lacie_a2c_4 21 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/lacie/config_5.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_a2c_memory \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | 
--regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.001\ 11 | --cpc-lr 0.00075\ 12 | --num-mini-batch 32\ 13 | --adapt-lr 1e-3\ 14 | --num-inner-steps 5\ 15 | --lacie-batch-size 64\ 16 | --lacie-buffer-size 400\ 17 | --lacie-num-iter 40\ 18 | --num-process 16 --num-steps 1000 --log-interval 5 \ 19 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 20 | --log-dir lacie_a2c_5 21 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/lacie/config_6.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_a2c_memory \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.001\ 11 | --cpc-lr 0.002\ 12 | --num-mini-batch 32\ 13 | --adapt-lr 1e-3\ 14 | --num-inner-steps 5\ 15 | --lacie-batch-size 64\ 16 | --lacie-buffer-size 400\ 17 | --lacie-num-iter 40\ 18 | --num-process 16 --num-steps 1000 --log-interval 5 \ 19 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 20 | --log-dir lacie_a2c_6 21 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/lacie/config_7.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_a2c_memory \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.001\ 11 | --cpc-lr 0.0005\ 12 | --num-mini-batch 32\ 13 | --adapt-lr 1e-3\ 14 | --num-inner-steps 5\ 15 | --lacie-batch-size 64\ 16 | --lacie-buffer-size 400\ 17 | --lacie-num-iter 40\ 18 | --num-process 16 --num-steps 1000 --log-interval 5 \ 19 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 20 | --log-dir lacie_a2c_7 21 | -------------------------------------------------------------------------------- /scripts/grid_search/ppo/lacie/config_1.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_ppo_memory \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0001\ 13 | --cpc-lr 0.001\ 14 | --num-mini-batch 4\ 15 | --adapt-lr 1e-3\ 16 | --num-inner-steps 5\ 17 | --lacie-batch-size 64\ 18 | --lacie-buffer-size 400\ 19 | --lacie-num-iter 40\ 20 | --num-process 16 --num-steps 1000 --log-interval 5 \ 21 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 22 | --log-dir lacie_ppo_1 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/lacie/config_2.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 
\ 3 | --algo lacie_ppo_memory \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0001\ 13 | --cpc-lr 0.001\ 14 | --num-mini-batch 4\ 15 | --adapt-lr 1e-3\ 16 | --num-inner-steps 5\ 17 | --lacie-batch-size 32\ 18 | --lacie-buffer-size 400\ 19 | --lacie-num-iter 40\ 20 | --num-process 16 --num-steps 1000 --log-interval 5 \ 21 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 22 | --log-dir lacie_ppo_2 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/lacie/config_3.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_ppo_memory \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0001\ 13 | --cpc-lr 0.001\ 14 | --num-mini-batch 4\ 15 | --adapt-lr 1e-3\ 16 | --num-inner-steps 5\ 17 | --lacie-batch-size 16\ 18 | --lacie-buffer-size 400\ 19 | --lacie-num-iter 40\ 20 | --num-process 16 --num-steps 1000 --log-interval 5 \ 21 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 22 | --log-dir lacie_ppo_3 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/lacie/config_6.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_ppo_memory \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0001\ 13 | --cpc-lr 0.001\ 14 | --num-mini-batch 4\ 15 | --adapt-lr 1e-3\ 16 | --num-inner-steps 5\ 17 | --lacie-batch-size 16\ 18 | --lacie-buffer-size 400\ 19 | --lacie-num-iter 25\ 20 | --num-process 16 --num-steps 1000 --log-interval 5 \ 21 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 22 | --log-dir lacie_ppo_6 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/lacie/config_7.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_ppo_memory \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0001\ 13 | --cpc-lr 0.001\ 14 | --num-mini-batch 4\ 15 | --adapt-lr 1e-3\ 16 | --num-inner-steps 5\ 17 | --lacie-batch-size 16\ 18 | --lacie-buffer-size 400\ 19 | --lacie-num-iter 75\ 20 | --num-process 16 --num-steps 1000 --log-interval 5 \ 21 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 22 | --log-dir lacie_ppo_7 -------------------------------------------------------------------------------- 
/scripts/lacie_ppo_load_balance.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_ppo_memory \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 200000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.00025\ 13 | --num-mini-batch 4\ 14 | --eval-interval 100\ 15 | --adapt-lr 1e-3\ 16 | --num-inner-steps 5\ 17 | --lacie-batch-size 64\ 18 | --lacie-buffer-size 400\ 19 | --lacie-num-iter 40\ 20 | --num-process 16 --num-steps 1000 --log-interval 5 \ 21 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 22 | --log-dir lacie_ppo -------------------------------------------------------------------------------- /scripts/grid_search/ppo/lacie/config_4.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_ppo_memory \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0001\ 13 | --cpc-lr 0.0005\ 14 | --num-mini-batch 4\ 15 | --adapt-lr 1e-3\ 16 | --num-inner-steps 5\ 17 | --lacie-batch-size 32\ 18 | --lacie-buffer-size 400\ 19 | --lacie-num-iter 40\ 20 | --num-process 16 --num-steps 1000 --log-interval 5 \ 21 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 22 | --log-dir lacie_ppo_4 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/lacie/config_5.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_ppo_memory \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0001\ 13 | --cpc-lr 0.0001\ 14 | --num-mini-batch 4\ 15 | --adapt-lr 1e-3\ 16 | --num-inner-steps 5\ 17 | --lacie-batch-size 32\ 18 | --lacie-buffer-size 400\ 19 | --lacie-num-iter 40\ 20 | --num-process 16 --num-steps 1000 --log-interval 5 \ 21 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 22 | --log-dir lacie_ppo_5 -------------------------------------------------------------------------------- /core/agents/models/cnn_base.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from core.agents.models.base import NNBase, Flatten 3 | from core.utils import init 4 | 5 | class CNNBase(NNBase): 6 | def __init__(self, num_inputs, recurrent=False, hidden_size=512): 7 | super(CNNBase, self).__init__(recurrent, hidden_size, hidden_size) 8 | 9 | init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init. 
10 | constant_(x, 0), nn.init.calculate_gain('relu')) 11 | 12 | self.main = nn.Sequential( 13 | init_(nn.Conv2d(num_inputs, 32, 8, stride=4)), nn.ReLU(), 14 | init_(nn.Conv2d(32, 64, 4, stride=2)), nn.ReLU(), 15 | init_(nn.Conv2d(64, 32, 3, stride=1)), nn.ReLU(), Flatten(), 16 | init_(nn.Linear(32 * 7 * 7, hidden_size)), nn.ReLU()) 17 | 18 | init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init. 19 | constant_(x, 0)) 20 | 21 | self.critic_linear = init_(nn.Linear(hidden_size, 1)) 22 | 23 | self.train() 24 | 25 | def forward(self, inputs, rnn_hxs, masks): 26 | x = self.main(inputs / 255.0) 27 | 28 | if self.is_recurrent: 29 | x, rnn_hxs = self._forward_gru(x, rnn_hxs, masks) 30 | 31 | return self.critic_linear(x), x, rnn_hxs 32 | 33 | -------------------------------------------------------------------------------- /core/agents/models/mlp_base.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch import nn 3 | from core.agents.models.base import NNBase 4 | from core.utils import init 5 | 6 | 7 | class MLPBase(NNBase): 8 | def __init__(self, num_inputs, recurrent=False, hidden_size=64): 9 | super(MLPBase, self).__init__(recurrent, num_inputs, hidden_size) 10 | 11 | def init_(m): return init(m, nn.init.orthogonal_, lambda x: nn.init. 12 | constant_(x, 0), np.sqrt(2)) 13 | 14 | if recurrent: 15 | num_inputs = hidden_size 16 | 17 | self.actor = nn.Sequential( 18 | init_(nn.Linear(num_inputs, hidden_size)), nn.Tanh(), 19 | init_(nn.Linear(hidden_size, hidden_size)), nn.Tanh(), 20 | ) 21 | 22 | self.critic = nn.Sequential( 23 | init_(nn.Linear(num_inputs, hidden_size)), nn.Tanh(), 24 | init_(nn.Linear(hidden_size, hidden_size)), nn.Tanh(), 25 | init_(nn.Linear(hidden_size, hidden_size)), nn.Tanh(), 26 | init_(nn.Linear(hidden_size, 1))) 27 | 28 | self.train() 29 | 30 | def forward(self, inputs, rnn_hxs, masks): 31 | x = inputs 32 | 33 | if self.is_recurrent: 34 | x, rnn_hxs = self._forward_gru(x, rnn_hxs, masks) 35 | 36 | value = self.critic(x) 37 | hidden_actor = self.actor(x) 38 | 39 | return value, hidden_actor, rnn_hxs 40 | -------------------------------------------------------------------------------- /core/algorithms/base_algo.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from torch import optim, nn 3 | 4 | import torch 5 | 6 | 7 | class BaseAlgo(ABC): 8 | IL_DECAY_RATE = 0.995 # decay factor of imitation learning 9 | ENTROPY_DECAY_RATE = 0.999 10 | MIN_ENTROPY_COEF = 0.0001 11 | 12 | def __init__(self, 13 | actor_critic, 14 | lr, 15 | value_coef, 16 | entropy_coef, 17 | expert=None, 18 | il_coef=1): 19 | self.actor_critic = actor_critic 20 | self.optimizer = optim.Adam(actor_critic.parameters(), lr) 21 | 22 | self.value_coef = value_coef 23 | self.entropy_coef = entropy_coef 24 | 25 | self.il_coef = il_coef 26 | self.expert = expert 27 | self.il_criterion = nn.CrossEntropyLoss() 28 | 29 | def update(self, rollouts): 30 | pass 31 | 32 | def imitation_learning(self, inputs, rnn_hxs, masks, expert): 33 | """ 34 | Imitation learning loss 35 | 36 | :param inputs: state observations 37 | 38 | :param rnn_hxs: rnn hidden state 39 | 40 | :param masks: mask the final state with 0 value 41 | 42 | :param expert: a trained or heuristic agent 43 | 44 | :return: log probability of expert's actions 45 | """ 46 | _, actor_features, _ = self.actor_critic.base(inputs, rnn_hxs, masks) 47 | dist = self.actor_critic.dist(actor_features) 48 | 49 | expert_actions = 
expert.act(inputs) 50 | 51 | il_loss = self.il_criterion(dist.probs, expert_actions.reshape(-1)) 52 | accuracy = (torch.argmax(dist.probs, dim=1) == 53 | expert_actions.reshape(-1)).float().sum()/expert_actions.shape[0] 54 | 55 | return il_loss, accuracy 56 | 57 | def after_update(self): 58 | self.il_coef *= self.IL_DECAY_RATE 59 | self.entropy_coef = max( 60 | self.entropy_coef * self.ENTROPY_DECAY_RATE, self.MIN_ENTROPY_COEF) 61 | -------------------------------------------------------------------------------- /core/utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import json 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | from core.envs.park_envs import VecNormalize 9 | 10 | 11 | # Get a render function 12 | def get_render_func(venv): 13 | if hasattr(venv, 'envs'): 14 | return venv.envs[0].render 15 | elif hasattr(venv, 'venv'): 16 | return get_render_func(venv.venv) 17 | elif hasattr(venv, 'env'): 18 | return get_render_func(venv.env) 19 | 20 | return None 21 | 22 | 23 | def get_vec_normalize(venv): 24 | if isinstance(venv, VecNormalize): 25 | return venv 26 | elif hasattr(venv, 'venv'): 27 | return get_vec_normalize(venv.venv) 28 | 29 | return None 30 | 31 | 32 | # Necessary for my KFAC implementation. 33 | class AddBias(nn.Module): 34 | def __init__(self, bias): 35 | super(AddBias, self).__init__() 36 | self._bias = nn.Parameter(bias.unsqueeze(1)) 37 | 38 | def forward(self, x): 39 | if x.dim() == 2: 40 | bias = self._bias.t().view(1, -1) 41 | else: 42 | bias = self._bias.t().view(1, -1, 1, 1) 43 | 44 | return x + bias 45 | 46 | 47 | def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr): 48 | """Decreases the learning rate linearly""" 49 | lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs))) 50 | for param_group in optimizer.param_groups: 51 | param_group['lr'] = lr 52 | return lr 53 | 54 | 55 | def init(module, weight_init, bias_init, gain=1): 56 | weight_init(module.weight.data, gain=gain) 57 | if hasattr(module, 'bias') and (not module.bias is None): 58 | bias_init(module.bias.data) 59 | return module 60 | 61 | 62 | def cleanup_log_dir(log_dir): 63 | try: 64 | os.makedirs(log_dir) 65 | except OSError: 66 | files = glob.glob(os.path.join(log_dir, '*.monitor.csv')) 67 | for f in files: 68 | os.remove(f) 69 | 70 | 71 | def dump_config(argparser, output_path): 72 | with open(output_path, 'w') as f: 73 | json.dump(argparser.__dict__, f, indent=2) 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Learning to Assign Credit in Input-driven Environment (LACIE) reduces the variance of advantage estimates in noisy MDPs by using a hindsight distribution. 2 | 3 | ## Input-driven MDP 4 | Input-driven MDPs are Markov decision processes governed not only by the agent's actions but also by stochastic, exogenous input processes [1]. These environments are inherently high-variance, which makes it hard to learn an optimal policy. 5 | 6 | This repository implements: 7 | 8 | + Input-dependent baselines, as proposed in [1]. 9 | 10 | + **Lacie** - an algorithm that learns to weight the advantages of each rollout in hindsight with respect to future input sequences. 11 | 12 | ## Install Dependencies 13 | 14 | 1. Install Pytorch 15 | 16 | ```bash 17 | pip install torch torchvision 18 | ``` 19 | 20 | 2.
Install Tensorflow 2 21 | 22 | ```bash 23 | pip install tensorflow==2.2 24 | ``` 25 | or 26 | ```bash 27 | pip install tensorflow-gpu==2.2 28 | ``` 29 | 30 | 3. Install [OpenAI baseline](https://github.com/openai/baselines/tree/tf2) (Tensorflow 2 version) 31 | ```bash 32 | git clone https://github.com/openai/baselines.git -b tf2 && \ 33 | cd baselines && \ 34 | pip install -e . 35 | ``` 36 | 37 | **Note**: I haven't tested the code on Tensorflow 1 yet, but it should work as well. 38 | 39 | 4. Install [Park Platform](https://github.com/park-project/park). I modified the platform slightly to make it compatible with OpenAI's baseline. 40 | ```bash 41 | git clone https://github.com/lehduong/park &&\ 42 | cd park && \ 43 | pip install -e . 44 | ``` 45 | 46 | ## Run experiments 47 | See `scripts` for examples. 48 | 49 | ## Results 50 | **Reward** of A2C+Lacie (yellow) vs A2C (blue) 51 | ![reward](assets/reward.svg) 52 | 53 | **Value loss** of A2C+Lacie (yellow) vs A2C (blue) during training: 54 | ![train-value-loss](assets/train_value_loss.svg) 55 | 56 | ## Reference 57 | 58 | [1] [Variance Reduction for Reinforcement Learning in Input-Driven Environments](https://openreview.net/forum?id=Hyg1G2AqtQ). 59 | 60 | ## Acknowledgement 61 | The starter code is based on [ikostrikov's repository](https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail). 62 | 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | ex/ 131 | ex_eval/ 132 | trained_models/ 133 | .DS_Store 134 | .idea/ 135 | .vscode/ 136 | logs/ -------------------------------------------------------------------------------- /core/agents/models/bncnn.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from core.agents.models.base import NNBase, Flatten 3 | from core.utils import init 4 | 5 | class BNCNN(NNBase): 6 | def __init__(self, num_inputs, recurrent=False, hidden_size=512): 7 | super().__init__(recurrent, hidden_size, hidden_size) 8 | 9 | # shared weight that encode state to vector 10 | self.main = nn.Sequential( 11 | self.init_weight(nn.Conv2d(num_inputs, 32, 4, stride=2, bias=False)), 12 | self.init_weight(nn.BatchNorm2d(32)), # 32x41x41 13 | nn.ReLU(), 14 | self.init_weight(nn.Conv2d(32, 64, 5, stride=2, bias=False)), 15 | self.init_weight(nn.BatchNorm2d(64)), # 64x19x19 16 | nn.ReLU(), 17 | self.init_weight(nn.Conv2d(64, 128, 3, stride=2, bias=False)), 18 | self.init_weight(nn.BatchNorm2d(128)), # 128x9x9 19 | nn.ReLU(), 20 | self.init_weight(nn.Conv2d(128, 256, 5, stride=2, bias=False)), 21 | self.init_weight(nn.BatchNorm2d(256)), 22 | nn.ReLU(), 23 | Flatten(), # 256x3x3 24 | self.init_weight(nn.Linear(256*3*3, hidden_size)), 25 | nn.ReLU()) 26 | 27 | self.critic_linear = nn.Sequential( 28 | self.init_weight(nn.Linear(hidden_size, hidden_size)), 29 | nn.ReLU(), 30 | self.init_weight(nn.Linear(hidden_size, hidden_size)), 31 | nn.ReLU(), 32 | self.init_weight(nn.Linear(hidden_size, 1)) 33 | ) 34 | 35 | # encoder for learning contrastive predictive objective 36 | self.contrastive_encoder = nn.Sequential( 37 | self.init_weight(nn.Linear(hidden_size, hidden_size)), 38 | nn.ReLU(), 39 | self.init_weight(nn.Linear(hidden_size, hidden_size)), 40 | nn.ReLU(), 41 | self.init_weight(nn.Linear(hidden_size, hidden_size)) 42 | ) 43 | self.train() 44 | 45 | def init_weight(self, layer): 46 | init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init. 
47 | constant_(x, 0), nn.init.calculate_gain('relu')) 48 | if isinstance(layer, nn.Conv2d) or isinstance(layer, nn.Linear): 49 | return init_(layer) 50 | elif isinstance(layer, nn.BatchNorm2d): 51 | layer.weight.data.fill_(1) 52 | if hasattr(layer, 'bias'): 53 | layer.bias.data.zero_() 54 | return layer 55 | 56 | def forward(self, inputs, rnn_hxs, masks): 57 | x = self.main(inputs / 255.0) 58 | 59 | if self.is_recurrent: 60 | x, rnn_hxs = self._forward_gru(x, rnn_hxs, masks) 61 | 62 | return self.critic_linear(x), self.contrastive_encoder(x), rnn_hxs 63 | -------------------------------------------------------------------------------- /core/agents/pg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | from core.distributions import Bernoulli, Categorical, DiagGaussian 6 | from core.utils import init 7 | from core.agents.models import CNNBase, MLPBase 8 | 9 | 10 | class Policy(nn.Module): 11 | def __init__(self, obs_shape, action_space, base=None, base_kwargs=None): 12 | super(Policy, self).__init__() 13 | # create base network for acting 14 | if base_kwargs is None: 15 | base_kwargs = {} 16 | if base is None: 17 | if len(obs_shape) == 3: 18 | base = CNNBase 19 | elif len(obs_shape) == 1: 20 | base = MLPBase 21 | else: 22 | raise NotImplementedError( 23 | "Expect the observation's shape to be either 3 or 1 but got {}".format(len(obs_shape))) 24 | 25 | self.base = base(obs_shape[0], **base_kwargs) 26 | 27 | # action sampling mechanism 28 | if action_space.__class__.__name__ == "Discrete": 29 | num_outputs = action_space.n 30 | self.dist = Categorical(self.base.output_size, num_outputs) 31 | elif action_space.__class__.__name__ == "Box": 32 | num_outputs = action_space.shape[0] 33 | self.dist = DiagGaussian(self.base.output_size, num_outputs) 34 | elif action_space.__class__.__name__ == "MultiBinary": 35 | num_outputs = action_space.shape[0] 36 | self.dist = Bernoulli(self.base.output_size, num_outputs) 37 | else: 38 | raise NotImplementedError 39 | 40 | self.obs_shape = obs_shape 41 | self.action_space = action_space 42 | 43 | @property 44 | def is_recurrent(self): 45 | return self.base.is_recurrent 46 | 47 | @property 48 | def recurrent_hidden_state_size(self): 49 | """Size of rnn_hx.""" 50 | return self.base.recurrent_hidden_state_size 51 | 52 | def forward(self, inputs, rnn_hxs, masks): 53 | raise NotImplementedError 54 | 55 | def act(self, inputs, rnn_hxs, masks, deterministic=False): 56 | value, actor_features, rnn_hxs = self.base(inputs, rnn_hxs, masks) 57 | dist = self.dist(actor_features) 58 | 59 | # if deterministic greedily choose the most optimal solution otherwise sampling with probability proportional to cummulate reward 60 | if deterministic: 61 | action = dist.mode() 62 | else: 63 | action = dist.sample() 64 | 65 | action_log_probs = dist.log_probs(action) 66 | dist_entropy = dist.entropy().mean() 67 | 68 | return value, action, action_log_probs, rnn_hxs 69 | 70 | def get_value(self, inputs, rnn_hxs, masks): 71 | value, _, _ = self.base(inputs, rnn_hxs, masks) 72 | return value 73 | 74 | def evaluate_actions(self, inputs, rnn_hxs, masks, action): 75 | value, actor_features, rnn_hxs = self.base(inputs, rnn_hxs, masks) 76 | dist = self.dist(actor_features) 77 | 78 | action_log_probs = dist.log_probs(action) 79 | dist_entropy = dist.entropy().mean() 80 | 81 | return value, action_log_probs, dist_entropy, rnn_hxs 82 | 
-------------------------------------------------------------------------------- /core/envs/load_balance_wrappers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | import numpy as np 4 | 5 | 6 | class ProcessLoadBalanceObservation(gym.ObservationWrapper): 7 | """ 8 | Normalize and clip the observation of LoadBalance environment 9 | :param job_size_norm_factor: float - divide job_size by this factor 10 | :param highest_server_obs: float - clip the server (in observation) having load higher than this value 11 | :param highest_job_obs: float - clip the job (in observation) having size greater than this value 12 | """ 13 | 14 | def __init__(self, 15 | env, 16 | job_size_norm_factor, 17 | server_load_norm_factor, 18 | highest_server_obs, 19 | highest_job_obs, 20 | elapsed_time_norm_factor, 21 | highest_elapsed_time): 22 | super().__init__(env) 23 | self.job_size_norm_factor = job_size_norm_factor 24 | self.server_load_norm_factor = server_load_norm_factor 25 | self.elapsed_time_norm_factor = elapsed_time_norm_factor 26 | self.highest_server_obs = highest_server_obs 27 | self.highest_job_obs = highest_job_obs 28 | self.highest_elapsed_time = highest_elapsed_time 29 | 30 | # compute clip threshold 31 | num_server = len(env.servers) 32 | self.threshold = np.array( 33 | [self.highest_server_obs] * num_server + 34 | [self.highest_job_obs] + 35 | [self.highest_elapsed_time]) 36 | # compute the normalize vector 37 | self.norm_vec = np.array( 38 | [self.server_load_norm_factor] * num_server + 39 | [self.job_size_norm_factor] + 40 | [self.elapsed_time_norm_factor]) 41 | 42 | def observation(self, observation): 43 | # normalized 44 | observation = observation/self.norm_vec 45 | return np.minimum(observation, self.threshold) 46 | 47 | 48 | class LoadBalanceRandomReset(gym.Wrapper): 49 | def __init__(self, env, max_random_steps=50): 50 | """Sample initial states by taking random number of no-ops on reset. 
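:param max_random_steps: exclusive upper bound on the number of random warm-up actions (uniform over servers) taken after each reset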
51 | """ 52 | super().__init__(env) 53 | self.max_random_steps = max_random_steps 54 | 55 | def reset(self, **kwargs): 56 | """ Do no-op action for a number of steps in [1, noop_max].""" 57 | obs = self.env.reset(**kwargs) 58 | 59 | # stochastically change number of random steps each time resetting the env 60 | num_random_steps = np.random.randint(0, self.max_random_steps) 61 | 62 | for _ in range(num_random_steps): 63 | obs, _, done, _ = self.env.step( 64 | random.randint(0, len(self.env.servers)-1)) 65 | if done: 66 | obs = self.env.reset(**kwargs) 67 | return obs 68 | 69 | 70 | class RewardNormalize(gym.RewardWrapper): 71 | """ 72 | Divide the reward by a fixed value 73 | """ 74 | 75 | def __init__(self, env, norm_factor): 76 | super().__init__(env) 77 | self.norm_factor = norm_factor 78 | 79 | def reward(self, reward): 80 | return reward/self.norm_factor 81 | 82 | 83 | class FixJobSequence(gym.Wrapper): 84 | """ 85 | Set the random seed of environment to a fixed value every time it reset\ 86 | thus, the job arrival sequence would be unchanged 87 | """ 88 | 89 | def __init__(self, env, seed=0): 90 | super().__init__(env) 91 | self.random_seed = seed 92 | 93 | def reset(self): 94 | self.env.seed(self.random_seed) 95 | return self.env.reset() 96 | -------------------------------------------------------------------------------- /core/algorithms/input_dependent_baseline/mib_a2c.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .base_meta_critic import ActorMetaCriticAlgo 5 | 6 | DECAY_RATE = 0.995 7 | 8 | 9 | class MIB_A2C(ActorMetaCriticAlgo): 10 | """ 11 | Meta Input-dependent Baseline A2C. \ 12 | This A2C class leverages input-dependent baseline, which is learned with meta learning, \ 13 | to reduce variance when updating parameters 14 | """ 15 | 16 | def __init__(self, 17 | actor_critic, 18 | entropy_coef, 19 | lr=1e-3, 20 | adapt_lr=1e-3, 21 | num_inner_steps=5, 22 | max_grad_norm=None, 23 | expert=None, 24 | il_coef=10): 25 | super().__init__(actor_critic=actor_critic, 26 | lr=lr, 27 | adapt_lr=adapt_lr, 28 | num_inner_steps=num_inner_steps) 29 | self.entropy_coef = entropy_coef 30 | self.max_grad_norm = max_grad_norm 31 | 32 | self.expert = expert 33 | self.il_coef = il_coef 34 | 35 | def update(self, rollouts): 36 | obs_shape = rollouts.obs.size()[2:] 37 | action_shape = rollouts.actions.size()[-1] 38 | num_steps, num_processes, _ = rollouts.rewards.size() 39 | 40 | # action loss + entropy loss 41 | values, value_loss = self.train_meta_critic_and_predict_values( 42 | rollouts) 43 | _, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 44 | rollouts.obs[:-1].view(-1, *obs_shape), 45 | rollouts.recurrent_hidden_states[0].view( 46 | -1, self.actor_critic.recurrent_hidden_state_size), 47 | rollouts.masks[:-1].view(-1, 1), 48 | rollouts.actions.view(-1, action_shape)) 49 | 50 | values = values.view(num_steps, num_processes, 1) 51 | action_log_probs = action_log_probs.view(num_steps, num_processes, 1) 52 | 53 | advantages = rollouts.returns[:-1] - values 54 | # Normalize advantages? 
55 | advantages = (advantages - advantages.mean())/(advantages.std() + 1e-5) 56 | 57 | action_loss = -(advantages.detach() * action_log_probs).mean() 58 | 59 | # imitation learning 60 | imitation_loss, accuracy = torch.tensor(0).to(rollouts.obs.device), 0 61 | if self.expert: 62 | imitation_loss, accuracy = self.imitation_learning( 63 | rollouts.obs[:-1].view(-1, *obs_shape), 64 | rollouts.recurrent_hidden_states[0].view( 65 | -1, self.actor_critic.recurrent_hidden_state_size), 66 | rollouts.masks[:-1].view(-1, 1), 67 | self.expert) 68 | # ----------------------------------------------------- 69 | 70 | self.optimizer.zero_grad() 71 | 72 | # total loss 73 | loss = action_loss + self.il_coef * \ 74 | imitation_loss - self.entropy_coef * dist_entropy 75 | loss.backward() 76 | 77 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 78 | self.max_grad_norm) 79 | 80 | self.optimizer.step() 81 | 82 | # reduce the weight of imitation learning during training process 83 | self.il_coef = self.il_coef * DECAY_RATE 84 | 85 | return { 86 | 'value loss': value_loss, 87 | 'action loss': action_loss.item(), 88 | 'entropy loss': dist_entropy.item(), 89 | 'imitation loss': imitation_loss.item(), 90 | 'accuracy': accuracy 91 | } 92 | -------------------------------------------------------------------------------- /core/distributions.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from core.utils import AddBias, init 8 | 9 | """ 10 | Modify standard PyTorch distributions so they are compatible with this code. 11 | """ 12 | 13 | # 14 | # Standardize distribution interfaces 15 | # 16 | 17 | # Categorical 18 | 19 | 20 | class FixedCategorical(torch.distributions.Categorical): 21 | def sample(self): 22 | return super().sample().unsqueeze(-1) 23 | 24 | def log_probs(self, actions): 25 | return ( 26 | super() 27 | .log_prob(actions.squeeze(-1)) 28 | .view(actions.size(0), -1) 29 | .sum(-1) 30 | .unsqueeze(-1) 31 | ) 32 | 33 | def mode(self): 34 | return self.probs.argmax(dim=-1, keepdim=True) 35 | 36 | 37 | # Normal 38 | class FixedNormal(torch.distributions.Normal): 39 | def log_probs(self, actions): 40 | return super().log_prob(actions).sum(-1, keepdim=True) 41 | 42 | def entropy(self): 43 | return super().entropy().sum(-1) 44 | 45 | def mode(self): 46 | return self.mean 47 | 48 | 49 | # Bernoulli 50 | class FixedBernoulli(torch.distributions.Bernoulli): 51 | def log_probs(self, actions): 52 | return super().log_prob(actions).view(actions.size(0), -1).sum(-1).unsqueeze(-1) 53 | 54 | def entropy(self): 55 | return super().entropy().sum(-1) 56 | 57 | def mode(self): 58 | return torch.gt(self.probs, 0.5).float() 59 | 60 | 61 | class Categorical(nn.Module): 62 | def __init__(self, num_inputs, num_outputs): 63 | super(Categorical, self).__init__() 64 | 65 | def init_(m): return init( 66 | m, 67 | nn.init.orthogonal_, 68 | lambda x: nn.init.constant_(x, 0), 69 | gain=0.01) 70 | 71 | self.linear = nn.Sequential( 72 | init_(nn.Linear(num_inputs, num_outputs)) 73 | ) 74 | 75 | # hack 76 | # The weight of last layer will be initialized 100 times smaller than regular layer 77 | # according to Sec 3.2 https://arxiv.org/abs/2006.05990 78 | self.linear[0].weight.data = self.linear[0].weight.data*0.01 79 | 80 | def forward(self, x): 81 | x = self.linear(x) 82 | return FixedCategorical(logits=x) 83 | 84 | 85 | class DiagGaussian(nn.Module): 86 | def __init__(self, num_inputs,
num_outputs): 87 | super(DiagGaussian, self).__init__() 88 | 89 | def init_(m): return init(m, nn.init.orthogonal_, lambda x: nn.init. 90 | constant_(x, 0)) 91 | 92 | self.fc_mean = init_(nn.Linear(num_inputs, num_outputs)) 93 | self.logstd = AddBias(torch.zeros(num_outputs)) 94 | 95 | def forward(self, x): 96 | action_mean = self.fc_mean(x) 97 | 98 | # An ugly hack for my KFAC implementation. 99 | zeros = torch.zeros(action_mean.size()) 100 | if x.is_cuda: 101 | zeros = zeros.cuda() 102 | 103 | action_logstd = self.logstd(zeros) 104 | return FixedNormal(action_mean, action_logstd.exp()) 105 | 106 | 107 | class Bernoulli(nn.Module): 108 | def __init__(self, num_inputs, num_outputs): 109 | super(Bernoulli, self).__init__() 110 | 111 | def init_(m): return init(m, nn.init.orthogonal_, lambda x: nn.init. 112 | constant_(x, 0)) 113 | 114 | self.linear = init_(nn.Linear(num_inputs, num_outputs)) 115 | 116 | def forward(self, x): 117 | x = self.linear(x) 118 | return FixedBernoulli(logits=x) 119 | -------------------------------------------------------------------------------- /core/agents/models/base.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | class Flatten(nn.Module): 5 | def forward(self, x): 6 | return x.view(x.size(0), -1) 7 | 8 | 9 | class NNBase(nn.Module): 10 | def __init__(self, recurrent, recurrent_input_size, hidden_size): 11 | super(NNBase, self).__init__() 12 | 13 | self._hidden_size = hidden_size 14 | self._recurrent = recurrent 15 | 16 | if recurrent: 17 | self.gru = nn.GRU(recurrent_input_size, hidden_size) 18 | 19 | @property 20 | def is_recurrent(self): 21 | return self._recurrent 22 | 23 | @property 24 | def recurrent_hidden_state_size(self): 25 | if self._recurrent: 26 | return self._hidden_size 27 | return 1 28 | 29 | @property 30 | def output_size(self): 31 | return self._hidden_size 32 | 33 | def _forward_gru(self, x, hxs, masks): 34 | if x.size(0) == hxs.size(0): 35 | x, hxs = self.gru(x.unsqueeze(0), (hxs * masks).unsqueeze(0)) 36 | x = x.squeeze(0) 37 | hxs = hxs.squeeze(0) 38 | else: 39 | # x is a (T, N, -1) tensor that has been flatten to (T * N, -1) 40 | N = hxs.size(0) 41 | T = int(x.size(0) / N) 42 | 43 | # unflatten 44 | x = x.view(T, N, x.size(1)) 45 | 46 | # Same deal with masks 47 | masks = masks.view(T, N) 48 | 49 | # Let's figure out which steps in the sequence have a zero for any agent 50 | # We will always assume t=0 has a zero in it as that makes the logic cleaner 51 | has_zeros = ((masks[1:] == 0.0) \ 52 | .any(dim=-1) 53 | .nonzero() 54 | .squeeze() 55 | .cpu()) 56 | 57 | # +1 to correct the masks[1:] 58 | if has_zeros.dim() == 0: 59 | # Deal with scalar 60 | has_zeros = [has_zeros.item() + 1] 61 | else: 62 | has_zeros = (has_zeros + 1).numpy().tolist() 63 | 64 | # add t=0 and t=T to the list 65 | has_zeros = [0] + has_zeros + [T] 66 | 67 | hxs = hxs.unsqueeze(0) 68 | outputs = [] 69 | self.gru.flatten_parameters() 70 | for i in range(len(has_zeros) - 1): 71 | # We can now process steps that don't have any zeros in masks together! 
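                # Illustrative example: with T = 5 and an episode boundary at t = 3 in some
                # process, has_zeros = [0, 3, 5]; the GRU is then called once for steps 0..2
                # and once for steps 3..4, with the hidden state multiplied by masks[3]
                # (zeros at the boundary) before the second segment, so no recurrent state
                # leaks across episodes.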
72 | # This is much faster 73 | start_idx = has_zeros[i] 74 | end_idx = has_zeros[i + 1] 75 | 76 | rnn_scores, hxs = self.gru( 77 | x[start_idx:end_idx], 78 | hxs * masks[start_idx].view(1, -1, 1)) 79 | 80 | outputs.append(rnn_scores) 81 | 82 | # assert len(outputs) == T 83 | # x is a (T, N, -1) tensor 84 | x = torch.cat(outputs, dim=0) 85 | # flatten 86 | x = x.view(T * N, -1) 87 | hxs = hxs.squeeze(0) 88 | 89 | return x, hxs 90 | 91 | def init_weight(self, layer): 92 | init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init. 93 | constant_(x, 0), nn.init.calculate_gain('relu')) 94 | if isinstance(layer, nn.Conv2d) or isinstance(layer, nn.Linear): 95 | return init_(layer) 96 | elif isinstance(layer, nn.BatchNorm2d): 97 | layer.weight.data.fill_(1) 98 | if hasattr(layer, 'bias'): 99 | layer.bias.data.zero_() 100 | return layer 101 | 102 | -------------------------------------------------------------------------------- /core/algorithms/a2c_acktr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | 5 | from core.algorithms.kfac import KFACOptimizer 6 | from core.algorithms.base_algo import BaseAlgo 7 | 8 | 9 | class A2C_ACKTR(BaseAlgo): 10 | def __init__(self, 11 | actor_critic, 12 | value_loss_coef, 13 | entropy_coef, 14 | lr=None, 15 | eps=None, 16 | alpha=None, 17 | max_grad_norm=None, 18 | acktr=False, 19 | expert=None, 20 | il_coef=1): 21 | super().__init__(actor_critic, lr, value_loss_coef, entropy_coef, expert, il_coef) 22 | self.acktr = acktr 23 | 24 | self.max_grad_norm = max_grad_norm 25 | 26 | if acktr: 27 | self.optimizer = KFACOptimizer(actor_critic) 28 | 29 | def update(self, rollouts): 30 | obs_shape = rollouts.obs.size()[2:] 31 | action_shape = rollouts.actions.size()[-1] 32 | num_steps, num_processes, _ = rollouts.rewards.size() 33 | 34 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 35 | rollouts.obs[:-1].view(-1, *obs_shape), 36 | rollouts.recurrent_hidden_states[:-1].view( 37 | -1, self.actor_critic.recurrent_hidden_state_size), 38 | rollouts.masks[:-1].view(-1, 1), 39 | rollouts.actions.view(-1, action_shape)) 40 | 41 | values = values.view(num_steps, num_processes, 1) 42 | action_log_probs = action_log_probs.view(num_steps, num_processes, 1) 43 | 44 | advantages = rollouts.returns[:-1] - values 45 | value_loss = advantages.pow(2).mean() 46 | 47 | action_loss = -(advantages.detach() * action_log_probs).mean() 48 | 49 | # imitation learning 50 | imitation_loss, accuracy = torch.tensor(0).to(rollouts.obs.device), 0 51 | if self.expert: 52 | imitation_loss, accuracy = self.imitation_learning( 53 | rollouts.obs[:-1].view(-1, *obs_shape), 54 | rollouts.recurrent_hidden_states[0].view( 55 | -1, self.actor_critic.recurrent_hidden_state_size), 56 | rollouts.masks[:-1].view(-1, 1), 57 | self.expert) 58 | 59 | if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0: 60 | # Compute fisher, see Martens 2014 61 | self.actor_critic.zero_grad() 62 | pg_fisher_loss = -action_log_probs.mean() 63 | 64 | value_noise = torch.randn(values.size()) 65 | if values.is_cuda: 66 | value_noise = value_noise.cuda() 67 | 68 | sample_values = values + value_noise 69 | vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean() 70 | 71 | fisher_loss = pg_fisher_loss + vf_fisher_loss 72 | self.optimizer.acc_stats = True 73 | fisher_loss.backward(retain_graph=True) 74 | self.optimizer.acc_stats = False 75 | 76 | self.optimizer.zero_grad() 
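        # The step below optimizes a single combined objective: imitation loss (scaled by
        # il_coef, only non-zero when an expert is provided) + value_coef * value loss
        # + policy-gradient action loss - entropy_coef * entropy bonus.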
77 | (imitation_loss * self.il_coef + value_loss * self.value_coef + action_loss - 78 | dist_entropy * self.entropy_coef).backward() 79 | 80 | if self.acktr == False: 81 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 82 | self.max_grad_norm) 83 | 84 | self.optimizer.step() 85 | self.after_update() 86 | 87 | return { 88 | 'value loss': value_loss.item(), 89 | 'action loss': action_loss.item(), 90 | 'entropy loss': dist_entropy.item(), 91 | 'imitation loss': imitation_loss.item(), 92 | 'accuracy': accuracy 93 | } 94 | -------------------------------------------------------------------------------- /core/storage/lacie_storage.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class LacieStorage(object): 6 | def __init__(self, num_steps, obs_shape, action_space, 7 | max_size=10000, 8 | batch_size=64, 9 | n_processes=16): 10 | # obs 11 | self.obs = torch.zeros(max_size, num_steps + 1, * obs_shape) 12 | 13 | # action 14 | if action_space.__class__.__name__ == 'Discrete': 15 | action_shape = 1 16 | else: 17 | action_shape = action_space.shape[0] 18 | self.actions = torch.zeros(max_size, num_steps, action_shape) 19 | if action_space.__class__.__name__ == 'Discrete': 20 | self.actions = self.actions.long() 21 | 22 | # mask 23 | self.masks = torch.ones(max_size, num_steps + 1, 1) 24 | 25 | # advantages 26 | self.advantages = torch.zeros(max_size, num_steps, 1) 27 | 28 | self.ptr, self.size, self.max_size = 0, 0, max_size 29 | 30 | self.batch_size = batch_size 31 | self.n_processes = n_processes 32 | 33 | def to(self, device): 34 | self.obs = self.obs.to(device) 35 | self.actions = self.actions.to(device) 36 | self.masks = self.masks.to(device) 37 | self.advantages = self.advantages.to(device) 38 | 39 | def insert(self, rollouts, advantages): 40 | """ 41 | Update the buffer with new rollouts from Storages mem 42 | :param obs: torch.Tensor of shape (num_steps + 1, n_processes, obs_shape) 43 | :param actions: torch.Tensor of shape (num_steps, n_processes, action_shape) 44 | :param masks: torch.Tensor of shape (num_steps + 1, n_processes, 1) 45 | :param advantages: torch.Tensor of shape (num_steps + 1, n_processes, 1) 46 | """ 47 | obs = rollouts.obs.permute(1, 0, 2) 48 | actions = rollouts.actions.permute(1, 0, 2) 49 | masks = rollouts.masks.permute(1, 0, 2) 50 | advantages = advantages.permute(1, 0, 2) 51 | n = obs.shape[0] 52 | 53 | idxs = np.arange(self.ptr, self.ptr + n) % self.max_size 54 | self.obs[idxs] = obs 55 | self.actions[idxs] = actions 56 | self.masks[idxs] = masks 57 | self.advantages[idxs] = advantages 58 | self.ptr = (self.ptr + n) % self.max_size 59 | 60 | self.size = min(self.size + n, self.max_size) 61 | 62 | def sample(self): 63 | idxs = np.random.choice( 64 | self.size, min(self.batch_size, self.size)) 65 | batch = dict(obs=self.obs[idxs], 66 | actions=self.actions[idxs], 67 | advantages=self.advantages[idxs], 68 | masks=self.masks[idxs]) 69 | 70 | # permute tensor to shape n_steps x batch_size x shape 71 | return {k: v.permute(1, 0, 2) for k, v in batch.items()} 72 | 73 | def sample_most_recent(self): 74 | if self.size < self.batch_size: 75 | idxs = np.arange(0, self.size) 76 | else: 77 | idxs = np.arange(self.ptr - self.batch_size, 78 | self.ptr) % self.max_size 79 | # the first n_procecsses indexes will be used to storage current rollout 80 | # the rest are most recent rollouts 81 | idxs = np.concatenate( 82 | [ 83 | idxs[-self.n_processes:], 84 | idxs[:-self.n_processes] 85 | ] 86 
| ) 87 | batch = dict(obs=self.obs[idxs], 88 | actions=self.actions[idxs], 89 | advantages=self.advantages[idxs], 90 | masks=self.masks[idxs]) 91 | 92 | # permute tensor to shape n_steps x batch_size x shape 93 | return {k: v.permute(1, 0, 2) for k, v in batch.items()} 94 | -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import copy 4 | 5 | from core import utils 6 | from core.envs import make_vec_envs 7 | from core.agents.heuristic.load_balance import LeastWorkAgent, \ 8 | ShortestProcessingTimeAgent, RandomAllocateAgent, EarliestCompletionTimeAgent 9 | 10 | NUM_EVAL_EPISODES = 64 11 | 12 | 13 | def evaluate(actor_critic, env_name, seed, num_processes, eval_log_dir, 14 | device, env_args=None): 15 | seed = seed if env_args.fix_job_sequence else seed + num_processes 16 | num_processes = 1 if env_args.fix_job_sequence else num_processes 17 | 18 | returns = benchmark_heuristic([LeastWorkAgent(), 19 | RandomAllocateAgent(), 20 | EarliestCompletionTimeAgent( 21 | env_args.load_balance_service_rates)], 22 | env_name=env_name, 23 | seed=seed, 24 | num_processes=num_processes, 25 | log_dir=eval_log_dir, 26 | device=device, 27 | args=env_args) 28 | # benchmark heuristic 29 | # least_work 30 | eval_envs = make_vec_envs(env_name=env_name, 31 | seed=seed, 32 | num_processes=num_processes, 33 | log_dir=eval_log_dir, 34 | device=device, 35 | allow_early_resets=True, 36 | train=False, 37 | args=env_args) 38 | eval_episode_rewards = [] 39 | 40 | obs = eval_envs.reset() 41 | eval_recurrent_hidden_states = torch.zeros( 42 | num_processes, actor_critic.recurrent_hidden_state_size, device=device) 43 | eval_masks = torch.zeros(num_processes, 1, device=device) 44 | 45 | while len(eval_episode_rewards) < NUM_EVAL_EPISODES: 46 | with torch.no_grad(): 47 | _, action, _, eval_recurrent_hidden_states = actor_critic.act( 48 | obs, 49 | eval_recurrent_hidden_states, 50 | eval_masks, 51 | deterministic=True) 52 | 53 | # Obser reward and next obs 54 | # FIXME: debug why actions must be moved to cpu? 
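        # A possible explanation (unverified): VecPyTorch.step_async only squeezes the
        # action dimension when isinstance(actions, torch.LongTensor) is True, and that
        # check does not match CUDA LongTensors, so a GPU tensor could reach the wrapped
        # env with an extra dimension; calling .cpu() here avoids that path.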
55 | obs, _, done, infos = eval_envs.step(action.cpu()) 56 | 57 | eval_masks = torch.tensor( 58 | [[0.0] if done_ else [1.0] for done_ in done], 59 | dtype=torch.float32, 60 | device=device) 61 | 62 | for info in infos: 63 | if 'episode' in info.keys(): 64 | eval_episode_rewards.append(info['episode']['r']) 65 | 66 | eval_envs.close() 67 | returns['RLAgent'] = eval_episode_rewards 68 | 69 | # print out the result 70 | for k, v in returns.items(): 71 | print(" => Evaluate {} using {} episodes: mean reward {:.5f}".format( 72 | k, len(v), np.mean(v))) 73 | return returns 74 | 75 | 76 | def benchmark_single_heuristic(agent, eval_envs): 77 | """ 78 | Compute return of a single heuristic agent 79 | """ 80 | obs = eval_envs.reset() 81 | eval_episode_rewards = [] 82 | 83 | while len(eval_episode_rewards) < NUM_EVAL_EPISODES: 84 | action = agent.act(obs) 85 | # Obser reward and next obs 86 | 87 | obs, _, done, infos = eval_envs.step(action.cpu()) 88 | 89 | for info in infos: 90 | if 'episode' in info.keys(): 91 | eval_episode_rewards.append(info['episode']['r']) 92 | 93 | eval_envs.close() 94 | 95 | return eval_episode_rewards 96 | 97 | 98 | def benchmark_heuristic(agents, **kwargs): 99 | """ 100 | Compute return of all heuristics 101 | """ 102 | ret = {} 103 | for agent in agents: 104 | envs = make_vec_envs(env_name=kwargs['env_name'], 105 | seed=kwargs['seed'], 106 | num_processes=kwargs['num_processes'], 107 | log_dir=kwargs['log_dir'], 108 | device=kwargs['device'], 109 | allow_early_resets=True, 110 | train=False, 111 | args=kwargs['args']) 112 | 113 | eval_episode_rewards = benchmark_single_heuristic(agent, envs) 114 | # append the result to return dictionary 115 | ret[agent.__class__.__name__] = eval_episode_rewards 116 | 117 | return ret 118 | -------------------------------------------------------------------------------- /core/algorithms/input_dependent_baseline/mib_ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .base_meta_critic import ActorMetaCriticAlgo 5 | 6 | DECAY_RATE = 0.995 7 | 8 | 9 | class MIB_PPO(ActorMetaCriticAlgo): 10 | def __init__(self, 11 | actor_critic, 12 | clip_param, 13 | ppo_epoch, 14 | num_mini_batch, 15 | entropy_coef, 16 | lr=None, 17 | adapt_lr=None, 18 | num_inner_steps=5, 19 | max_grad_norm=None, 20 | expert=None, 21 | il_coef=10): 22 | 23 | super().__init__(actor_critic, lr, adapt_lr, num_inner_steps) 24 | 25 | # PPO Args 26 | self.clip_param = clip_param 27 | self.ppo_epoch = ppo_epoch 28 | self.num_mini_batch = num_mini_batch 29 | 30 | self.entropy_coef = entropy_coef 31 | self.max_grad_norm = max_grad_norm 32 | 33 | self.expert = expert 34 | self.il_coef = il_coef 35 | 36 | def update(self, rollouts): 37 | obs_shape = rollouts.obs.size()[2:] 38 | action_shape = rollouts.actions.size()[-1] 39 | num_steps, num_processes, _ = rollouts.rewards.size() 40 | 41 | # action loss + entropy loss 42 | value_preds, value_loss = self.train_meta_critic_and_predict_values( 43 | rollouts) 44 | _, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 45 | rollouts.obs[:-1].view(-1, *obs_shape), 46 | rollouts.recurrent_hidden_states[0].view( 47 | -1, self.actor_critic.recurrent_hidden_state_size), 48 | rollouts.masks[:-1].view(-1, 1), 49 | rollouts.actions.view(-1, action_shape)) 50 | 51 | value_preds = value_preds.view(num_steps, num_processes, 1) 52 | 53 | advantages = rollouts.returns[:-1] - value_preds 54 | advantages = (advantages - advantages.mean()) / ( 
55 | advantages.std() + 1e-5) 56 | 57 | advantages = advantages.detach() 58 | 59 | action_loss_epoch = 0 60 | dist_entropy_epoch = 0 61 | imitation_loss_epoch = 0 62 | accuracy_epoch = 0 63 | 64 | for _ in range(self.ppo_epoch): 65 | if self.actor_critic.is_recurrent: 66 | data_generator = rollouts.recurrent_generator( 67 | advantages, self.num_mini_batch) 68 | else: 69 | data_generator = rollouts.feed_forward_generator( 70 | advantages, self.num_mini_batch) 71 | 72 | for sample in data_generator: 73 | obs_batch, recurrent_hidden_states_batch, actions_batch, \ 74 | _, _, masks_batch, old_action_log_probs_batch, \ 75 | adv_targ = sample 76 | 77 | # Reshape to do in a single forward pass for all steps 78 | value_preds, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 79 | obs_batch, recurrent_hidden_states_batch, masks_batch, 80 | actions_batch) 81 | 82 | ratio = torch.exp(action_log_probs - 83 | old_action_log_probs_batch) 84 | surr1 = ratio * adv_targ 85 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 86 | 1.0 + self.clip_param) * adv_targ 87 | action_loss = -torch.min(surr1, surr2).mean() 88 | 89 | # imitation learning 90 | imitation_loss, accuracy = torch.tensor( 91 | 0).to(action_loss.device), 0 92 | if self.expert: 93 | imitation_loss, accuracy = self.imitation_learning( 94 | rollouts.obs[:-1].view(-1, *obs_shape), 95 | rollouts.recurrent_hidden_states[0].view( 96 | -1, self.actor_critic.recurrent_hidden_state_size), 97 | rollouts.masks[:-1].view(-1, 1), 98 | self.expert) 99 | 100 | self.optimizer.zero_grad() 101 | (imitation_loss * self.il_coef + action_loss - 102 | dist_entropy * self.entropy_coef).backward() 103 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 104 | self.max_grad_norm) 105 | self.optimizer.step() 106 | 107 | action_loss_epoch += action_loss.item() 108 | dist_entropy_epoch += dist_entropy.item() 109 | imitation_loss_epoch += imitation_loss.item() 110 | accuracy_epoch += accuracy 111 | 112 | num_updates = self.ppo_epoch * self.num_mini_batch 113 | 114 | action_loss_epoch /= num_updates 115 | dist_entropy_epoch /= num_updates 116 | imitation_loss_epoch /= num_updates 117 | accuracy_epoch /= num_updates 118 | 119 | self.il_coef *= DECAY_RATE 120 | 121 | return { 122 | "value loss": value_loss, 123 | "action loss": action_loss_epoch, 124 | "imitation loss": imitation_loss_epoch, 125 | "accuracy": accuracy_epoch, 126 | "entropy loss": dist_entropy_epoch 127 | } 128 | -------------------------------------------------------------------------------- /core/algorithms/ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | 6 | from .base_algo import BaseAlgo 7 | 8 | 9 | class PPO(BaseAlgo): 10 | def __init__(self, 11 | actor_critic, 12 | clip_param, 13 | ppo_epoch, 14 | num_mini_batch, 15 | value_loss_coef, 16 | entropy_coef, 17 | lr=None, 18 | eps=None, 19 | max_grad_norm=None, 20 | use_clipped_value_loss=True, 21 | expert=None, 22 | il_coef=1): 23 | super().__init__(actor_critic, lr, value_loss_coef, entropy_coef, expert, il_coef) 24 | 25 | self.clip_param = clip_param 26 | self.ppo_epoch = ppo_epoch 27 | self.num_mini_batch = num_mini_batch 28 | 29 | self.max_grad_norm = max_grad_norm 30 | self.use_clipped_value_loss = use_clipped_value_loss 31 | 32 | def update(self, rollouts): 33 | obs_shape = rollouts.obs.size()[2:] 34 | advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] 35 | 
advantages = (advantages - advantages.mean()) / ( 36 | advantages.std() + 1e-5) 37 | 38 | value_loss_epoch = 0 39 | action_loss_epoch = 0 40 | dist_entropy_epoch = 0 41 | imitation_loss_epoch = 0 42 | accuracy_epoch = 0 43 | 44 | for e in range(self.ppo_epoch): 45 | if self.actor_critic.is_recurrent: 46 | data_generator = rollouts.recurrent_generator( 47 | advantages, self.num_mini_batch) 48 | else: 49 | data_generator = rollouts.feed_forward_generator( 50 | advantages, self.num_mini_batch) 51 | 52 | for sample in data_generator: 53 | obs_batch, recurrent_hidden_states_batch, actions_batch, \ 54 | value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, \ 55 | adv_targ = sample 56 | 57 | # Reshape to do in a single forward pass for all steps 58 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 59 | obs_batch, recurrent_hidden_states_batch, masks_batch, 60 | actions_batch) 61 | 62 | ratio = torch.exp(action_log_probs - 63 | old_action_log_probs_batch) 64 | surr1 = ratio * adv_targ 65 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 66 | 1.0 + self.clip_param) * adv_targ 67 | action_loss = -torch.min(surr1, surr2).mean() 68 | 69 | if self.use_clipped_value_loss: 70 | value_pred_clipped = value_preds_batch + \ 71 | (values - value_preds_batch).clamp(-self.clip_param, 72 | self.clip_param) 73 | value_losses = (values - return_batch).pow(2) 74 | value_losses_clipped = ( 75 | value_pred_clipped - return_batch).pow(2) 76 | value_loss = 0.5 * torch.max(value_losses, 77 | value_losses_clipped).mean() 78 | else: 79 | value_loss = 0.5 * (return_batch - values).pow(2).mean() 80 | 81 | # imitation learning 82 | imitation_loss, accuracy = torch.tensor( 83 | 0).to(action_loss.device), 0 84 | if self.expert: 85 | imitation_loss, accuracy = self.imitation_learning( 86 | rollouts.obs[:-1].view(-1, *obs_shape), 87 | rollouts.recurrent_hidden_states[0].view( 88 | -1, self.actor_critic.recurrent_hidden_state_size), 89 | rollouts.masks[:-1].view(-1, 1), 90 | self.expert) 91 | 92 | self.optimizer.zero_grad() 93 | (imitation_loss * self.il_coef * self.value_coef + action_loss - 94 | dist_entropy * self.entropy_coef).backward() 95 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 96 | self.max_grad_norm) 97 | self.optimizer.step() 98 | 99 | value_loss_epoch += value_loss.item() 100 | action_loss_epoch += action_loss.item() 101 | dist_entropy_epoch += dist_entropy.item() 102 | imitation_loss_epoch += imitation_loss.item() 103 | accuracy_epoch += accuracy 104 | 105 | num_updates = self.ppo_epoch * self.num_mini_batch 106 | 107 | value_loss_epoch /= num_updates 108 | action_loss_epoch /= num_updates 109 | dist_entropy_epoch /= num_updates 110 | imitation_loss_epoch /= num_updates 111 | accuracy_epoch /= num_updates 112 | 113 | self.after_update() 114 | 115 | return { 116 | "value loss": value_loss_epoch, 117 | "action loss": action_loss_epoch, 118 | "entropy loss": dist_entropy_epoch, 119 | "imitation loss": imitation_loss_epoch, 120 | "accuracy": accuracy_epoch 121 | } 122 | -------------------------------------------------------------------------------- /core/algorithms/kfac.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | 8 | from core.utils import AddBias 9 | 10 | # TODO: In order to make this code faster: 11 | # 1) Implement _extract_patches as a single cuda kernel 12 | # 2) Compute 
QR decomposition in a separate process 13 | # 3) Actually make a general KFAC optimizer so it fits PyTorch 14 | 15 | 16 | def _extract_patches(x, kernel_size, stride, padding): 17 | if padding[0] + padding[1] > 0: 18 | x = F.pad(x, (padding[1], padding[1], padding[0], 19 | padding[0])).data # Actually check dims 20 | x = x.unfold(2, kernel_size[0], stride[0]) 21 | x = x.unfold(3, kernel_size[1], stride[1]) 22 | x = x.transpose_(1, 2).transpose_(2, 3).contiguous() 23 | x = x.view( 24 | x.size(0), x.size(1), x.size(2), 25 | x.size(3) * x.size(4) * x.size(5)) 26 | return x 27 | 28 | 29 | def compute_cov_a(a, classname, layer_info, fast_cnn): 30 | batch_size = a.size(0) 31 | 32 | if classname == 'Conv2d': 33 | if fast_cnn: 34 | a = _extract_patches(a, *layer_info) 35 | a = a.view(a.size(0), -1, a.size(-1)) 36 | a = a.mean(1) 37 | else: 38 | a = _extract_patches(a, *layer_info) 39 | a = a.view(-1, a.size(-1)).div_(a.size(1)).div_(a.size(2)) 40 | elif classname == 'AddBias': 41 | is_cuda = a.is_cuda 42 | a = torch.ones(a.size(0), 1) 43 | if is_cuda: 44 | a = a.cuda() 45 | 46 | return a.t() @ (a / batch_size) 47 | 48 | 49 | def compute_cov_g(g, classname, layer_info, fast_cnn): 50 | batch_size = g.size(0) 51 | 52 | if classname == 'Conv2d': 53 | if fast_cnn: 54 | g = g.view(g.size(0), g.size(1), -1) 55 | g = g.sum(-1) 56 | else: 57 | g = g.transpose(1, 2).transpose(2, 3).contiguous() 58 | g = g.view(-1, g.size(-1)).mul_(g.size(1)).mul_(g.size(2)) 59 | elif classname == 'AddBias': 60 | g = g.view(g.size(0), g.size(1), -1) 61 | g = g.sum(-1) 62 | 63 | g_ = g * batch_size 64 | return g_.t() @ (g_ / g.size(0)) 65 | 66 | 67 | def update_running_stat(aa, m_aa, momentum): 68 | # Do the trick to keep aa unchanged and not create any additional tensors 69 | m_aa *= momentum / (1 - momentum) 70 | m_aa += aa 71 | m_aa *= (1 - momentum) 72 | 73 | 74 | class SplitBias(nn.Module): 75 | def __init__(self, module): 76 | super(SplitBias, self).__init__() 77 | self.module = module 78 | self.add_bias = AddBias(module.bias.data) 79 | self.module.bias = None 80 | 81 | def forward(self, input): 82 | x = self.module(input) 83 | x = self.add_bias(x) 84 | return x 85 | 86 | 87 | class KFACOptimizer(optim.Optimizer): 88 | def __init__(self, 89 | model, 90 | lr=0.25, 91 | momentum=0.9, 92 | stat_decay=0.99, 93 | kl_clip=0.001, 94 | damping=1e-2, 95 | weight_decay=0, 96 | fast_cnn=False, 97 | Ts=1, 98 | Tf=10): 99 | defaults = dict() 100 | 101 | def split_bias(module): 102 | for mname, child in module.named_children(): 103 | if hasattr(child, 'bias') and child.bias is not None: 104 | module._modules[mname] = SplitBias(child) 105 | else: 106 | split_bias(child) 107 | 108 | split_bias(model) 109 | 110 | super(KFACOptimizer, self).__init__(model.parameters(), defaults) 111 | 112 | self.known_modules = {'Linear', 'Conv2d', 'AddBias'} 113 | 114 | self.modules = [] 115 | self.grad_outputs = {} 116 | 117 | self.model = model 118 | self._prepare_model() 119 | 120 | self.steps = 0 121 | 122 | self.m_aa, self.m_gg = {}, {} 123 | self.Q_a, self.Q_g = {}, {} 124 | self.d_a, self.d_g = {}, {} 125 | 126 | self.momentum = momentum 127 | self.stat_decay = stat_decay 128 | 129 | self.lr = lr 130 | self.kl_clip = kl_clip 131 | self.damping = damping 132 | self.weight_decay = weight_decay 133 | 134 | self.fast_cnn = fast_cnn 135 | 136 | self.Ts = Ts 137 | self.Tf = Tf 138 | 139 | self.optim = optim.SGD( 140 | model.parameters(), 141 | lr=self.lr * (1 - self.momentum), 142 | momentum=self.momentum) 143 | 144 | def _save_input(self, module, 
input): 145 | if torch.is_grad_enabled() and self.steps % self.Ts == 0: 146 | classname = module.__class__.__name__ 147 | layer_info = None 148 | if classname == 'Conv2d': 149 | layer_info = (module.kernel_size, module.stride, 150 | module.padding) 151 | 152 | aa = compute_cov_a(input[0].data, classname, layer_info, 153 | self.fast_cnn) 154 | 155 | # Initialize buffers 156 | if self.steps == 0: 157 | self.m_aa[module] = aa.clone() 158 | 159 | update_running_stat(aa, self.m_aa[module], self.stat_decay) 160 | 161 | def _save_grad_output(self, module, grad_input, grad_output): 162 | # Accumulate statistics for Fisher matrices 163 | if self.acc_stats: 164 | classname = module.__class__.__name__ 165 | layer_info = None 166 | if classname == 'Conv2d': 167 | layer_info = (module.kernel_size, module.stride, 168 | module.padding) 169 | 170 | gg = compute_cov_g(grad_output[0].data, classname, layer_info, 171 | self.fast_cnn) 172 | 173 | # Initialize buffers 174 | if self.steps == 0: 175 | self.m_gg[module] = gg.clone() 176 | 177 | update_running_stat(gg, self.m_gg[module], self.stat_decay) 178 | 179 | def _prepare_model(self): 180 | for module in self.model.modules(): 181 | classname = module.__class__.__name__ 182 | if classname in self.known_modules: 183 | assert not ((classname in ['Linear', 'Conv2d']) and module.bias is not None), \ 184 | "You must have a bias as a separate layer" 185 | 186 | self.modules.append(module) 187 | module.register_forward_pre_hook(self._save_input) 188 | module.register_backward_hook(self._save_grad_output) 189 | 190 | def step(self): 191 | # Add weight decay 192 | if self.weight_decay > 0: 193 | for p in self.model.parameters(): 194 | p.grad.data.add_(self.weight_decay, p.data) 195 | 196 | updates = {} 197 | for i, m in enumerate(self.modules): 198 | assert len(list(m.parameters()) 199 | ) == 1, "Can handle only one parameter at the moment" 200 | classname = m.__class__.__name__ 201 | p = next(m.parameters()) 202 | 203 | la = self.damping + self.weight_decay 204 | 205 | if self.steps % self.Tf == 0: 206 | # My asynchronous implementation exists, I will add it later. 207 | # Experimenting with different ways to this in PyTorch. 
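                # Eigendecompose the running Kronecker factors: A (activation covariance,
                # m_aa) and G (output-gradient covariance, m_gg). The update below then
                # computes V = Q_g^T dW Q_a, divides elementwise by (d_g d_a^T + damping),
                # and maps back with Q_g V Q_a^T, i.e. an approximate (G kron A)^{-1}
                # natural-gradient step without ever forming the full Fisher matrix.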
208 | self.d_a[m], self.Q_a[m] = torch.symeig( 209 | self.m_aa[m], eigenvectors=True) 210 | self.d_g[m], self.Q_g[m] = torch.symeig( 211 | self.m_gg[m], eigenvectors=True) 212 | 213 | self.d_a[m].mul_((self.d_a[m] > 1e-6).float()) 214 | self.d_g[m].mul_((self.d_g[m] > 1e-6).float()) 215 | 216 | if classname == 'Conv2d': 217 | p_grad_mat = p.grad.data.view(p.grad.data.size(0), -1) 218 | else: 219 | p_grad_mat = p.grad.data 220 | 221 | v1 = self.Q_g[m].t() @ p_grad_mat @ self.Q_a[m] 222 | v2 = v1 / ( 223 | self.d_g[m].unsqueeze(1) * self.d_a[m].unsqueeze(0) + la) 224 | v = self.Q_g[m] @ v2 @ self.Q_a[m].t() 225 | 226 | v = v.view(p.grad.data.size()) 227 | updates[p] = v 228 | 229 | vg_sum = 0 230 | for p in self.model.parameters(): 231 | v = updates[p] 232 | vg_sum += (v * p.grad.data * self.lr * self.lr).sum() 233 | 234 | nu = min(1, math.sqrt(self.kl_clip / vg_sum)) 235 | 236 | for p in self.model.parameters(): 237 | v = updates[p] 238 | p.grad.data.copy_(v) 239 | p.grad.data.mul_(nu) 240 | 241 | self.optim.step() 242 | self.steps += 1 -------------------------------------------------------------------------------- /core/algorithms/input_dependent_baseline/base_meta_critic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | 8 | from torch.nn import L1Loss, MSELoss 9 | from itertools import chain 10 | 11 | 12 | class ActorMetaCriticAlgo: 13 | """ 14 | Base class for algorithm (A2C, PPO, etc) which supports adapt meta critic to new \ 15 | input sequences 16 | """ 17 | 18 | def __init__(self, 19 | actor_critic, 20 | lr=7e-4, 21 | adapt_lr=1e-3, 22 | num_inner_steps=5, 23 | adapt_criterion=MSELoss): 24 | self.actor_critic = actor_critic 25 | self.lr = lr 26 | self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=self.lr) 27 | 28 | # meta critic args 29 | self.adapt_lr = adapt_lr 30 | self.num_inner_steps = num_inner_steps 31 | self.adapt_criterion = adapt_criterion() 32 | 33 | # imitation learning 34 | self.il_criterion = nn.CrossEntropyLoss() 35 | 36 | def adapt_and_predict(self, task_inputs, task_labels, meta_inputs, meta_labels): 37 | """ 38 | Adapt the meta critic to new input-sequence and predict the values of new observation \ 39 | (with same input sequence) 40 | For simplicity, we adopt the **First-order MAML** as in https://arxiv.org/abs/1803.02999 \ 41 | 42 | :param task_inputs: tuple of (obs, rnn_hxs, masks) - training inputs of roll out with same input sequence 43 | 44 | :param task_labels: array of shape (num_steps, num_envs, 1) - the Monte Carlo approximation of values 45 | 46 | :param meta_inputs: tuple of (obs, rnn_hxs, masks) - testing inputs of roll out with same input sequence as training 47 | 48 | :param meta_labels: array of shape (num_steps, num_envs, 1) - the Monte Carlo approximation of values 49 | 50 | :return: value prediction of meta_inputs and meta gradient of critic 51 | """ 52 | # create new net and exclusively update this network 53 | fast_net = copy.deepcopy(self.actor_critic.base) 54 | task_optimizer = optim.Adam( 55 | fast_net.parameters(), lr=self.lr) 56 | 57 | task_obs, task_rnn_hxs, task_masks = task_inputs 58 | 59 | # Adapt to new task 60 | for _ in range(self.num_inner_steps): 61 | task_preds, _, _ = fast_net(task_obs, task_rnn_hxs, task_masks) 62 | task_loss = self.adapt_criterion(task_preds, task_labels) 63 | 64 | # update the fast-adapted network 65 | task_optimizer.zero_grad() 66 | 
task_loss.backward() 67 | task_optimizer.step() 68 | 69 | # compute meta grad 70 | meta_obs, meta_rnn_hxs, meta_masks = meta_inputs 71 | meta_preds, _, _ = fast_net(meta_obs, meta_rnn_hxs, meta_masks) 72 | meta_loss = self.adapt_criterion(meta_preds, meta_labels) 73 | grads = torch.autograd.grad( 74 | meta_loss, fast_net.parameters(), allow_unused=True) 75 | 76 | # create dictionary contains gradient of meta critic 77 | meta_grads = {name: g if g is not None else torch.zeros_like(weight) 78 | for ((name, weight), g) 79 | in zip(fast_net.named_parameters(), 80 | grads)} 81 | 82 | return meta_preds, meta_grads 83 | 84 | def train_meta_critic_and_predict_values(self, rollouts): 85 | """ 86 | Train the meta critic with rollout experience and return the predicted values \ 87 | The adapted algorithm is described in Algorithm 1 of the paper \ 88 | https://arxiv.org/abs/1807.02264. We split the rollout into 2 half, \ 89 | the critic parameters adapting to the first half will give the prediction \ 90 | for second half. The critic parameters adapting to the second half will \ 91 | give the prediction for first half. 92 | 93 | :param rollouts: RolloutStorage's instance 94 | 95 | :return: input-dependent values 96 | """ 97 | obs_shape = rollouts.obs.size()[2:] 98 | _, num_processes, _ = rollouts.rewards.size() 99 | 100 | # prepare input and output of meta learner 101 | # ie splitting them into 2 102 | task_pt = int(num_processes/2) 103 | # first half rollouts 104 | # num_steps * num_processes * input_shape 105 | first_obs = rollouts.obs[:-1, :task_pt, ...].reshape(-1, *obs_shape) 106 | first_rnn_hxs = rollouts.recurrent_hidden_states[0, :task_pt].reshape( 107 | -1, self.actor_critic.recurrent_hidden_state_size) 108 | first_mask = rollouts.masks[:-1, :task_pt].reshape(-1, 1) 109 | first_inputs = (first_obs, first_rnn_hxs, first_mask) 110 | first_labels = rollouts.returns[:-1, :task_pt, ...].reshape(-1, 1) 111 | # second half rollouts 112 | # num_steps * num_processes * input_shape 113 | second_obs = rollouts.obs[:-1, task_pt:, ...].reshape(-1, *obs_shape) 114 | second_rnn_hxs = rollouts.recurrent_hidden_states[0, task_pt:].reshape( 115 | -1, self.actor_critic.recurrent_hidden_state_size) 116 | second_mask = rollouts.masks[:-1, task_pt:].reshape(-1, 1) 117 | second_inputs = (second_obs, second_rnn_hxs, second_mask) 118 | second_labels = rollouts.returns[:-1, task_pt:, ...].reshape(-1, 1) 119 | 120 | # train meta network 121 | # the actor critic object must be instance of MetaCritic class 122 | second_values, second_meta_grads = self.adapt_and_predict( 123 | first_inputs, first_labels, second_inputs, second_labels) 124 | first_values, first_meta_grads = self.adapt_and_predict( 125 | second_inputs, second_labels, first_inputs, first_labels) 126 | values = torch.cat((first_values, second_values), dim=0) 127 | 128 | # update the meta critic 129 | self.update_meta_grads( 130 | [first_meta_grads, second_meta_grads], first_inputs, first_labels) 131 | 132 | # compute value loss 133 | value_loss = self.adapt_criterion( 134 | values, rollouts.returns[:-1].view(-1, 1)) 135 | 136 | return values, value_loss.item() 137 | 138 | def update_meta_grads(self, grads, dummy_inputs, dummy_labels): 139 | """ 140 | Set the gradient values from grads (dict) to actor_critic parameters and update 141 | 142 | :param grads: list of OrderedDict - each element is the gradient from a task 143 | 144 | :param dummy_inputs: dummy inputs to activate the gradient of meta network 145 | 146 | :param dummy_labels: dummy labels to activate 
the gradient of meta network 147 | """ 148 | keys = grads[0].keys() 149 | # multiple loss with value_loss_coef equivalent to multiple this coef with grad 150 | gradients = {k: sum(grad[k] for grad in grads) for k in keys} 151 | 152 | # compute dummy loss 153 | value_pred, _, _ = self.actor_critic.base(*dummy_inputs) 154 | loss = self.adapt_criterion(value_pred, dummy_labels) 155 | 156 | hooks = [] 157 | for (k, v) in self.actor_critic.base.named_parameters(): 158 | def get_closure(): 159 | key = k 160 | 161 | def replace_grad(grad): 162 | return gradients[key] 163 | return replace_grad 164 | hooks.append(v.register_hook(get_closure())) 165 | 166 | # compute grad for curr step 167 | self.optimizer.zero_grad() 168 | loss.backward() 169 | # nn.utils.clip_grad_norm_(self.actor_critic.base.critic.parameters(), self.max_grad_norm) 170 | self.optimizer.step() 171 | 172 | for h in hooks: 173 | h.remove() 174 | 175 | def imitation_learning(self, inputs, rnn_hxs, masks, expert): 176 | """ 177 | Imitation learning loss 178 | 179 | :param inputs: state observations 180 | 181 | :param rnn_hxs: rnn hidden state 182 | 183 | :param masks: mask the final state with 0 value 184 | 185 | :param expert: a trained or heuristic agent 186 | 187 | :return: log probability of expert's actions 188 | """ 189 | _, actor_features, _ = self.actor_critic.base(inputs, rnn_hxs, masks) 190 | dist = self.actor_critic.dist(actor_features) 191 | 192 | expert_actions = expert.act(inputs) 193 | 194 | il_loss = self.il_criterion(dist.probs, expert_actions.reshape(-1)) 195 | accuracy = (torch.argmax(dist.probs, dim=1) == 196 | expert_actions.reshape(-1)).float().sum()/expert_actions.shape[0] 197 | 198 | return il_loss, accuracy 199 | -------------------------------------------------------------------------------- /core/storage/base_storage.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 3 | 4 | 5 | def _flatten_helper(T, N, _tensor): 6 | return _tensor.view(T * N, *_tensor.size()[2:]) 7 | 8 | 9 | class RolloutStorage(object): 10 | def __init__(self, num_steps, num_processes, obs_shape, action_space, 11 | recurrent_hidden_state_size): 12 | self.obs = torch.zeros(num_steps + 1, num_processes, *obs_shape) 13 | self.recurrent_hidden_states = torch.zeros( 14 | num_steps + 1, num_processes, recurrent_hidden_state_size) 15 | self.rewards = torch.zeros(num_steps, num_processes, 1) 16 | self.value_preds = torch.zeros(num_steps + 1, num_processes, 1) 17 | self.returns = torch.zeros(num_steps + 1, num_processes, 1) 18 | self.action_log_probs = torch.zeros(num_steps, num_processes, 1) 19 | if action_space.__class__.__name__ == 'Discrete': 20 | action_shape = 1 21 | else: 22 | action_shape = action_space.shape[0] 23 | self.actions = torch.zeros(num_steps, num_processes, action_shape) 24 | if action_space.__class__.__name__ == 'Discrete': 25 | self.actions = self.actions.long() 26 | self.masks = torch.ones(num_steps + 1, num_processes, 1) 27 | 28 | # Masks that indicate whether it's a true terminal state 29 | # or time limit end state 30 | self.bad_masks = torch.ones(num_steps + 1, num_processes, 1) 31 | 32 | self.num_steps = num_steps 33 | self.step = 0 34 | 35 | def to(self, device): 36 | self.obs = self.obs.to(device) 37 | self.recurrent_hidden_states = self.recurrent_hidden_states.to(device) 38 | self.rewards = self.rewards.to(device) 39 | self.value_preds = self.value_preds.to(device) 40 | self.returns = 
self.returns.to(device) 41 | self.action_log_probs = self.action_log_probs.to(device) 42 | self.actions = self.actions.to(device) 43 | self.masks = self.masks.to(device) 44 | self.bad_masks = self.bad_masks.to(device) 45 | 46 | def insert(self, obs, recurrent_hidden_states, actions, action_log_probs, 47 | value_preds, rewards, masks, bad_masks): 48 | self.obs[self.step + 1].copy_(obs) 49 | self.recurrent_hidden_states[self.step + 50 | 1].copy_(recurrent_hidden_states) 51 | self.actions[self.step].copy_(actions) 52 | self.action_log_probs[self.step].copy_(action_log_probs) 53 | self.value_preds[self.step].copy_(value_preds) 54 | self.rewards[self.step].copy_(rewards) 55 | self.masks[self.step + 1].copy_(masks) 56 | self.bad_masks[self.step + 1].copy_(bad_masks) 57 | 58 | self.step = (self.step + 1) % self.num_steps 59 | 60 | def after_update(self): 61 | self.obs[0].copy_(self.obs[-1]) 62 | self.recurrent_hidden_states[0].copy_(self.recurrent_hidden_states[-1]) 63 | self.masks[0].copy_(self.masks[-1]) 64 | self.bad_masks[0].copy_(self.bad_masks[-1]) 65 | 66 | def compute_returns(self, 67 | next_value, 68 | use_gae, 69 | gamma, 70 | gae_lambda, 71 | use_proper_time_limits=True): 72 | if use_proper_time_limits: 73 | if use_gae: 74 | self.value_preds[-1] = next_value 75 | gae = 0 76 | for step in reversed(range(self.rewards.size(0))): 77 | delta = self.rewards[step] + gamma * self.value_preds[ 78 | step + 1] * self.masks[step + 79 | 1] - self.value_preds[step] 80 | gae = delta + gamma * gae_lambda * self.masks[step + 81 | 1] * gae 82 | gae = gae * self.bad_masks[step + 1] 83 | self.returns[step] = gae + self.value_preds[step] 84 | else: 85 | self.returns[-1] = next_value 86 | for step in reversed(range(self.rewards.size(0))): 87 | self.returns[step] = (self.returns[step + 1] * 88 | gamma * self.masks[step + 1] + self.rewards[step]) * self.bad_masks[step + 1] \ 89 | + (1 - self.bad_masks[step + 1] 90 | ) * self.value_preds[step] 91 | else: 92 | if use_gae: 93 | self.value_preds[-1] = next_value 94 | gae = 0 95 | for step in reversed(range(self.rewards.size(0))): 96 | delta = self.rewards[step] + gamma * self.value_preds[ 97 | step + 1] * self.masks[step + 98 | 1] - self.value_preds[step] 99 | gae = delta + gamma * gae_lambda * self.masks[step + 100 | 1] * gae 101 | self.returns[step] = gae + self.value_preds[step] 102 | else: 103 | self.returns[-1] = next_value 104 | for step in reversed(range(self.rewards.size(0))): 105 | self.returns[step] = self.returns[step + 1] * \ 106 | gamma * self.masks[step + 1] + self.rewards[step] 107 | 108 | def feed_forward_generator(self, 109 | advantages, 110 | num_mini_batch=None, 111 | mini_batch_size=None): 112 | num_steps, num_processes = self.rewards.size()[0:2] 113 | batch_size = num_processes * num_steps 114 | 115 | if mini_batch_size is None: 116 | assert batch_size >= num_mini_batch, ( 117 | "PPO requires the number of processes ({}) " 118 | "* number of steps ({}) = {} " 119 | "to be greater than or equal to the number of PPO mini batches ({})." 
120 | "".format(num_processes, num_steps, num_processes * num_steps, 121 | num_mini_batch)) 122 | mini_batch_size = batch_size // num_mini_batch 123 | sampler = BatchSampler( 124 | SubsetRandomSampler(range(batch_size)), 125 | mini_batch_size, 126 | drop_last=True) 127 | for indices in sampler: 128 | obs_batch = self.obs[:-1].view(-1, *self.obs.size()[2:])[indices] 129 | recurrent_hidden_states_batch = self.recurrent_hidden_states[:-1].view( 130 | -1, self.recurrent_hidden_states.size(-1))[indices] 131 | actions_batch = self.actions.view(-1, 132 | self.actions.size(-1))[indices] 133 | value_preds_batch = self.value_preds[:-1].view(-1, 1)[indices] 134 | return_batch = self.returns[:-1].view(-1, 1)[indices] 135 | masks_batch = self.masks[:-1].view(-1, 1)[indices] 136 | old_action_log_probs_batch = self.action_log_probs.view(-1, 137 | 1)[indices] 138 | if advantages is None: 139 | adv_targ = None 140 | else: 141 | adv_targ = advantages.reshape(-1, 1)[indices] 142 | 143 | yield obs_batch, recurrent_hidden_states_batch, actions_batch, \ 144 | value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, adv_targ 145 | 146 | def recurrent_generator(self, advantages, num_mini_batch): 147 | num_processes = self.rewards.size(1) 148 | assert num_processes >= num_mini_batch, ( 149 | "PPO requires the number of processes ({}) " 150 | "to be greater than or equal to the number of " 151 | "PPO mini batches ({}).".format(num_processes, num_mini_batch)) 152 | num_envs_per_batch = num_processes // num_mini_batch 153 | perm = torch.randperm(num_processes) 154 | for start_ind in range(0, num_processes, num_envs_per_batch): 155 | obs_batch = [] 156 | recurrent_hidden_states_batch = [] 157 | actions_batch = [] 158 | value_preds_batch = [] 159 | return_batch = [] 160 | masks_batch = [] 161 | old_action_log_probs_batch = [] 162 | adv_targ = [] 163 | 164 | for offset in range(num_envs_per_batch): 165 | ind = perm[start_ind + offset] 166 | obs_batch.append(self.obs[:-1, ind]) 167 | recurrent_hidden_states_batch.append( 168 | self.recurrent_hidden_states[0:1, ind]) 169 | actions_batch.append(self.actions[:, ind]) 170 | value_preds_batch.append(self.value_preds[:-1, ind]) 171 | return_batch.append(self.returns[:-1, ind]) 172 | masks_batch.append(self.masks[:-1, ind]) 173 | old_action_log_probs_batch.append( 174 | self.action_log_probs[:, ind]) 175 | adv_targ.append(advantages[:, ind]) 176 | 177 | T, N = self.num_steps, num_envs_per_batch 178 | # These are all tensors of size (T, N, -1) 179 | obs_batch = torch.stack(obs_batch, 1) 180 | actions_batch = torch.stack(actions_batch, 1) 181 | value_preds_batch = torch.stack(value_preds_batch, 1) 182 | return_batch = torch.stack(return_batch, 1) 183 | masks_batch = torch.stack(masks_batch, 1) 184 | old_action_log_probs_batch = torch.stack( 185 | old_action_log_probs_batch, 1) 186 | adv_targ = torch.stack(adv_targ, 1) 187 | 188 | # States is just a (N, -1) tensor 189 | recurrent_hidden_states_batch = torch.stack( 190 | recurrent_hidden_states_batch, 1).view(N, -1) 191 | 192 | # Flatten the (T, N, ...) tensors to (T * N, ...) 
193 | obs_batch = _flatten_helper(T, N, obs_batch) 194 | actions_batch = _flatten_helper(T, N, actions_batch) 195 | value_preds_batch = _flatten_helper(T, N, value_preds_batch) 196 | return_batch = _flatten_helper(T, N, return_batch) 197 | masks_batch = _flatten_helper(T, N, masks_batch) 198 | old_action_log_probs_batch = _flatten_helper(T, N, 199 | old_action_log_probs_batch) 200 | adv_targ = _flatten_helper(T, N, adv_targ) 201 | 202 | yield obs_batch, recurrent_hidden_states_batch, actions_batch, \ 203 | value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, adv_targ 204 | -------------------------------------------------------------------------------- /core/envs/park_envs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Make environments of Park Platform 3 | """ 4 | import torch 5 | import numpy as np 6 | import os 7 | import park 8 | import gym 9 | import random 10 | 11 | from park.spaces.box import Box 12 | from baselines import bench, logger 13 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 14 | from baselines.common.vec_env import VecEnvWrapper 15 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 16 | from baselines.common.vec_env.shmem_vec_env import ShmemVecEnv 17 | from baselines.common.wrappers import TimeLimit 18 | from baselines.common.vec_env.vec_normalize import \ 19 | VecNormalize as VecNormalize_ 20 | 21 | from .load_balance_wrappers import ProcessLoadBalanceObservation, \ 22 | LoadBalanceRandomReset, RewardNormalize, FixJobSequence 23 | 24 | 25 | PARK_ENV_LIST = ['spark', 'spark_sim', 26 | 'load_balance'] 27 | 28 | 29 | def make_env(env_id, 30 | seed, 31 | rank, 32 | log_dir, 33 | allow_early_resets, 34 | max_episode_steps=None, 35 | args=None, 36 | train=True): 37 | def _thunk(): 38 | if env_id not in PARK_ENV_LIST: 39 | raise ValueError("Unsupported environment, expect the environment to be one of " 40 | + str(PARK_ENV_LIST)+" but got: "+str(env_id)) 41 | elif env_id == 'load_balance': 42 | # arrange the number of stream jobs 43 | env = park.make(env_id, 44 | num_stream_jobs=args.num_stream_jobs, 45 | service_rates=args.load_balance_service_rates) 46 | 47 | # random act after resetting to diversify the state 48 | # only use when training 49 | if train: 50 | env = LoadBalanceRandomReset( 51 | env, args.max_random_init_steps) 52 | 53 | # if using load balance, clip and normalize the observation with this wrapper 54 | if args is not None: 55 | env = ProcessLoadBalanceObservation(env, 56 | args.job_size_norm_factor, 57 | args.server_load_norm_factor, 58 | args.highest_server_obs, 59 | args.highest_job_obs, 60 | args.elapsed_time_norm_factor, 61 | args.highest_elapsed_time 62 | ) 63 | # normalize reward 64 | env = RewardNormalize(env, args.reward_norm_factor) 65 | 66 | if args.fix_job_sequence: 67 | # fix job sequence 68 | env = FixJobSequence(env, seed) 69 | 70 | if max_episode_steps: 71 | env = TimeLimit(env, max_episode_steps) 72 | # adding information to env for computing return 73 | env = TimeLimitMask(env) 74 | 75 | # IMPORTANT: all environments used same random seed to repeat the input-process 76 | if train and args.algo.startswith('mib'): 77 | env.seed(seed) 78 | else: 79 | env.seed(seed + rank) 80 | 81 | if log_dir is not None: 82 | env = bench.Monitor( 83 | env, 84 | os.path.join(log_dir, str(rank)), 85 | allow_early_resets=allow_early_resets) 86 | 87 | return env 88 | 89 | return _thunk 90 | 91 | 92 | def make_vec_envs(env_name, 93 | seed, 94 | num_processes, 
95 | log_dir, 96 | device, 97 | allow_early_resets, 98 | max_episode_steps=None, 99 | args=None, 100 | train=True): 101 | """ 102 | Make vectorized environments 103 | :param env_name: str - name of environment 104 | :param seed: int - random seed of environment 105 | :num_process: int - number of parallel environment 106 | :param log_dir: str - path to log directory 107 | :param device: str - 'cuda' or 'cpu' 108 | :param allow_early_reset: bool - if apply TimeLimitMask on environments, set this param to True 109 | :param max_episode_steps: int - maximum number of action in 1 episode 110 | :param args: ArgsParser - use to specifiy environment args 111 | :param train: bool - determine if we are using created to train or evaluate 112 | if we're training, all environment share same random seed to repeat input sequence 113 | otherwise, we diversify the random seed 114 | """ 115 | envs = [ 116 | make_env(env_id=env_name, seed=seed, rank=i, log_dir=log_dir, 117 | allow_early_resets=allow_early_resets, 118 | max_episode_steps=max_episode_steps, args=args, train=train) 119 | for i in range(num_processes) 120 | ] 121 | 122 | if len(envs) > 1: 123 | envs = ShmemVecEnv(envs, context='fork') 124 | else: 125 | envs = DummyVecEnv(envs) 126 | 127 | envs = VecPyTorch(envs, device) 128 | 129 | return envs 130 | 131 | 132 | def load_balance_states_to_inputs(states): 133 | """ 134 | Transform states of LoadBalance Env to inputs sequences 135 | :param states: torch.Tensor of shape T x N_processes x (Num_servers + 2) 136 | :return: torch.Tensor of shape T x N_processes x 2 137 | """ 138 | return states[:, :, -2:] 139 | 140 | 141 | # Checks whether done was caused my timit limits or not 142 | class TimeLimitMask(gym.Wrapper): 143 | def step(self, action): 144 | obs, rew, done, info = self.env.step(action) 145 | if done and self.env._max_episode_steps == self.env._elapsed_steps: 146 | info['bad_transition'] = True 147 | 148 | return obs, rew, done, info 149 | 150 | def reset(self, **kwargs): 151 | return self.env.reset(**kwargs) 152 | 153 | 154 | # Can be used to test recurrent policies for Reacher-v2 155 | class MaskGoal(gym.ObservationWrapper): 156 | def observation(self, observation): 157 | if self.env._elapsed_steps > 0: 158 | observation[-2:] = 0 159 | return observation 160 | 161 | 162 | class TransposeObs(gym.ObservationWrapper): 163 | def __init__(self, env=None): 164 | """ 165 | Transpose observation space (base class) 166 | """ 167 | super(TransposeObs, self).__init__(env) 168 | 169 | 170 | class TransposeImage(TransposeObs): 171 | def __init__(self, env=None, op=[2, 0, 1]): 172 | """ 173 | Transpose observation space for images 174 | """ 175 | super(TransposeImage, self).__init__(env) 176 | assert len(op) == 3, "Error: Operation, " + str(op) + ", must be dim3" 177 | self.op = op 178 | obs_shape = self.observation_space.shape 179 | self.observation_space = Box( 180 | self.observation_space.low[0, 0, 0], 181 | self.observation_space.high[0, 0, 0], [ 182 | obs_shape[self.op[0]], obs_shape[self.op[1]], 183 | obs_shape[self.op[2]] 184 | ], 185 | dtype=self.observation_space.dtype) 186 | 187 | def observation(self, ob): 188 | return ob.transpose(self.op[0], self.op[1], self.op[2]) 189 | 190 | 191 | class VecPyTorch(VecEnvWrapper): 192 | def __init__(self, venv, device): 193 | """Return only every `skip`-th frame""" 194 | super(VecPyTorch, self).__init__(venv) 195 | self.device = device 196 | # TODO: Fix data types 197 | 198 | def reset(self): 199 | obs = self.venv.reset() 200 | obs = 
torch.from_numpy(obs).float().to(self.device) 201 | return obs 202 | 203 | def step_async(self, actions): 204 | if isinstance(actions, torch.LongTensor): 205 | # Squeeze the dimension for discrete actions 206 | actions = actions.squeeze(1) 207 | actions = actions.cpu().numpy() 208 | self.venv.step_async(actions) 209 | 210 | def step_wait(self): 211 | obs, reward, done, info = self.venv.step_wait() 212 | obs = torch.from_numpy(obs).float().to(self.device) 213 | reward = torch.from_numpy(reward).unsqueeze(dim=1).float() 214 | return obs, reward, done, info 215 | 216 | 217 | class VecNormalize(VecNormalize_): 218 | def __init__(self, *args, **kwargs): 219 | super(VecNormalize, self).__init__(*args, **kwargs) 220 | self.training = True 221 | 222 | def _obfilt(self, obs, update=True): 223 | if self.ob_rms: 224 | if self.training and update: 225 | self.ob_rms.update(obs) 226 | obs = np.clip((obs - self.ob_rms.mean) / 227 | np.sqrt(self.ob_rms.var + self.epsilon), 228 | -self.clipob, self.clipob) 229 | return obs 230 | else: 231 | return obs 232 | 233 | def train(self): 234 | self.training = True 235 | 236 | def eval(self): 237 | self.training = False 238 | 239 | 240 | # Derived from 241 | # https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_frame_stack.py 242 | class VecPyTorchFrameStack(VecEnvWrapper): 243 | def __init__(self, venv, nstack, device=None): 244 | self.venv = venv 245 | self.nstack = nstack 246 | 247 | wos = venv.observation_space # wrapped ob space 248 | self.shape_dim0 = wos.shape[0] 249 | 250 | low = np.repeat(wos.low, self.nstack, axis=0) 251 | high = np.repeat(wos.high, self.nstack, axis=0) 252 | 253 | if device is None: 254 | device = torch.device('cpu') 255 | self.stacked_obs = torch.zeros((venv.num_envs, ) + 256 | low.shape).to(device) 257 | 258 | observation_space = gym.spaces.Box( 259 | low=low, high=high, dtype=venv.observation_space.dtype) 260 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 261 | 262 | def step_wait(self): 263 | obs, rews, news, infos = self.venv.step_wait() 264 | self.stacked_obs[:, :-self.shape_dim0] = \ 265 | self.stacked_obs[:, self.shape_dim0:].clone() 266 | for (i, new) in enumerate(news): 267 | if new: 268 | self.stacked_obs[i] = 0 269 | self.stacked_obs[:, -self.shape_dim0:] = obs 270 | return self.stacked_obs, rews, news, infos 271 | 272 | def reset(self): 273 | obs = self.venv.reset() 274 | if torch.backends.cudnn.deterministic: 275 | self.stacked_obs = torch.zeros(self.stacked_obs.shape) 276 | else: 277 | self.stacked_obs.zero_() 278 | self.stacked_obs[:, -self.shape_dim0:] = obs 279 | return self.stacked_obs 280 | 281 | def close(self): 282 | self.venv.close() 283 | -------------------------------------------------------------------------------- /core/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser(description='RL') 8 | parser.add_argument( 9 | '--algo', default='lacie_a2c_memory', help='algorithm to use: a2c | ppo | acktr') 10 | parser.add_argument( 11 | '--lr', type=float, default=7e-4, help='learning rate (default: 7e-4)') 12 | parser.add_argument( 13 | '--cpc-lr', type=float, default=0.001, help='learning rate for contrastive module (default: 1e-3)') 14 | parser.add_argument( 15 | '--critic-lr', type=float, default=1e-3, help='learning rate of critic (default: 1e-3)') 16 | parser.add_argument( 17 | '--actor-lr', type=float, 
default=1e-3, help='learning rate of actor (default: 1e-3)') 18 | parser.add_argument( 19 | '--eps', 20 | type=float, 21 | default=1e-5, 22 | help='RMSprop optimizer epsilon (default: 1e-5)') 23 | parser.add_argument( 24 | '--alpha', 25 | type=float, 26 | default=0.99, 27 | help='RMSprop optimizer apha (default: 0.99)') 28 | parser.add_argument( 29 | '--gamma', 30 | type=float, 31 | default=0.99, 32 | help='discount factor for rewards (default: 0.99)') 33 | parser.add_argument( 34 | '--use-gae', 35 | action='store_true', 36 | default=False, 37 | help='use generalized advantage estimation') 38 | parser.add_argument( 39 | '--gae-lambda', 40 | type=float, 41 | default=0.95, 42 | help='gae lambda parameter (default: 0.95)') 43 | parser.add_argument( 44 | '--entropy-coef', 45 | type=float, 46 | default=0.01, 47 | help='entropy term coefficient (default: 0.01)') 48 | parser.add_argument( 49 | '--value-loss-coef', 50 | type=float, 51 | default=0.5, 52 | help='value loss coefficient (default: 0.5)') 53 | parser.add_argument( 54 | '--regularize-coef', 55 | type=float, 56 | default=0.05, 57 | help='cpc regularize loss coefficient (default: 0.05)') 58 | parser.add_argument( 59 | '--max-grad-norm', 60 | type=float, 61 | default=0.5, 62 | help='max norm of gradients (default: 0.5)') 63 | parser.add_argument( 64 | '--seed', type=int, default=1, help='random seed (default: 1)') 65 | parser.add_argument( 66 | '--cuda-deterministic', 67 | action='store_true', 68 | default=False, 69 | help="sets flags for determinism when using CUDA (potentially slow!)") 70 | parser.add_argument( 71 | '--num-processes', 72 | type=int, 73 | default=16, 74 | help='how many training CPU processes to use (default: 16)') 75 | parser.add_argument( 76 | '--num-steps', 77 | type=int, 78 | default=100, 79 | help='number of forward steps in A2C (default: 100)') 80 | parser.add_argument( 81 | '--ppo-epoch', 82 | type=int, 83 | default=4, 84 | help='number of ppo epochs (default: 4)') 85 | parser.add_argument( 86 | '--num-mini-batch', 87 | type=int, 88 | default=32, 89 | help='number of batches for ppo (default: 32)') 90 | parser.add_argument( 91 | '--clip-param', 92 | type=float, 93 | default=0.2, 94 | help='ppo clip parameter (default: 0.2)') 95 | parser.add_argument( 96 | '--log-interval', 97 | type=int, 98 | default=10, 99 | help='log interval, one log per n updates (default: 10)') 100 | parser.add_argument( 101 | '--save-interval', 102 | type=int, 103 | default=100, 104 | help='save interval, one save per n updates (default: 100)') 105 | parser.add_argument( 106 | '--eval-interval', 107 | type=int, 108 | default=None, 109 | help='eval interval, one eval per n updates (default: None)') 110 | parser.add_argument( 111 | '--num-env-steps', 112 | type=int, 113 | default=10e6, 114 | help='number of environment steps to train (default: 10e6)') 115 | parser.add_argument( 116 | '--env-name', 117 | default='load_balance', 118 | help='environment to train on (default: load_balance)') 119 | parser.add_argument( 120 | '--log-dir', 121 | default='logs', 122 | help='directory to save agent logs (default: logs)') 123 | parser.add_argument( 124 | '--save-dir', 125 | default='./trained_models/', 126 | help='directory to save agent logs (default: ./trained_models/)') 127 | parser.add_argument( 128 | '--resume-dir', 129 | default=None, 130 | type=str, 131 | help='directory to trained agent for resuming (default: None)') 132 | parser.add_argument( 133 | '--no-cuda', 134 | action='store_true', 135 | default=False, 136 | help='disables CUDA training') 
137 | parser.add_argument( 138 | '--use-proper-time-limits', 139 | action='store_true', 140 | default=False, 141 | help='compute returns taking into account time limits') 142 | parser.add_argument( 143 | '--max-episode-steps', 144 | default=1000, 145 | type=int, 146 | help='maximum number of steps per episode of environment (default: 1000)') 147 | parser.add_argument( 148 | '--num-frame-stack', 149 | default=1, 150 | help='number of observation that will be grouped together (default: 4)') 151 | parser.add_argument( 152 | '--recurrent-policy', 153 | action='store_true', 154 | default=False, 155 | help='use a recurrent policy') 156 | parser.add_argument( 157 | '--use-linear-lr-decay', 158 | action='store_true', 159 | default=False, 160 | help='use a linear schedule on the learning rate') 161 | 162 | # IMITATION LEARNING 163 | parser.add_argument( 164 | '--use-imitation-learning', 165 | action='store_true', 166 | default=False, 167 | help='if True then apply imitation learning during training') 168 | parser.add_argument( 169 | '--il-coef', 170 | type=float, 171 | default=10, 172 | help='coefficient of imitation learning (default: 10)') 173 | 174 | # META INPUT-DEPENDENT BASELINE 175 | parser.add_argument( 176 | '--fix-job-sequence', 177 | action='store_true', 178 | default=False, 179 | help='if True then jobs arriving to servers will be fixed for every episode') 180 | parser.add_argument( 181 | '--num-inner-steps', 182 | type=int, 183 | default=4, 184 | help='number of gradient steps for adapting to new input sequences (default: 4)') 185 | parser.add_argument( 186 | '--adapt-lr', 187 | type=float, 188 | default=5e-3, 189 | help='learning rate of innerloop when adapting to new input sequences (default: 2e-3)') 190 | parser.add_argument( 191 | '--use-memory-to-pred-weights', 192 | action='store_true', 193 | default=False, 194 | help='if True then use memory in storage to predict weights of advantages') 195 | 196 | # LACIE 197 | parser.add_argument( 198 | '--lacie-buffer-size', 199 | type=int, 200 | default=400, 201 | help='Size of buffer contains obs, actions for learning hindsight ratio (default: 400)') 202 | parser.add_argument( 203 | '--lacie-batch-size', 204 | type=int, 205 | default=64, 206 | help='Batch size of every update to learn hindsight ratio via contrastive loss (default: 64)') 207 | parser.add_argument( 208 | '--lacie-num-iter', 209 | type=int, 210 | default=10, 211 | help='Number of iterations to learn hindsight ratio each update (default: 10)') 212 | 213 | # LOAD BALANCE ENVIRONMENT 214 | parser.add_argument( 215 | '--load-balance-service-rates', 216 | default=[0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95, 1.05], 217 | nargs='+', 218 | type=float, 219 | help='Service rates of each servers of load balance environment') 220 | parser.add_argument( 221 | '--num-curriculum-time', 222 | default=65, 223 | type=int, 224 | help='number of time we would like to increase the num-stream-jobs in load balance env (default: 65)') 225 | parser.add_argument( 226 | '--num-stream-jobs-factor', 227 | default=1.1, 228 | type=float, 229 | help='exponentially increase the number of stream jobs in environment after some interval (default: 1.1)') 230 | parser.add_argument( 231 | '--job-size-norm-factor', 232 | default=1000, 233 | type=float, 234 | help='normalize factor of job size in load balance env (default: 10)') 235 | parser.add_argument( 236 | '--server-load-norm-factor', 237 | default=5000, 238 | type=float, 239 | help='normalize factor of server load in load balance env (default: 50)') 
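    # Illustrative note (assumed behaviour, not shown in this file): the *-norm-factor flags in
    # this section rescale the raw load-balance observations and rewards before they reach the
    # agent, roughly along the lines of
    #     job_size    /= args.job_size_norm_factor
    #     server_load /= args.server_load_norm_factor
    #     reward      /= args.reward_norm_factor
    # while the highest-* flags further below clip the rescaled values. The actual wiring is
    # expected to live in the environment wrappers (core/envs/load_balance_wrappers.py).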
240 | parser.add_argument( 241 | '--elapsed-time-norm-factor', 242 | default=55, 243 | type=float, 244 | help='normalize factor of elapsed time between 2 consecutive events in load balance env (default: 55)') 245 | parser.add_argument( 246 | '--highest-server-obs', 247 | default=20, 248 | type=float, 249 | help='Clip server having higher load than this value in load balance environment (default: 2000)') 250 | parser.add_argument( 251 | '--highest-job-obs', 252 | default=10, 253 | type=float, 254 | help='Clip job having greater size than this value in load balance environment (default: 1000)') 255 | parser.add_argument( 256 | '--highest-elapsed-time', 257 | default=10, 258 | type=float, 259 | help='Clip elapsed time longer than this value in load balance environment (default: 1000)') 260 | parser.add_argument( 261 | '--reward-norm-factor', 262 | default=10000, 263 | type=float, 264 | help='normalize factor of reward in training (default: 1000)') 265 | parser.add_argument( 266 | '--max-random-init-steps', 267 | default=1, 268 | type=int, 269 | help='maximum number of random initial steps after resetting (default: 50)') 270 | parser.add_argument( 271 | '--num-stream-jobs', 272 | default=1000, 273 | type=int, 274 | help='number of stream jobs of load balance env in training (default: 1000)') 275 | 276 | args = parser.parse_args() 277 | 278 | args.cuda = not args.no_cuda and torch.cuda.is_available() 279 | 280 | assert args.algo in ['a2c', 'ppo', 'acktr', 281 | 'mib_a2c', 'mib_ppo', 'lacie_a2c', 'lacie_ppo', 'lacie_a2c_memory', 'lacie_ppo_memory'] 282 | if args.recurrent_policy: 283 | assert args.algo in ['a2c', 'ppo', 'mib_a2c', 'mib_ppo', 'lacie_a2c', 'lacie_ppo', 'lacie_a2c_memory', 'lacie_ppo_memory'], \ 284 | 'Recurrent policy is not implemented for ACKTR' 285 | 286 | return args 287 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /core/algorithms/lacie/lacie_a2c.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from itertools import chain 5 | from torch import optim 6 | from core.algorithms.lacie.base_lacie import LacieAlgo 7 | from core.storage import LacieStorage 8 | 9 | 10 | class LACIE_A2C(LacieAlgo): 11 | """ 12 | Meta Input-dependent Baseline A2C. 
\ 13 | This A2C class leverages input-dependent baseline, which is learned with meta learning, \ 14 | to reduce variance when updating parameters 15 | """ 16 | 17 | def __init__(self, 18 | actor_critic, 19 | value_coef, 20 | entropy_coef, 21 | regularize_coef, 22 | eps=None, 23 | alpha=None, 24 | state_to_input_seq=None, 25 | lr=1e-3, 26 | max_grad_norm=None, 27 | expert=None, 28 | il_coef=1, 29 | num_cpc_steps=10, 30 | cpc_lr=1e-3): 31 | super().__init__(actor_critic=actor_critic, 32 | lr=lr, 33 | value_coef=value_coef, 34 | entropy_coef=entropy_coef, 35 | regularize_coef=regularize_coef, 36 | state_to_input_seq=state_to_input_seq, 37 | expert=expert, 38 | il_coef=il_coef, 39 | num_cpc_steps=num_cpc_steps, 40 | cpc_lr=cpc_lr) 41 | self.max_grad_norm = max_grad_norm 42 | 43 | def update(self, rollouts): 44 | obs_shape = rollouts.obs.size()[2:] 45 | action_shape = rollouts.actions.size()[-1] 46 | num_steps, num_processes, _ = rollouts.rewards.size() 47 | 48 | # Estimate baseline 49 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 50 | rollouts.obs[:-1].view(-1, *obs_shape), 51 | rollouts.recurrent_hidden_states[:-1].view( 52 | -1, self.actor_critic.recurrent_hidden_state_size), 53 | rollouts.masks[:-1].view(-1, 1), 54 | rollouts.actions.view(-1, action_shape)) 55 | values = values.view(num_steps, num_processes, 1) 56 | action_log_probs = action_log_probs.view(num_steps, num_processes, 1) 57 | 58 | advantages = rollouts.returns[:-1] - values 59 | returns = rollouts.returns[:-1] 60 | 61 | # Value loss for updating Critic Net 62 | value_loss = advantages.pow(2).mean() 63 | 64 | # LEARNING CONTRASTIVE PREDICTIVE MODEL 65 | # compute contrastive loss and accuracy 66 | contrastive_loss, contrastive_accuracy, regularize_loss = self.compute_contrastive_loss( 67 | rollouts.obs, rollouts.actions, rollouts.masks, returns) 68 | contrastive_loss = contrastive_loss.item() 69 | regularize_loss = regularize_loss.item() 70 | # computed weighted advantage according to its dependency with input sequences 71 | 72 | # learn cpc model for n steps 73 | for _ in range(self.num_cpc_steps): 74 | cpc_loss, _, cpc_regularize_loss = self.compute_contrastive_loss( 75 | rollouts.obs, rollouts.actions, rollouts.masks, returns) 76 | 77 | self.cpc_optimizer.zero_grad() 78 | (cpc_loss + self.regularize_coef * cpc_regularize_loss).backward() 79 | 80 | # nn.utils.clip_grad_norm_(chain(self.advantage_encoder.parameters(), 81 | # self.input_seq_encoder.parameters(), 82 | # self.state_encoder.parameters(), 83 | # self.condition_encoder.parameters(), 84 | # self.action_encoder.parameters()), 85 | # self.max_grad_norm) 86 | 87 | self.cpc_optimizer.step() 88 | 89 | # IMPORTANCE: we need to compute the weighted before learn cpc model 90 | # FIXME: Move to training to top to verify if the model can estimate density ratio 91 | weighted_advantages = self.compute_weighted_advantages( 92 | rollouts.obs, rollouts.actions, rollouts.masks, returns) - values 93 | 94 | # Action loss of Actor Net 95 | action_loss = -(weighted_advantages.detach() * action_log_probs).mean() 96 | 97 | # IMITATION LEARNING 98 | imitation_loss, imitation_accuracy = torch.tensor( 99 | 0).to(rollouts.obs.device), 0 100 | if self.expert: 101 | imitation_loss, imitation_accuracy = self.imitation_learning( 102 | rollouts.obs[:-1].view(-1, *obs_shape), 103 | rollouts.recurrent_hidden_states[0].view( 104 | -1, self.actor_critic.recurrent_hidden_state_size), 105 | rollouts.masks[:-1].view(-1, 1), 106 | self.expert) 107 | 108 | 
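        # Joint actor-critic step: the imitation loss (only if an expert is supplied), the value
        # loss, the policy-gradient loss built from the hindsight-weighted advantages above, and
        # the entropy bonus are combined into a single objective and back-propagated through the
        # actor-critic parameters only; the CPC encoders were already updated via cpc_optimizer.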
self.optimizer.zero_grad() 109 | 110 | (imitation_loss * self.il_coef + value_loss * self.value_coef + action_loss - 111 | dist_entropy * self.entropy_coef).backward() 112 | 113 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 114 | self.max_grad_norm) 115 | 116 | self.optimizer.step() 117 | self.after_update() 118 | 119 | return { 120 | 'value loss': value_loss.item(), 121 | 'action loss': action_loss.item(), 122 | 'entropy loss': dist_entropy.item(), 123 | 'imitation loss': imitation_loss.item(), 124 | 'imitation accuracy': imitation_accuracy, 125 | 'contrastive loss': contrastive_loss, 126 | 'contrastive accuracy': contrastive_accuracy, 127 | 'regularize loss': regularize_loss 128 | } 129 | 130 | 131 | class LACIE_A2C_Memory(LACIE_A2C): 132 | def __init__(self, 133 | actor_critic, 134 | value_coef, 135 | entropy_coef, 136 | regularize_coef, 137 | eps=None, 138 | alpha=None, 139 | state_to_input_seq=None, 140 | lr=1e-3, 141 | max_grad_norm=None, 142 | expert=None, 143 | il_coef=1, 144 | num_cpc_steps=10, 145 | lacie_batch_size=64, 146 | lacie_buffer=None, 147 | use_memory_to_pred_weights=False, 148 | cpc_lr=1e-3): 149 | super().__init__(actor_critic, 150 | value_coef, 151 | entropy_coef, 152 | regularize_coef, 153 | eps, 154 | alpha, 155 | state_to_input_seq, 156 | lr, 157 | max_grad_norm, 158 | expert, 159 | il_coef, 160 | num_cpc_steps, 161 | cpc_lr) 162 | self.lacie_batch_size = lacie_batch_size 163 | self.lacie_buffer = lacie_buffer 164 | self.use_memory_to_pred_weights = use_memory_to_pred_weights 165 | 166 | def update(self, rollouts): 167 | obs_shape = rollouts.obs.size()[2:] 168 | action_shape = rollouts.actions.size()[-1] 169 | num_steps, num_processes, _ = rollouts.rewards.size() 170 | 171 | # Estimate baseline 172 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 173 | rollouts.obs[:-1].view(-1, *obs_shape), 174 | rollouts.recurrent_hidden_states[:-1].view( 175 | -1, self.actor_critic.recurrent_hidden_state_size), 176 | rollouts.masks[:-1].view(-1, 1), 177 | rollouts.actions.view(-1, action_shape)) 178 | values = values.view(num_steps, num_processes, 1) 179 | action_log_probs = action_log_probs.view(num_steps, num_processes, 1) 180 | 181 | advantages = rollouts.returns[:-1] - values 182 | returns = rollouts.returns[:-1] 183 | 184 | # Value loss for updating Critic Net 185 | value_loss = advantages.pow(2).mean() 186 | 187 | # LEARNING CONTRASTIVE PREDICTIVE MODEL 188 | # update LACIE_Storage 189 | self.lacie_buffer.insert(rollouts, advantages.detach()) 190 | # compute contrastive loss and accuracy 191 | contrastive_loss, contrastive_accuracy, regularize_loss = self.compute_contrastive_loss( 192 | rollouts.obs, rollouts.actions, rollouts.masks, advantages.detach()) 193 | contrastive_loss = contrastive_loss.item() 194 | regularize_loss = regularize_loss.item() 195 | 196 | # computed weighted advantage according to its dependency with input sequences 197 | # learn cpc model for n steps 198 | for _ in range(self.num_cpc_steps): 199 | data = self.lacie_buffer.sample() 200 | obs, actions, masks, sample_advantages = data['obs'], data['actions'], data['masks'], data['advantages'] 201 | cpc_loss, _, cpc_regularize_loss = self.compute_contrastive_loss( 202 | obs, actions, masks, sample_advantages) 203 | 204 | self.cpc_optimizer.zero_grad() 205 | (cpc_loss + self.regularize_coef * cpc_regularize_loss).backward() 206 | 207 | # nn.utils.clip_grad_norm_(chain(self.advantage_encoder.parameters(), 208 | # self.input_seq_encoder.parameters(), 209 | # 
self.state_encoder.parameters(), 210 | # self.condition_encoder.parameters(), 211 | # self.action_encoder.parameters()), 212 | # self.max_grad_norm) 213 | 214 | self.cpc_optimizer.step() 215 | 216 | # IMPORTANCE: we need to compute the weighted before learn cpc model 217 | # FIXME: Move the cpc training on top to verify if it can learn useful estimation 218 | if not self.use_memory_to_pred_weights: 219 | weighted_advantages = self.compute_weighted_advantages( 220 | rollouts.obs, rollouts.actions, rollouts.masks, advantages.detach()) 221 | else: 222 | data = self.lacie_buffer.sample_most_recent() 223 | obs, actions, masks, sample_advantages = data['obs'], data[ 224 | 'actions'], data['masks'], data['advantages'] 225 | weighted_advantages = self.compute_weighted_advantages( 226 | obs, actions, masks, sample_advantages, rollouts.actions.shape[1]) 227 | 228 | # Action loss of Actor Net 229 | action_loss = -(weighted_advantages.detach() * action_log_probs).mean() 230 | 231 | # IMITATION LEARNING 232 | imitation_loss, imitation_accuracy = torch.tensor( 233 | 0).to(rollouts.obs.device), 0 234 | if self.expert: 235 | imitation_loss, imitation_accuracy = self.imitation_learning( 236 | rollouts.obs[:-1].view(-1, *obs_shape), 237 | rollouts.recurrent_hidden_states[0].view( 238 | -1, self.actor_critic.recurrent_hidden_state_size), 239 | rollouts.masks[:-1].view(-1, 1), 240 | self.expert) 241 | 242 | self.optimizer.zero_grad() 243 | 244 | (imitation_loss * self.il_coef + value_loss * self.value_coef + action_loss - 245 | dist_entropy * self.entropy_coef).backward() 246 | 247 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 248 | self.max_grad_norm) 249 | 250 | self.optimizer.step() 251 | self.after_update() 252 | 253 | return { 254 | 'value loss': value_loss.item(), 255 | 'action loss': action_loss.item(), 256 | 'entropy loss': dist_entropy.item(), 257 | 'imitation loss': imitation_loss.item(), 258 | 'imitation accuracy': imitation_accuracy, 259 | 'contrastive loss': contrastive_loss, 260 | 'contrastive accuracy': contrastive_accuracy, 261 | 'regularize loss': regularize_loss 262 | } 263 | -------------------------------------------------------------------------------- /utils/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # switch backend in driver file 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | 7 | import os 8 | import os.path as osp 9 | import glob 10 | from scipy.signal import medfilt 11 | 12 | 13 | def smooth_reward_curve(x, y): 14 | # Halfwidth of our smoothing convolution 15 | halfwidth = min(31, int(np.ceil(len(x) / 30))) 16 | k = halfwidth 17 | xsmoo = x[k:-k] 18 | ysmoo = np.convolve(y, np.ones(2 * k + 1), mode='valid') / \ 19 | np.convolve(np.ones_like(y), np.ones(2 * k + 1), mode='valid') 20 | downsample = max(int(np.floor(len(xsmoo) / 1e3)), 1) 21 | return xsmoo[::downsample], ysmoo[::downsample] 22 | 23 | 24 | def fix_point(x, y, interval): 25 | np.insert(x, 0, 0) 26 | np.insert(y, 0, 0) 27 | 28 | fx, fy = [], [] 29 | pointer = 0 30 | 31 | ninterval = int(max(x) / interval + 1) 32 | 33 | for i in range(ninterval): 34 | tmpx = interval * i 35 | 36 | while pointer + 1 < len(x) and tmpx > x[pointer + 1]: 37 | pointer += 1 38 | 39 | if pointer + 1 < len(x): 40 | alpha = (y[pointer + 1] - y[pointer]) / \ 41 | (x[pointer + 1] - x[pointer]) 42 | tmpy = y[pointer] + alpha * (tmpx - x[pointer]) 43 | fx.append(tmpx) 44 | fy.append(tmpy) 45 | 46 | return fx, fy 47 | 48 | 49 | def 
load_reward_data(indir, smooth, bin_size): 50 | datas = [] 51 | infiles = glob.glob(os.path.join(indir, '*.monitor.csv')) 52 | 53 | for inf in infiles: 54 | with open(inf, 'r') as f: 55 | f.readline() 56 | f.readline() 57 | for line in f: 58 | tmp = line.split(',') 59 | t_time = float(tmp[2]) 60 | tmp = [t_time, int(tmp[1]), float(tmp[0])] 61 | datas.append(tmp) 62 | 63 | datas = sorted(datas, key=lambda d_entry: d_entry[0]) 64 | result = [] 65 | timesteps = 0 66 | for i in range(len(datas)): 67 | result.append([timesteps, datas[i][-1]]) 68 | timesteps += datas[i][1] 69 | 70 | if len(result) < bin_size: 71 | return [None, None] 72 | 73 | x, y = np.array(result)[:, 0], np.array(result)[:, 1] 74 | 75 | if smooth == 1: 76 | x, y = smooth_reward_curve(x, y) 77 | 78 | if smooth == 2: 79 | y = medfilt(y, kernel_size=9) 80 | 81 | x, y = fix_point(x, y, bin_size) 82 | return [x, y] 83 | 84 | # TODO: only works for Experience Replay style training for now 85 | 86 | 87 | def load_custom_data(indir, stat_file, smooth, bin_size): 88 | datas = [] 89 | infiles = glob.glob(os.path.join(indir, stat_file)) 90 | 91 | for inf in infiles: # should be 1 92 | with open(inf, 'r') as f: 93 | for line in f: 94 | tmp = line.split(',') 95 | tmp = [int(tmp[0]), float(tmp[1])] 96 | datas.append(tmp) 97 | 98 | datas = sorted(datas, key=lambda d_entry: d_entry[0]) 99 | result = [] 100 | for i in range(len(datas)): 101 | result.append([datas[i][0], datas[i][1]]) 102 | 103 | if len(result) < bin_size: 104 | return [None, None] 105 | 106 | x, y = np.array(result)[:, 0], np.array(result)[:, 1] 107 | 108 | if smooth == 1: 109 | x, y = smooth_reward_curve(x, y) 110 | 111 | if smooth == 2: 112 | y = medfilt(y, kernel_size=9) 113 | 114 | x, y = fix_point(x, y, bin_size) 115 | return [x, y] 116 | 117 | # TODO: only works for Experience Replay style training for now 118 | 119 | 120 | def load_action_data(indir, smooth, bin_size): 121 | datas = [] 122 | infiles = glob.glob(os.path.join(indir, 'action_log.csv')) 123 | 124 | for inf in infiles: # should be 1 125 | with open(inf, 'r') as f: 126 | for line in f: 127 | tmp = line.split(',') 128 | tmp = [int(tmp[0])] + [float(tmp[i]) 129 | for i in range(1, len(tmp))] 130 | datas.append(tmp) 131 | 132 | datas = sorted(datas, key=lambda d_entry: d_entry[0]) 133 | result = datas 134 | # for i in range(len(datas)): 135 | # result.append([datas[i][0], datas[i][1]]) 136 | 137 | if len(result) < bin_size: 138 | return [None, None] 139 | 140 | x, y = np.array(result)[:, 0], np.array(result)[:, 1:] 141 | 142 | '''if smooth == 1: 143 | x, y = smooth_reward_curve(x, y) 144 | 145 | if smooth == 2: 146 | y = medfilt(y, kernel_size=9) 147 | 148 | x, y = fix_point(x, y, bin_size)''' 149 | return [x, np.transpose(y)] 150 | 151 | 152 | def visdom_plot(viz, win, folder, game, name, num_steps, bin_size=100, smooth=1): 153 | tx, ty = load_reward_data(folder, smooth, bin_size) 154 | if tx is None or ty is None: 155 | return win 156 | 157 | fig = plt.figure() 158 | plt.plot(tx, ty, label="{}".format(name)) 159 | 160 | tick_fractions = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1.0]) 161 | ticks = tick_fractions * num_steps 162 | tick_names = ["{:.0e}".format(tick) for tick in ticks] 163 | plt.xticks(ticks, tick_names) 164 | plt.xlim(0, num_steps * 1.01) 165 | 166 | plt.xlabel('Number of Timesteps') 167 | plt.ylabel('Rewards') 168 | 169 | plt.title(game) 170 | plt.legend(loc=4) 171 | plt.show() 172 | plt.draw() 173 | 174 | image = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') 175 | image = 
image.reshape(fig.canvas.get_width_height()[::-1] + (3, )) 176 | plt.close(fig) 177 | 178 | # Show it in visdom 179 | image = np.transpose(image, (2, 0, 1)) 180 | 181 | return viz.image(image, win=win) 182 | 183 | 184 | def plot(folder, game, name, num_steps, bin_size=100, smooth=1): 185 | matplotlib.rcParams.update({'font.size': 20}) 186 | tx, ty = load_reward_data(folder, smooth, bin_size) 187 | 188 | if tx is None or ty is None: 189 | return 190 | 191 | fig = plt.figure(figsize=(20, 5)) 192 | plt.plot(tx, ty, label="{}".format(name)) 193 | 194 | tick_fractions = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1.0]) 195 | ticks = tick_fractions * num_steps 196 | tick_names = ["{:.0e}".format(tick) for tick in ticks] 197 | plt.xticks(ticks, tick_names) 198 | plt.xlim(0, num_steps * 1.01) 199 | 200 | plt.xlabel('Number of Timesteps') 201 | plt.ylabel('Rewards') 202 | 203 | plt.title(game) 204 | plt.legend(loc=4) 205 | plt.savefig(osp.join(folder, 'plot.png')) 206 | plt.close() 207 | # plt.show() 208 | 209 | 210 | def make_patch_spines_invisible(ax): 211 | ax.set_frame_on(True) 212 | ax.patch.set_visible(False) 213 | for sp in ax.spines.values(): 214 | sp.set_visible(False) 215 | 216 | 217 | def plot_all_data(folder, game, name, num_steps, bin_size=(10, 100, 100, 1), smooth=1, time=None, save_filename='results.png', ipynb=False): 218 | matplotlib.rcParams.update({'font.size': 20}) 219 | params = { 220 | 'xtick.labelsize': 20, 221 | 'ytick.labelsize': 15, 222 | 'legend.fontsize': 15 223 | } 224 | plt.rcParams.update(params) 225 | 226 | tx, ty = load_reward_data(folder, smooth, bin_size[0]) 227 | 228 | if tx is None or ty is None: 229 | return 230 | 231 | if time is not None: 232 | title = 'Avg. Last 10 Rewards: ' + \ 233 | str(np.round(np.mean(ty[-10]))) + ' || ' + \ 234 | game + ' || Elapsed Time: ' + str(time) 235 | else: 236 | title = 'Avg. Last 10 Rewards: ' + \ 237 | str(np.round(np.mean(ty[-10]))) + ' || ' + game 238 | 239 | tick_fractions = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1.0]) 240 | ticks = tick_fractions * num_steps 241 | tick_names = ["{:.0e}".format(tick) for tick in ticks] 242 | 243 | fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(20, 15), subplot_kw=dict( 244 | xticks=ticks, xlim=(0, num_steps*1.15), xlabel='Timestep', title=title)) 245 | ax1.set_xticklabels(tick_names) 246 | ax2.set_xticklabels(tick_names) 247 | ax3.set_xticklabels(tick_names) 248 | 249 | ax1.set_ylabel('Reward') 250 | 251 | p1, = ax1.plot(tx, ty, label="Reward") 252 | #lines = [p1] 253 | 254 | ax1.yaxis.label.set_color(p1.get_color()) 255 | ax1.tick_params(axis='y', colors=p1.get_color()) 256 | 257 | ax1.legend([p1], [p1.get_label()], loc=4) 258 | 259 | # Load td data if it exists 260 | tx, ty = load_custom_data(folder, 'td.csv', smooth, bin_size[1]) 261 | 262 | ax2.set_title('Loss vs Timestep') 263 | 264 | if tx is not None or ty is not None: 265 | ax2.set_ylabel('Avg .Temporal Difference') 266 | p2, = ax2.plot(tx, ty, 'r-', label='Avg. TD') 267 | g2_lines = [p2] 268 | 269 | ax2.yaxis.label.set_color(p2.get_color()) 270 | ax2.tick_params(axis='y', colors=p2.get_color()) 271 | 272 | ax2.legend(g2_lines, [l.get_label() for l in g2_lines], loc=4) 273 | 274 | # Load Sigma Parameter Data if it exists 275 | tx, ty = load_custom_data(folder, 'sig_param_mag.csv', smooth, bin_size[2]) 276 | 277 | if tx is not None or ty is not None: 278 | # need to update g2 title if sig data will be included 279 | ax2.set_title('Loss/Avg. Sigma Parameter Magnitude vs Timestep') 280 | 281 | ax4 = ax2.twinx() 282 | 283 | ax4.set_ylabel('Avg. 
Sigma Parameter Mag.') 284 | p4, = ax4.plot(tx, ty, 'g-', label='Avg. Sigma Mag.') 285 | g2_lines += [p4] 286 | 287 | ax4.yaxis.label.set_color(p4.get_color()) 288 | ax4.tick_params(axis='y', colors=p4.get_color()) 289 | 290 | #ax4.spines["right"].set_position(("axes", 1.05)) 291 | # make_patch_spines_invisible(ax4) 292 | # ax4.spines["right"].set_visible(True) 293 | 294 | # remake g2 legend because we have a new line 295 | ax2.legend(g2_lines, [l.get_label() for l in g2_lines], loc=4) 296 | 297 | # Load action selection data if it exists 298 | tx, ty = load_action_data(folder, smooth, bin_size[3]) 299 | 300 | ax3.set_title('Action Selection Frequency(%) vs Timestep') 301 | 302 | if tx is not None or ty is not None: 303 | ax3.set_ylabel('Action Selection Frequency(%)') 304 | labels = ['Action {}'.format(i) for i in range(ty.shape[0])] 305 | p3 = ax3.stackplot(tx, ty, labels=labels) 306 | 307 | base = 0.0 308 | for percent, index in zip(ty, range(ty.shape[0])): 309 | offset = base + percent[-1]/3.0 310 | ax3.annotate(str('{:.2f}'.format( 311 | ty[index][-1])), xy=(tx[-1], offset), color=p3[index].get_facecolor().ravel()) 312 | base += percent[-1] 313 | 314 | # ax3.yaxis.label.set_color(p3.get_color()) 315 | #ax3.tick_params(axis='y', colors=p3.get_color()) 316 | 317 | ax3.legend(loc=4) # remake g2 legend because we have a new line 318 | 319 | plt.tight_layout() # prevent label cutoff 320 | 321 | if ipynb: 322 | plt.show() 323 | else: 324 | plt.savefig(save_filename) 325 | plt.clf() 326 | plt.close() 327 | 328 | # return np.round(np.mean(ty[-10:])) 329 | 330 | 331 | def plot_reward(folder, game, name, num_steps, bin_size=10, smooth=1, time=None, save_filename='results.png', ipynb=False): 332 | matplotlib.rcParams.update({'font.size': 20}) 333 | tx, ty = load_reward_data(folder, smooth, bin_size) 334 | 335 | if tx is None or ty is None: 336 | return 337 | 338 | fig = plt.figure(figsize=(20, 5)) 339 | plt.plot(tx, ty, label="{}".format(name)) 340 | 341 | tick_fractions = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1.0]) 342 | ticks = tick_fractions * num_steps 343 | tick_names = ["{:.0e}".format(tick) for tick in ticks] 344 | plt.xticks(ticks, tick_names) 345 | plt.xlim(0, num_steps * 1.01) 346 | 347 | plt.xlabel('Number of Timesteps') 348 | plt.ylabel('Rewards') 349 | 350 | if time is not None: 351 | plt.title(game + ' || Last 10: ' + 352 | str(np.round(np.mean(ty[-10]))) + ' || Elapsed Time: ' + str(time)) 353 | else: 354 | plt.title(game + ' || Last 10: ' + str(np.round(np.mean(ty[-10])))) 355 | plt.legend(loc=4) 356 | if ipynb: 357 | plt.show() 358 | else: 359 | plt.savefig(save_filename) 360 | plt.clf() 361 | plt.close() 362 | 363 | return np.round(np.mean(ty[-10])) 364 | -------------------------------------------------------------------------------- /core/algorithms/lacie/base_lacie.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implement the algorithm `Learning to Assign Credit in Input-Driven Environment' 3 | """ 4 | from core.algorithms.base_algo import BaseAlgo 5 | from torch import optim 6 | from itertools import chain 7 | 8 | import torch 9 | import random 10 | import torch.nn as nn 11 | 12 | 13 | class LacieAlgo(BaseAlgo): 14 | """ 15 | Base class for LACIE algorithm. Support `cpc` (contrastive predictive coding) to estimate \ 16 | the independent between input-process and future states. 
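    In practice the `cpc` head scores how well an encoded (advantage, state, action) triple predicts \
    the realized input sequence; the per-step density ratio obtained from these scores is inverted, \
    clamped, and used to re-weight advantages (see `compute_weighted_advantages` below).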
17 | :param actor_critic: nn.Module - the actor critic object 18 | :param entropy_coef: float - weight of entropy loss 19 | :param max_grad_norm: float - maximum value of gradient 20 | :param n_steps: int - n-steps advantage estimation with hindsight 21 | :param state_to_input_seq: function - a function object that decompose input-processes from states\ 22 | the signature of function should be: foo(states) where states is torch.Tensor of shape \ 23 | T x N_processes x Obs_shape 24 | """ 25 | UPPER_BOUND_CLIP_THRESHOLD = 4 26 | LOWER_BOUND_CLIP_THRESHOLD = 1/100 27 | WEIGHT_CLIP_GROWTH_FACTOR = 1.002 28 | WEIGHT_CLIP_DECAY_FACTOR = 0.998 29 | CPC_HIDDEN_DIM = 96 30 | ADVANTAGE_ENC_DIM = CPC_HIDDEN_DIM//3 31 | INPUT_ENC_DIM = 32 32 | 33 | def __init__(self, 34 | actor_critic, 35 | lr, 36 | value_coef, 37 | entropy_coef, 38 | regularize_coef=0.05, 39 | state_to_input_seq=None, 40 | expert=None, 41 | il_coef=1, 42 | num_cpc_steps=10, 43 | cpc_lr=0.001): 44 | super().__init__(actor_critic, lr, value_coef, entropy_coef, expert, il_coef) 45 | self.regularize_coef = regularize_coef 46 | self.state_to_input_seq = state_to_input_seq 47 | self.num_cpc_steps = num_cpc_steps 48 | 49 | self.device = next(self.actor_critic.parameters()).device 50 | 51 | # encoder for advantages 52 | self.advantage_encoder = nn.Sequential( 53 | nn.Linear(self.ADVANTAGE_ENC_DIM, self.CPC_HIDDEN_DIM//3, bias=True), 54 | nn.LeakyReLU(inplace=True), 55 | nn.Linear(self.CPC_HIDDEN_DIM//3, 56 | self.CPC_HIDDEN_DIM//3, bias=True) 57 | ).to(self.device) 58 | 59 | # encoder for states 60 | # FIXME: hard code for 1D env 61 | self.state_encoder = nn.Sequential( 62 | nn.Linear(self.actor_critic.obs_shape[0], 63 | self.CPC_HIDDEN_DIM//3, bias=True), 64 | nn.LeakyReLU(inplace=True), 65 | nn.Linear(self.CPC_HIDDEN_DIM//3, self.CPC_HIDDEN_DIM//3) 66 | ).to(self.device) 67 | 68 | # encoder for action 69 | self.action_encoder = nn.Sequential( 70 | nn.Embedding(self.actor_critic.action_space.n, 71 | self.CPC_HIDDEN_DIM//3), 72 | nn.LeakyReLU(inplace=True), 73 | nn.Linear(self.CPC_HIDDEN_DIM//3, self.CPC_HIDDEN_DIM//3) 74 | ).to(self.device) 75 | 76 | # encoding conditions (i.e. 
advantages + states + actions) 77 | self.condition_encoder = nn.Sequential( 78 | nn.LeakyReLU(inplace=True), 79 | nn.Linear(self.CPC_HIDDEN_DIM, self.CPC_HIDDEN_DIM, bias=True), 80 | nn.LeakyReLU(inplace=True), 81 | nn.Linear(self.CPC_HIDDEN_DIM, self.CPC_HIDDEN_DIM) 82 | ).to(self.device) 83 | 84 | # input sequence encoder 85 | self.input_seq_encoder = nn.GRU( 86 | self.INPUT_ENC_DIM, self.CPC_HIDDEN_DIM, 1).to(self.device) 87 | 88 | # optimizer to learn the parameters for cpc loss 89 | self.cpc_optimizer = optim.Adam( 90 | chain( 91 | self.advantage_encoder.parameters(), 92 | self.input_seq_encoder.parameters(), 93 | self.state_encoder.parameters(), 94 | self.action_encoder.parameters(), 95 | self.condition_encoder.parameters() 96 | ), 97 | lr=cpc_lr 98 | ) 99 | 100 | self.softmax = nn.Softmax(dim=-1) 101 | self.log_softmax = nn.LogSoftmax(dim=-1) 102 | self.cpc_criterion = nn.CrossEntropyLoss() 103 | self.regularization_criterion = nn.L1Loss() 104 | 105 | self.upper_bound_clip_threshold = 1 106 | self.lower_bound_clip_threshold = 1 107 | 108 | # Initialize weights 109 | def _weights_init(m): 110 | if isinstance(m, nn.Linear): 111 | nn.init.kaiming_normal_( 112 | m.weight, mode='fan_out', nonlinearity='relu') 113 | if isinstance(m, nn.Conv1d): 114 | nn.init.kaiming_normal_( 115 | m.weight, mode='fan_out', nonlinearity='relu') 116 | elif isinstance(m, nn.BatchNorm1d): 117 | nn.init.constant_(m.weight, 1) 118 | nn.init.constant_(m.bias, 0) 119 | 120 | # initialize gru 121 | for layer_p in self.input_seq_encoder._all_weights: 122 | for p in layer_p: 123 | if 'weight' in p: 124 | nn.init.kaiming_normal_(self.input_seq_encoder.__getattr__( 125 | p), mode='fan_out', nonlinearity='relu') 126 | 127 | self.condition_encoder.apply(_weights_init) 128 | self.state_encoder.apply(_weights_init) 129 | self.action_encoder.apply(_weights_init) 130 | self.advantage_encoder.apply(_weights_init) 131 | 132 | def _encode_input_sequences(self, obs, masks): 133 | num_steps, n_processes, _ = obs.shape 134 | # obs is tensor of shape (n_steps + 1, n_processes, obs_shape) 135 | num_steps -= 1 136 | # INPUT SEQUENCES AND MASKS 137 | # the stochastic input will be defined by last 2 scalar 138 | input_seq = obs[1:, :, -2:] 139 | 140 | # transform input_seq with fourier features 141 | jobs, intervals = input_seq[:, :, 0].reshape(-1, 1), input_seq[:, :, 1].reshape(-1, 1) 142 | jobs, intervals = self.encode_fourier_features(jobs, self.INPUT_ENC_DIM//2), self.encode_fourier_features(intervals, self.INPUT_ENC_DIM//2) 143 | jobs = jobs.reshape(num_steps, n_processes, self.INPUT_ENC_DIM//2) 144 | intervals = intervals.reshape(num_steps, n_processes, self.INPUT_ENC_DIM//2) 145 | input_seq = torch.cat([jobs, intervals], dim=-1) 146 | 147 | masks = masks[1:].reshape(num_steps, n_processes) 148 | # reverse the input seq order since we want to compute from right to left 149 | input_seq = torch.flip(input_seq, [0]) 150 | masks = torch.flip(masks, [0]) 151 | # encode the input sequence 152 | # Let's figure out which steps in the sequence have a zero for any agent 153 | has_zeros = ((masks[1:-1] == 0.0) 154 | .any(dim=-1) 155 | .nonzero() 156 | .squeeze() 157 | .cpu()) 158 | 159 | # +1 to correct the masks[1:] 160 | if has_zeros.dim() == 0: 161 | # Deal with scalar 162 | has_zeros = [has_zeros.item() + 1] 163 | else: 164 | has_zeros = (has_zeros + 1).numpy().tolist() 165 | 166 | # add t=0 and t=T to the list 167 | has_zeros = [-1] + has_zeros + [num_steps - 1] 168 | 169 | outputs = [] 170 | 171 | for i in range(len(has_zeros) - 1): 
172 | # We can now process steps that don't have any zeros in masks together! 173 | # This is much faster 174 | start_idx = has_zeros[i] 175 | end_idx = has_zeros[i + 1] 176 | 177 | output, hxs = self.input_seq_encoder( 178 | input_seq[start_idx + 1: end_idx + 1], 179 | hxs * masks[start_idx].view(1, -1, 1) if start_idx > -1 else None) 180 | 181 | outputs.append(output) 182 | 183 | # x is a (T, N, -1) tensor 184 | input_seq = torch.cat(outputs, dim=0) 185 | assert len(input_seq) == num_steps 186 | # reverse back 187 | input_seq = torch.flip(input_seq, [0]) 188 | 189 | return input_seq 190 | 191 | def _encode_advantages(self, advantages): 192 | # FIXME: only compatible with 1D observation 193 | num_steps, n_processes, _ = advantages.shape 194 | # ADVANTAGES 195 | # encode 196 | # n_steps x n_process x hidden_dim/2 197 | advantages = advantages.reshape(-1, 1) 198 | advantages = self.encode_fourier_features(advantages, self.ADVANTAGE_ENC_DIM) 199 | advantages = self.advantage_encoder(advantages).reshape(num_steps, n_processes, -1) 200 | 201 | return advantages 202 | 203 | def _encode_states(self, obs): 204 | num_steps, n_processes, _ = obs.shape 205 | num_steps -= 1 206 | # STATES 207 | # encode 208 | # n_steps x n_process x hidden_dim/2 209 | states = obs[:-1] 210 | # FIXME: hard code for 1D env 211 | states_shape = states.shape[2:][0] 212 | states = self.state_encoder( 213 | states.reshape(-1, states_shape)).reshape(num_steps, n_processes, -1) 214 | 215 | return states 216 | 217 | def _encode_actions(self, actions): 218 | num_steps, n_processes, _ = actions.shape 219 | # ACTION 220 | # encode 221 | # n_steps x n_process x 1 222 | actions = self.action_encoder( 223 | actions.reshape(-1)).reshape(num_steps, n_processes, -1) 224 | 225 | return actions 226 | 227 | def _encode_conditions(self, conditions): 228 | num_steps, n_processes, hidden_dim = conditions.shape 229 | # ACTION 230 | # encode 231 | # n_steps x n_process x 1 232 | conditions = self.condition_encoder( 233 | conditions.reshape(-1, hidden_dim)).reshape(num_steps, n_processes, -1) 234 | 235 | return conditions 236 | 237 | def compute_contrastive_loss(self, obs, actions, masks, advantages): 238 | """ 239 | Contrastive Predictive Coding for learning representation and density ratio 240 | :param rollouts: Storage's instance 241 | :param advantage: tensor of shape: (timestep, n_processes, 1) 242 | """ 243 | # FIXME: only compatible with 1D observation 244 | num_steps, n_processes, _ = advantages.shape 245 | 246 | # encoded all the input 247 | encoded_input_seq = self._encode_input_sequences(obs, masks) 248 | encoded_advantages = self._encode_advantages(advantages) 249 | encoded_states = self._encode_states(obs) 250 | encoded_actions = self._encode_actions(actions) 251 | 252 | # condition = STATE + ADVANTAGE + ACTIONS 253 | conditions = torch.cat( 254 | [encoded_advantages, encoded_states, encoded_actions], dim=-1) 255 | conditions = self._encode_conditions(conditions) 256 | # reshape to n_steps x hidden_dim x n_processes 257 | encoded_input_seq = encoded_input_seq.permute(0, 2, 1) 258 | 259 | # compute nce 260 | # create label mask 261 | label = torch.tensor(torch.arange( 262 | 0, n_processes).tolist() * num_steps).to(self.device) 263 | 264 | # broadcast compute matmul 265 | f_value = torch.bmm( 266 | conditions, encoded_input_seq).reshape(-1, n_processes) 267 | 268 | # compute accuracy 269 | correct = torch.sum(torch.eq(torch.argmax( 270 | self.softmax(f_value), dim=1), label)) 271 | accuracy = correct.item()/(n_processes*num_steps) 
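        # InfoNCE view: for every timestep, f_value is an (n_processes x n_processes) score matrix
        # between the encoded conditions (advantage, state, action) and the encoded input sequences;
        # the diagonal entries are the matching "positive" pairs and the remaining entries in each
        # row act as negatives, so the cross-entropy below is the standard InfoNCE objective and
        # softmax(f_value) approximates the density ratio p(input_seq | condition) / p(input_seq).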
272 | 273 | # compute loss 274 | contrastive_loss = self.cpc_criterion(f_value, label) 275 | regularization_loss = self.regularization_criterion( 276 | self.softmax(f_value) * n_processes, torch.ones_like(f_value)) 277 | 278 | return contrastive_loss, accuracy, regularization_loss 279 | 280 | def compute_weighted_advantages(self, obs, actions, masks, advantages, n_envs=None): 281 | """ 282 | Compute return for rollout experience with trained contrastive module 283 | """ 284 | with torch.no_grad(): 285 | # FIXME: only compatible with 1D observation 286 | num_steps, batch_size, _ = advantages.shape 287 | 288 | input_seq = self._encode_input_sequences( 289 | obs, masks) 290 | encoded_advantages = self._encode_advantages(advantages) 291 | encoded_states = self._encode_states(obs) 292 | encoded_actions = self._encode_actions(actions) 293 | 294 | # condition = STATE + ADVANTAGE 295 | conditions = torch.cat( 296 | [encoded_advantages, encoded_states, encoded_actions], dim=-1) 297 | conditions = self._encode_conditions(conditions) 298 | 299 | # reshape to n_steps x hidden_dim x n_processes 300 | input_seq = input_seq.permute(0, 2, 1) 301 | 302 | # weight of each advantage score 303 | weights = torch.zeros((num_steps, n_envs if n_envs else batch_size, 1)).to( 304 | self.device) 305 | 306 | for i in range(num_steps): 307 | # n_steps x n_steps 308 | density_ratio = self.softmax( 309 | torch.mm(conditions[i], input_seq[i])) 310 | if n_envs: 311 | # N is not None => used memory for predicting weights 312 | density_ratio = density_ratio[:n_envs, :n_envs] 313 | # take the diag element 314 | density_ratio = density_ratio.diag().reshape( 315 | n_envs if n_envs else batch_size, 1) 316 | 317 | weights[i] = density_ratio 318 | 319 | weights *= batch_size 320 | weights = 1/(weights+1e-5) 321 | weights = torch.clamp( 322 | weights, 323 | self.lower_bound_clip_threshold, 324 | self.upper_bound_clip_threshold 325 | ) 326 | 327 | if random.randint(0, 9) == 0: 328 | print('weights mean: ', weights.mean()) 329 | print('weights max: ', weights.max()) 330 | print('weights min: ', weights.min()) 331 | weighted_advantages = advantages[:, :n_envs] * \ 332 | weights if n_envs else advantages*weights 333 | 334 | return weighted_advantages 335 | 336 | def update_weight_clip_threshold(self): 337 | self.upper_bound_clip_threshold = min( 338 | self.upper_bound_clip_threshold * self.WEIGHT_CLIP_GROWTH_FACTOR, 339 | self.UPPER_BOUND_CLIP_THRESHOLD 340 | ) 341 | self.lower_bound_clip_threshold = max( 342 | self.lower_bound_clip_threshold * self.WEIGHT_CLIP_DECAY_FACTOR, 343 | self.LOWER_BOUND_CLIP_THRESHOLD 344 | ) 345 | 346 | def after_update(self): 347 | super().after_update() 348 | self.update_weight_clip_threshold() 349 | 350 | def encode_fourier_features(self, x, d=10): 351 | """ 352 | Encode input with fourier features according to https://arxiv.org/abs/2006.10739 353 | :param x: torch.Tensor of shape Nx1 354 | :param d: int - encoded dimension 355 | """ 356 | if (d//2)*2-d != 0: 357 | raise ValueError("Dimension must be even number...") 358 | N = x.shape[0] 359 | 360 | position_enc = torch.zeros(N, d).to(self.device) 361 | idx = torch.arange(d//2).reshape(1, -1).to(self.device) 362 | 363 | position_enc[:, 0::2] = torch.sin(x*2**idx) 364 | position_enc[:, 1::2] = torch.cos(x*2**idx) 365 | 366 | return position_enc 367 | -------------------------------------------------------------------------------- /core/algorithms/lacie/lacie_ppo.py: -------------------------------------------------------------------------------- 1 | 
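# ---------------------------------------------------------------------------
# Illustrative sketch: a minimal, self-contained toy version of the hindsight
# re-weighting performed by LacieAlgo.compute_weighted_advantages in
# base_lacie.py. The function name and the pre-encoded inputs are hypothetical
# stand-ins for the CPC encoders defined there; this is a sketch of the idea,
# not the project's API.


def toy_hindsight_weights(conditions, input_seq, lower=0.01, upper=4.0, eps=1e-5):
    """Turn CPC scores into clamped inverse-density-ratio weights.

    conditions: (T, N, H) encoded (advantage, state, action) triples
    input_seq:  (T, N, H) encoded realized input sequences
    returns:    (T, N, 1) weights, one per advantage entry
    """
    import torch

    num_steps, num_envs, _ = conditions.shape
    weights = torch.zeros(num_steps, num_envs, 1)
    for t in range(num_steps):
        # (N, N) score matrix; row-wise softmax approximates the density ratio
        # p(input_seq | condition) / p(input_seq), as in compute_contrastive_loss
        scores = torch.softmax(conditions[t] @ input_seq[t].t(), dim=-1)
        ratio = scores.diag().reshape(num_envs, 1) * num_envs
        # entries whose outcome is strongly tied to the input process get smaller weights
        weights[t] = torch.clamp(1.0 / (ratio + eps), lower, upper)
    return weights


# Example usage with random tensors:
#     cond, seq = torch.randn(5, 4, 96), torch.randn(5, 4, 96)
#     adv = torch.randn(5, 4, 1)
#     weighted_adv = adv * toy_hindsight_weights(cond, seq)
# ---------------------------------------------------------------------------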
import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | 6 | from itertools import chain 7 | from .base_lacie import LacieAlgo 8 | 9 | 10 | class LACIE_PPO(LacieAlgo): 11 | def __init__(self, 12 | actor_critic, 13 | clip_param, 14 | ppo_epoch, 15 | num_mini_batch, 16 | value_loss_coef, 17 | entropy_coef, 18 | regularize_coef, 19 | state_to_input_seq=None, 20 | lr=None, 21 | eps=None, 22 | max_grad_norm=None, 23 | use_clipped_value_loss=True, 24 | expert=None, 25 | il_coef=1, 26 | num_cpc_steps=10, 27 | cpc_lr=1e-3): 28 | super().__init__(actor_critic=actor_critic, 29 | lr=lr, 30 | value_coef=value_loss_coef, 31 | entropy_coef=entropy_coef, 32 | regularize_coef=regularize_coef, 33 | state_to_input_seq=state_to_input_seq, 34 | expert=expert, 35 | il_coef=il_coef, 36 | num_cpc_steps=num_cpc_steps, 37 | cpc_lr=cpc_lr) 38 | 39 | self.clip_param = clip_param 40 | self.ppo_epoch = ppo_epoch 41 | self.num_mini_batch = num_mini_batch 42 | 43 | self.max_grad_norm = max_grad_norm 44 | self.use_clipped_value_loss = use_clipped_value_loss 45 | 46 | def update(self, rollouts): 47 | obs_shape = rollouts.obs.size()[2:] 48 | advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] 49 | 50 | # contrastive learning loss 51 | contrastive_loss_epoch, contrastive_accuracy_epoch = self.compute_contrastive_loss( 52 | rollouts.obs, rollouts.actions, rollouts.masks, advantages.detach()) 53 | contrastive_loss_epoch = contrastive_loss_epoch.item() 54 | 55 | # weighted advantages 56 | weighted_advantages = self.compute_weighted_advantages( 57 | rollouts.obs, rollouts.actions, rollouts.masks, advantages.detach()) 58 | weighted_advantages = (weighted_advantages - weighted_advantages.mean()) / ( 59 | weighted_advantages.std() + 1e-5) 60 | 61 | value_loss_epoch = 0 62 | action_loss_epoch = 0 63 | dist_entropy_epoch = 0 64 | imitation_loss_epoch = 0 65 | accuracy_epoch = 0 66 | 67 | for e in range(self.ppo_epoch): 68 | if self.actor_critic.is_recurrent: 69 | data_generator = rollouts.recurrent_generator( 70 | weighted_advantages, self.num_mini_batch) 71 | else: 72 | data_generator = rollouts.feed_forward_generator( 73 | weighted_advantages, self.num_mini_batch) 74 | 75 | for sample in data_generator: 76 | obs_batch, recurrent_hidden_states_batch, actions_batch, \ 77 | value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, \ 78 | adv_targ = sample 79 | 80 | # Reshape to do in a single forward pass for all steps 81 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 82 | obs_batch, recurrent_hidden_states_batch, masks_batch, 83 | actions_batch) 84 | 85 | ratio = torch.exp(action_log_probs - 86 | old_action_log_probs_batch) 87 | surr1 = ratio * adv_targ 88 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 89 | 1.0 + self.clip_param) * adv_targ 90 | action_loss = -torch.min(surr1, surr2).mean() 91 | 92 | if self.use_clipped_value_loss: 93 | value_pred_clipped = value_preds_batch + \ 94 | (values - value_preds_batch).clamp(-self.clip_param, 95 | self.clip_param) 96 | value_losses = (values - return_batch).pow(2) 97 | value_losses_clipped = ( 98 | value_pred_clipped - return_batch).pow(2) 99 | value_loss = 0.5 * torch.max(value_losses, 100 | value_losses_clipped).mean() 101 | else: 102 | value_loss = 0.5 * (return_batch - values).pow(2).mean() 103 | 104 | # imitation learning 105 | imitation_loss, accuracy = torch.tensor( 106 | 0).to(action_loss.device), 0 107 | if self.expert: 108 | imitation_loss, accuracy = 
self.imitation_learning( 109 | rollouts.obs[:-1].view(-1, *obs_shape), 110 | rollouts.recurrent_hidden_states[0].view( 111 | -1, self.actor_critic.recurrent_hidden_state_size), 112 | rollouts.masks[:-1].view(-1, 1), 113 | self.expert) 114 | 115 | # contrastive learning density ratio 116 | contrastive_loss, _ = self.compute_contrastive_loss( 117 | rollouts.obs, rollouts.actions, rollouts.masks, advantages) 118 | 119 | self.optimizer.zero_grad() 120 | self.cpc_optimizer.zero_grad() 121 | (imitation_loss * self.il_coef * self.value_coef + action_loss - 122 | dist_entropy * self.entropy_coef + contrastive_loss).backward() 123 | nn.utils.clip_grad_norm_(chain(self.actor_critic.parameters(), 124 | self.input_seq_encoder.parameters(), 125 | self.advantage_encoder.parameters(), 126 | self.state_encoder.parameters(), 127 | self.condition_encoder.parameters(), 128 | self.action_encoder.parameters()), 129 | self.max_grad_norm) 130 | self.optimizer.step() 131 | self.cpc_optimizer.step() 132 | 133 | value_loss_epoch += value_loss.item() 134 | action_loss_epoch += action_loss.item() 135 | dist_entropy_epoch += dist_entropy.item() 136 | imitation_loss_epoch += imitation_loss.item() 137 | accuracy_epoch += accuracy 138 | 139 | num_updates = self.ppo_epoch * self.num_mini_batch 140 | 141 | value_loss_epoch /= num_updates 142 | action_loss_epoch /= num_updates 143 | dist_entropy_epoch /= num_updates 144 | imitation_loss_epoch /= num_updates 145 | accuracy_epoch /= num_updates 146 | 147 | self.after_update() 148 | 149 | return { 150 | "value loss": value_loss_epoch, 151 | "action loss": action_loss_epoch, 152 | "entropy loss": dist_entropy_epoch, 153 | "imitation loss": imitation_loss_epoch, 154 | "accuracy": accuracy_epoch, 155 | "contrastive loss": contrastive_loss_epoch, 156 | "contrastive accuracy": contrastive_accuracy_epoch 157 | } 158 | 159 | 160 | class LACIE_PPO_Memory(LACIE_PPO): 161 | def __init__(self, 162 | actor_critic, 163 | clip_param, 164 | ppo_epoch, 165 | num_mini_batch, 166 | value_loss_coef, 167 | entropy_coef, 168 | regularize_coef, 169 | state_to_input_seq=None, 170 | lr=None, 171 | eps=None, 172 | max_grad_norm=None, 173 | use_clipped_value_loss=True, 174 | expert=None, 175 | il_coef=1, 176 | num_cpc_steps=10, 177 | lacie_buffer=None, 178 | lacie_batch_size=64, 179 | use_memory_to_pred_weights=False, 180 | cpc_lr=1e-3): 181 | super().__init__(actor_critic, 182 | clip_param, 183 | ppo_epoch, 184 | num_mini_batch, 185 | value_loss_coef, 186 | entropy_coef, 187 | regularize_coef, 188 | state_to_input_seq, 189 | lr, 190 | eps, 191 | max_grad_norm, 192 | use_clipped_value_loss, 193 | expert, 194 | il_coef, 195 | num_cpc_steps, 196 | cpc_lr=cpc_lr) 197 | 198 | self.lacie_buffer = lacie_buffer 199 | self.lacie_buffer_size = lacie_batch_size 200 | self.use_memory_to_pred_weights = use_memory_to_pred_weights 201 | 202 | def update(self, rollouts): 203 | obs_shape = rollouts.obs.size()[2:] 204 | advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] 205 | 206 | # update LACIE_Storage 207 | self.lacie_buffer.insert(rollouts, advantages.detach()) 208 | 209 | # contrastive learning loss 210 | contrastive_loss_epoch, contrastive_accuracy_epoch, regularize_loss_epoch = self.compute_contrastive_loss( 211 | rollouts.obs, rollouts.actions, rollouts.masks, advantages.detach()) 212 | contrastive_loss_epoch = contrastive_loss_epoch.item() 213 | regularize_loss_epoch = regularize_loss_epoch.item() 214 | 215 | # --------------------------------------------------------------------------- 216 | # 
learn cpc model for n steps 217 | 218 | for _ in range(self.num_cpc_steps): 219 | data = self.lacie_buffer.sample() 220 | obs, actions, masks, sample_advantages = data['obs'], data['actions'], data['masks'], data['advantages'] 221 | cpc_loss, _, cpc_regularize_loss = self.compute_contrastive_loss( 222 | obs, actions, masks, sample_advantages) 223 | 224 | self.cpc_optimizer.zero_grad() 225 | (cpc_loss + self.regularize_coef * cpc_regularize_loss).backward() 226 | 227 | nn.utils.clip_grad_norm_(chain(self.advantage_encoder.parameters(), 228 | self.input_seq_encoder.parameters(), 229 | self.state_encoder.parameters(), 230 | self.condition_encoder.parameters(), 231 | self.action_encoder.parameters()), 232 | self.max_grad_norm) 233 | 234 | self.cpc_optimizer.step() 235 | 236 | # weighted advantages 237 | if not self.use_memory_to_pred_weights: 238 | weighted_advantages = self.compute_weighted_advantages( 239 | rollouts.obs, rollouts.actions, rollouts.masks, advantages.detach()) 240 | else: 241 | data = self.lacie_buffer.sample_most_recent() 242 | obs, actions, masks, sample_advantages = data['obs'], data[ 243 | 'actions'], data['masks'], data['advantages'] 244 | weighted_advantages = self.compute_weighted_advantages( 245 | obs, actions, masks, sample_advantages, rollouts.actions.shape[1]) 246 | # normalize advantages 247 | # TODO: Conduct Ablation Study to verify if we should normalize the advantages or not 248 | weighted_advantages = (weighted_advantages - weighted_advantages.mean()) / ( 249 | weighted_advantages.std() + 1e-5) 250 | 251 | # --------------------------------------------------------------------------- 252 | # learn actor and critic 253 | 254 | value_loss_epoch = 0 255 | action_loss_epoch = 0 256 | dist_entropy_epoch = 0 257 | imitation_loss_epoch = 0 258 | accuracy_epoch = 0 259 | 260 | for e in range(self.ppo_epoch): 261 | if self.actor_critic.is_recurrent: 262 | data_generator = rollouts.recurrent_generator( 263 | weighted_advantages, self.num_mini_batch) 264 | else: 265 | data_generator = rollouts.feed_forward_generator( 266 | weighted_advantages, self.num_mini_batch) 267 | 268 | for sample in data_generator: 269 | obs_batch, recurrent_hidden_states_batch, actions_batch, \ 270 | value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, \ 271 | adv_targ = sample 272 | 273 | # Reshape to do in a single forward pass for all steps 274 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 275 | obs_batch, recurrent_hidden_states_batch, masks_batch, 276 | actions_batch) 277 | 278 | ratio = torch.exp(action_log_probs - 279 | old_action_log_probs_batch) 280 | surr1 = ratio * adv_targ 281 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 282 | 1.0 + self.clip_param) * adv_targ 283 | action_loss = -torch.min(surr1, surr2).mean() 284 | 285 | if self.use_clipped_value_loss: 286 | value_pred_clipped = value_preds_batch + \ 287 | (values - value_preds_batch).clamp(-self.clip_param, 288 | self.clip_param) 289 | value_losses = (values - return_batch).pow(2) 290 | value_losses_clipped = ( 291 | value_pred_clipped - return_batch).pow(2) 292 | value_loss = 0.5 * torch.max(value_losses, 293 | value_losses_clipped).mean() 294 | else: 295 | value_loss = 0.5 * (return_batch - values).pow(2).mean() 296 | 297 | # imitation learning 298 | imitation_loss, accuracy = torch.tensor( 299 | 0).to(action_loss.device), 0 300 | if self.expert: 301 | imitation_loss, accuracy = self.imitation_learning( 302 | rollouts.obs[:-1].view(-1, *obs_shape), 303 | 
rollouts.recurrent_hidden_states[0].view( 304 | -1, self.actor_critic.recurrent_hidden_state_size), 305 | rollouts.masks[:-1].view(-1, 1), 306 | self.expert) 307 | 308 | self.optimizer.zero_grad() 309 | (imitation_loss * self.il_coef * self.value_coef + action_loss - 310 | dist_entropy * self.entropy_coef).backward() 311 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 312 | self.max_grad_norm) 313 | self.optimizer.step() 314 | 315 | value_loss_epoch += value_loss.item() 316 | action_loss_epoch += action_loss.item() 317 | dist_entropy_epoch += dist_entropy.item() 318 | imitation_loss_epoch += imitation_loss.item() 319 | accuracy_epoch += accuracy 320 | 321 | num_updates = self.ppo_epoch * self.num_mini_batch 322 | 323 | value_loss_epoch /= num_updates 324 | action_loss_epoch /= num_updates 325 | dist_entropy_epoch /= num_updates 326 | imitation_loss_epoch /= num_updates 327 | accuracy_epoch /= num_updates 328 | 329 | self.after_update() 330 | 331 | return { 332 | "value loss": value_loss_epoch, 333 | "action loss": action_loss_epoch, 334 | "entropy loss": dist_entropy_epoch, 335 | "imitation loss": imitation_loss_epoch, 336 | "accuracy": accuracy_epoch, 337 | "contrastive loss": contrastive_loss_epoch, 338 | "contrastive accuracy": contrastive_accuracy_epoch, 339 | "regularization loss": regularize_loss_epoch 340 | } 341 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import numpy as np 4 | import torch 5 | import os.path as osp 6 | 7 | from collections import deque 8 | from core import algorithms, utils 9 | from core.agents import Policy 10 | from core.agents.heuristic.load_balance import ShortestProcessingTimeAgent, \ 11 | EarliestCompletionTimeAgent, LeastWorkAgent 12 | from core.arguments import get_args 13 | from core.envs import make_vec_envs 14 | from core.storage import RolloutStorage, LacieStorage 15 | from evaluation import evaluate 16 | from tensorboardX import SummaryWriter 17 | from utils.plot import plot 18 | 19 | 20 | def main(): 21 | args = get_args() 22 | 23 | torch.manual_seed(args.seed) 24 | torch.cuda.manual_seed_all(args.seed) 25 | 26 | if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: 27 | torch.backends.cudnn.benchmark = False 28 | torch.backends.cudnn.deterministic = True 29 | 30 | base_dir = osp.expanduser(args.log_dir) 31 | log_dir = osp.join(base_dir, 'train_log') 32 | eval_log_dir = osp.join(base_dir, "eval_log") 33 | tensorboard_dir = osp.join(base_dir, "tensorboard_log") 34 | 35 | utils.cleanup_log_dir(log_dir) 36 | utils.cleanup_log_dir(eval_log_dir) 37 | utils.cleanup_log_dir(tensorboard_dir) 38 | utils.dump_config(args, osp.join(base_dir, 'config.txt')) 39 | 40 | torch.set_num_threads(1) 41 | device = torch.device("cuda:0" if args.cuda else "cpu") 42 | writer = SummaryWriter(tensorboard_dir) 43 | 44 | # limited the number of steps for each episode 45 | # IMPORTANT: for load balance / spark-sim we automatically do this by setting 46 | # the number of stream jobs 47 | if not args.use_proper_time_limits: 48 | envs = make_vec_envs(env_name=args.env_name, 49 | seed=args.seed, 50 | num_processes=args.num_processes, 51 | log_dir=log_dir, 52 | device=device, 53 | allow_early_resets=False, 54 | args=args) 55 | else: 56 | envs = make_vec_envs(env_name=args.env_name, 57 | seed=args.seed, 58 | num_processes=args.num_processes, 59 | log_dir=log_dir, 60 | device=device, 61 | 
allow_early_resets=True, 62 | max_episode_steps=args.max_episode_steps, 63 | args=args) 64 | 65 | # create actor critic 66 | actor_critic = Policy( 67 | envs.observation_space.shape, 68 | envs.action_space, 69 | base_kwargs={'recurrent': args.recurrent_policy}) 70 | # if the resume directory is provided, then directly load that checkpoint 71 | if args.resume_dir is not None: 72 | print("=> Resuming from checkpoint: {}".format(args.resume_dir)) 73 | actor_critic = torch.load(args.resume_dir, map_location='cpu')[0] 74 | actor_critic.to(device) 75 | 76 | # expert for imitation learning 77 | if args.use_imitation_learning: 78 | expert = LeastWorkAgent() 79 | else: 80 | expert = None 81 | 82 | if args.algo == 'a2c': 83 | agent = algorithms.A2C_ACKTR( 84 | actor_critic, 85 | args.value_loss_coef, 86 | args.entropy_coef, 87 | lr=args.lr, 88 | eps=args.eps, 89 | alpha=args.alpha, 90 | max_grad_norm=args.max_grad_norm, 91 | expert=expert, 92 | il_coef=args.il_coef) 93 | elif args.algo == 'ppo': 94 | agent = algorithms.PPO( 95 | actor_critic, 96 | args.clip_param, 97 | args.ppo_epoch, 98 | args.num_mini_batch, 99 | args.value_loss_coef, 100 | args.entropy_coef, 101 | lr=args.lr, 102 | eps=args.eps, 103 | max_grad_norm=args.max_grad_norm, 104 | expert=expert, 105 | il_coef=args.il_coef) 106 | elif args.algo == 'acktr': 107 | agent = algorithms.A2C_ACKTR( 108 | actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) 109 | elif args.algo == 'mib_a2c': 110 | agent = algorithms.MIB_A2C( 111 | actor_critic, 112 | args.entropy_coef, 113 | lr=args.lr, 114 | adapt_lr=args.adapt_lr, 115 | num_inner_steps=args.num_inner_steps, 116 | max_grad_norm=args.max_grad_norm, 117 | expert=expert, 118 | il_coef=args.il_coef 119 | ) 120 | elif args.algo == 'mib_ppo': 121 | agent = algorithms.MIB_PPO( 122 | actor_critic=actor_critic, 123 | clip_param=args.clip_param, 124 | ppo_epoch=args.ppo_epoch, 125 | num_mini_batch=args.num_mini_batch, 126 | entropy_coef=args.entropy_coef, 127 | lr=args.lr, 128 | adapt_lr=args.adapt_lr, 129 | num_inner_steps=args.num_inner_steps, 130 | max_grad_norm=args.max_grad_norm, 131 | expert=expert, 132 | il_coef=args.il_coef 133 | ) 134 | elif args.algo == 'lacie_a2c': 135 | agent = algorithms.LACIE_A2C( 136 | actor_critic=actor_critic, 137 | value_coef=args.value_loss_coef, 138 | entropy_coef=args.entropy_coef, 139 | regularize_coef=args.regularize_coef, 140 | lr=args.lr, 141 | eps=args.eps, 142 | alpha=args.alpha, 143 | max_grad_norm=args.max_grad_norm, 144 | expert=expert, 145 | il_coef=args.il_coef, 146 | num_cpc_steps=args.lacie_num_iter, 147 | cpc_lr=args.cpc_lr 148 | ) 149 | elif args.algo == 'lacie_a2c_memory': 150 | lacie_buffer = LacieStorage(args.num_steps, 151 | envs.observation_space.shape, 152 | envs.action_space, 153 | max_size=args.lacie_buffer_size, 154 | batch_size=args.lacie_batch_size, 155 | n_processes=args.num_processes) 156 | lacie_buffer.to(device) 157 | agent = algorithms.LACIE_A2C_Memory( 158 | actor_critic=actor_critic, 159 | value_coef=args.value_loss_coef, 160 | entropy_coef=args.entropy_coef, 161 | regularize_coef=args.regularize_coef, 162 | lr=args.lr, 163 | eps=args.eps, 164 | alpha=args.alpha, 165 | max_grad_norm=args.max_grad_norm, 166 | expert=expert, 167 | il_coef=args.il_coef, 168 | num_cpc_steps=args.lacie_num_iter, 169 | lacie_batch_size=args.lacie_batch_size, 170 | lacie_buffer=lacie_buffer, 171 | use_memory_to_pred_weights=args.use_memory_to_pred_weights, 172 | cpc_lr=args.cpc_lr 173 | ) 174 | elif args.algo == 'lacie_ppo': 175 | agent = 
algorithms.LACIE_PPO( 176 | actor_critic, 177 | args.clip_param, 178 | args.ppo_epoch, 179 | args.num_mini_batch, 180 | args.value_loss_coef, 181 | args.entropy_coef, 182 | regularize_coef=args.regularize_coef, 183 | lr=args.lr, 184 | eps=args.eps, 185 | max_grad_norm=args.max_grad_norm, 186 | expert=expert, 187 | il_coef=args.il_coef, 188 | cpc_lr=args.cpc_lr) 189 | elif args.algo == 'lacie_ppo_memory': 190 | lacie_buffer = LacieStorage(args.num_steps, 191 | envs.observation_space.shape, 192 | envs.action_space, 193 | max_size=args.lacie_buffer_size, 194 | batch_size=args.lacie_batch_size, 195 | n_processes=args.num_processes) 196 | lacie_buffer.to(device) 197 | agent = algorithms.LACIE_PPO_Memory( 198 | actor_critic, 199 | args.clip_param, 200 | args.ppo_epoch, 201 | args.num_mini_batch, 202 | args.value_loss_coef, 203 | args.entropy_coef, 204 | regularize_coef=args.regularize_coef, 205 | lr=args.lr, 206 | eps=args.eps, 207 | max_grad_norm=args.max_grad_norm, 208 | expert=expert, 209 | il_coef=args.il_coef, 210 | num_cpc_steps=args.lacie_num_iter, 211 | lacie_batch_size=args.lacie_batch_size, 212 | lacie_buffer=lacie_buffer, 213 | use_memory_to_pred_weights=args.use_memory_to_pred_weights, 214 | cpc_lr=args.cpc_lr 215 | ) 216 | else: 217 | raise ValueError("Not Implemented algorithm...") 218 | 219 | rollouts = RolloutStorage(args.num_steps, args.num_processes, 220 | envs.observation_space.shape, envs.action_space, 221 | actor_critic.recurrent_hidden_state_size) 222 | 223 | obs = envs.reset() 224 | rollouts.obs[0].copy_(obs) 225 | rollouts.to(device) 226 | 227 | episode_rewards = deque(maxlen=10) 228 | 229 | start = time.time() 230 | 231 | num_updates = int( 232 | args.num_env_steps) // args.num_steps // args.num_processes 233 | 234 | # the gradient update interval to increase number of stream jobs 235 | curriculum_interval = int(num_updates / args.num_curriculum_time) 236 | 237 | for j in range(num_updates): 238 | random_seed = args.seed if args.fix_job_sequence else args.seed + j 239 | # if using load_balance environment: \ 240 | # we have to gradually increase number of stream jos 241 | # if (args.env_name == 'load_balance') and ((j + 1) % curriculum_interval) == 0: 242 | # args.num_stream_jobs = int( 243 | # args.num_stream_jobs * args.num_stream_jobs_factor) 244 | 245 | # # reconstruct environments to increase the number of stream jobs 246 | # # also alter the random seed 247 | # if not args.use_proper_time_limits: 248 | # envs = make_vec_envs(env_name=args.env_name, 249 | # seed=random_seed, 250 | # num_processes=args.num_processes, 251 | # log_dir=log_dir, 252 | # device=device, 253 | # allow_early_resets=False, 254 | # args=args) 255 | # else: 256 | # envs = make_vec_envs(env_name=args.env_name, 257 | # seed=random_seed, 258 | # num_processes=args.num_processes, 259 | # log_dir=log_dir, 260 | # device=device, 261 | # allow_early_resets=True, 262 | # max_episode_steps=args.max_episode_steps, 263 | # args=args) 264 | 265 | # print("Increase the number of stream jobs to " + 266 | # str(args.num_stream_jobs)) 267 | # obs = envs.reset() 268 | # rollouts.obs[0].copy_(obs) 269 | # rollouts.to(device) 270 | 271 | # decrease learning rate linearly 272 | if args.use_linear_lr_decay: 273 | cur_lr = utils.update_linear_schedule( 274 | agent.optimizer, j, num_updates, 275 | agent.optimizer.lr if args.algo == "acktr" else args.lr) 276 | if args.algo.startswith('lacie'): 277 | cur_lr = utils.update_linear_schedule( 278 | agent.cpc_optimizer, j, num_updates, args.cpc_lr 279 | ) 280 | else: 281 | 
cur_lr = agent.optimizer.param_groups[0]["lr"] 282 | 283 | # Rolling out, collecting and storing SARS (State, action, reward, new state) 284 | for step in range(args.num_steps): 285 | # Sample actions 286 | with torch.no_grad(): 287 | value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( 288 | rollouts.obs[step], rollouts.recurrent_hidden_states[step], 289 | rollouts.masks[step]) 290 | 291 | # Obser reward and next obs 292 | # TODO: park env does not support cuda tensor??? 293 | obs, reward, done, infos = envs.step(action.cpu()) 294 | for info in infos: 295 | if 'episode' in info.keys(): 296 | episode_rewards.append(info['episode']['r']) 297 | 298 | # If done then clean the history of observations. 299 | masks = torch.FloatTensor( 300 | [[0.0] if done_ else [1.0] for done_ in done]) 301 | bad_masks = torch.FloatTensor( 302 | [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) 303 | rollouts.insert(obs, recurrent_hidden_states, action, 304 | action_log_prob, value, reward, masks, bad_masks) 305 | 306 | with torch.no_grad(): 307 | next_value = actor_critic.get_value( 308 | rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], 309 | rollouts.masks[-1]).detach() 310 | 311 | rollouts.compute_returns(next_value, args.use_gae, args.gamma, 312 | args.gae_lambda, args.use_proper_time_limits) 313 | 314 | results = agent.update(rollouts) 315 | 316 | rollouts.after_update() 317 | 318 | # SAVE trained model 319 | if (j % args.save_interval == 0 320 | or j == num_updates - 1) and args.save_dir != "": 321 | save_path = os.path.join(args.save_dir, args.algo) 322 | try: 323 | os.makedirs(save_path) 324 | except OSError: 325 | pass 326 | 327 | torch.save([ 328 | actor_critic, 329 | getattr(utils.get_vec_normalize(envs), 'ob_rms', None) 330 | ], os.path.join(save_path, args.env_name + ".pt")) 331 | 332 | # LOG TRAINING results 333 | if j % args.log_interval == 0 and len(episode_rewards) > 1: 334 | total_num_steps = (j + 1) * args.num_processes * args.num_steps 335 | end = time.time() 336 | print("="*90) 337 | print("Updates {}, num timesteps {}, FPS {}, LR: {}" 338 | "\n=> Last {} training episodes: mean/median reward " 339 | "{:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}".format( 340 | j, total_num_steps, 341 | int(total_num_steps / (end - start)), 342 | cur_lr, 343 | len(episode_rewards), np.mean(episode_rewards), 344 | np.median(episode_rewards), np.min(episode_rewards), 345 | np.max(episode_rewards))) 346 | result_str = "=> " 347 | for k, v in results.items(): 348 | result_str = result_str + "{}: {:.2f} ".format(k, v) 349 | print(result_str) 350 | 351 | writer.add_scalar("train/reward", np.mean(episode_rewards), j) 352 | for k, v in results.items(): 353 | writer.add_scalar("train/"+k.replace(' ', '_'), v, j) 354 | 355 | plot(log_dir, 'load-balance', args.algo, 356 | args.num_env_steps) 357 | 358 | # EVALUATE performance of learned policy along with heuristic 359 | if (args.eval_interval is not None and len(episode_rewards) > 1 360 | and j % args.eval_interval == 0): 361 | # alter the random seed 362 | eval_results = evaluate(actor_critic, args.env_name, seed=args.seed, 363 | num_processes=args.num_processes, eval_log_dir=eval_log_dir, 364 | device=device, env_args=args) 365 | writer.add_scalars( 366 | 'eval/reward', 367 | {k: np.mean(v) for k, v in eval_results.items()}, 368 | j) 369 | # plot(eval_log_dir, 'load-balance', args.algo, 370 | # args.num_env_steps) 371 | 372 | writer.close() 373 | 374 | 375 | if __name__ == "__main__": 376 | main() 377 | 
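# ---------------------------------------------------------------------------
# A minimal sketch of the linear learning-rate decay invoked in the training
# loop above, assuming `utils.update_linear_schedule(optimizer, j, num_updates,
# initial_lr)` simply anneals the learning rate to zero over `num_updates`
# gradient updates and returns the current value. The authoritative helper
# lives in core/utils.py and may differ in detail.

def update_linear_schedule_sketch(optimizer, update_idx, total_num_updates, initial_lr):
    """Decay the learning rate linearly from `initial_lr` towards 0."""
    lr = initial_lr - (initial_lr * (update_idx / float(total_num_updates)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr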
--------------------------------------------------------------------------------
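Both `LACIE_PPO.update` and `LACIE_PPO_Memory.update` optimise the standard PPO clipped surrogate on contrastively re-weighted advantages, together with a clipped value-regression term. The snippet below is an illustrative, self-contained condensation of that per-minibatch computation; the tensor arguments and the default clip value are assumptions for the example, and the authoritative version is the code in core/algorithms/lacie/lacie_ppo.py.

import torch


def ppo_clipped_losses(action_log_probs, old_action_log_probs, adv_targ,
                       values, value_preds, returns, clip_param=0.2):
    """Clipped policy and value losses, mirroring the inner minibatch loop."""
    # Importance ratio between the current policy and the policy that collected the rollout.
    ratio = torch.exp(action_log_probs - old_action_log_probs)
    # Clipped surrogate objective; the leading minus sign turns maximisation into a loss.
    surr1 = ratio * adv_targ
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv_targ
    action_loss = -torch.min(surr1, surr2).mean()
    # Clipped value loss, as used when use_clipped_value_loss is True.
    value_pred_clipped = value_preds + (values - value_preds).clamp(-clip_param, clip_param)
    value_loss = 0.5 * torch.max((values - returns).pow(2),
                                 (value_pred_clipped - returns).pow(2)).mean()
    return action_loss, value_loss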