├── core ├── __init__.py ├── agents │ ├── heuristic │ │ ├── __init__.py │ │ └── load_balance │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── random_allocate.py │ │ │ ├── least_work.py │ │ │ ├── earliest_completion_time.py │ │ │ └── shortest_processing_time.py │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ ├── cnn_base.py │ │ ├── mlp_base.py │ │ ├── bncnn.py │ │ └── base.py │ └── pg.py ├── storage │ ├── __init__.py │ ├── lacie_storage.py │ └── base_storage.py ├── algorithms │ ├── lacie │ │ ├── __init__.py │ │ ├── lacie_a2c.py │ │ ├── base_lacie.py │ │ └── lacie_ppo.py │ ├── __init__.py │ ├── input_dependent_baseline │ │ ├── __init__.py │ │ ├── mib_a2c.py │ │ ├── mib_ppo.py │ │ └── base_meta_critic.py │ ├── base_algo.py │ ├── a2c_acktr.py │ ├── ppo.py │ └── kfac.py ├── envs │ ├── __init__.py │ ├── load_balance_wrappers.py │ └── park_envs.py ├── utils.py ├── distributions.py └── arguments.py ├── utils ├── __init__.py └── plot.py ├── _config.yml ├── assets └── logo.png ├── scripts ├── grid_search │ ├── ppo │ │ ├── lacie │ │ │ ├── run.sh │ │ │ ├── config_1.sh │ │ │ ├── config_2.sh │ │ │ ├── config_3.sh │ │ │ ├── config_6.sh │ │ │ ├── config_7.sh │ │ │ ├── config_4.sh │ │ │ └── config_5.sh │ │ └── vanilla │ │ │ ├── run.sh │ │ │ ├── config_7.sh │ │ │ ├── config_3.sh │ │ │ ├── config_4.sh │ │ │ ├── config_8.sh │ │ │ ├── config_2.sh │ │ │ ├── config_5.sh │ │ │ ├── config_1.sh │ │ │ └── config_6.sh │ └── a2c │ │ ├── vanilla │ │ ├── config_2.sh │ │ ├── config_1.sh │ │ ├── config_3.sh │ │ └── config_4.sh │ │ └── lacie │ │ ├── config_1.sh │ │ ├── config_2.sh │ │ ├── config_3.sh │ │ ├── config_4.sh │ │ ├── config_5.sh │ │ ├── config_6.sh │ │ └── config_7.sh ├── lacie_a2c_load_balance.sh └── lacie_ppo_load_balance.sh ├── README.md ├── .gitignore ├── evaluation.py ├── LICENSE └── main.py /core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /core/agents/heuristic/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /core/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from .pg import Policy 2 | -------------------------------------------------------------------------------- /core/storage/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_storage import RolloutStorage 2 | from .lacie_storage import LacieStorage 3 | -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lehduong/Job-Scheduling-with-Reinforcement-Learning/HEAD/assets/logo.png -------------------------------------------------------------------------------- /core/algorithms/lacie/__init__.py: -------------------------------------------------------------------------------- 1 | from .lacie_a2c import LACIE_A2C, LACIE_A2C_Memory 2 
| from .lacie_ppo import LACIE_PPO, LACIE_PPO_Memory 3 | -------------------------------------------------------------------------------- /core/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from .park_envs import make_env 2 | from .park_envs import make_vec_envs 3 | from .park_envs import PARK_ENV_LIST 4 | -------------------------------------------------------------------------------- /core/agents/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import NNBase 2 | from .cnn_base import CNNBase 3 | from .mlp_base import MLPBase 4 | from .bncnn import BNCNN 5 | -------------------------------------------------------------------------------- /core/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | from .a2c_acktr import A2C_ACKTR 2 | from .ppo import PPO 3 | from .input_dependent_baseline import MIB_A2C, MIB_PPO 4 | from .lacie import LACIE_A2C, LACIE_PPO, LACIE_A2C_Memory, LACIE_PPO_Memory 5 | -------------------------------------------------------------------------------- /core/agents/heuristic/load_balance/__init__.py: -------------------------------------------------------------------------------- 1 | from .least_work import LeastWorkAgent 2 | from .shortest_processing_time import ShortestProcessingTimeAgent 3 | from .random_allocate import RandomAllocateAgent 4 | from .earliest_completion_time import EarliestCompletionTimeAgent 5 | -------------------------------------------------------------------------------- /core/algorithms/input_dependent_baseline/__init__.py: -------------------------------------------------------------------------------- 1 | from core.algorithms.input_dependent_baseline.mib_a2c import MIB_A2C 2 | from core.algorithms.input_dependent_baseline.mib_ppo import MIB_PPO 3 | from core.algorithms.input_dependent_baseline.base_meta_critic import ActorMetaCriticAlgo 4 | -------------------------------------------------------------------------------- /core/agents/heuristic/load_balance/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | 4 | class HeuristicAgent(ABC): 5 | def __init__(self): 6 | pass 7 | 8 | def act(self, states): 9 | """ 10 | Give actions for given states 11 | :param states: torch tensor of shape num_envs x (num_servers+1) 12 | """ 13 | pass 14 | -------------------------------------------------------------------------------- /core/agents/heuristic/load_balance/random_allocate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .base import HeuristicAgent 4 | 5 | 6 | class RandomAllocateAgent(HeuristicAgent): 7 | def act(self, states): 8 | num_env = states.shape[0] 9 | num_servers = states.shape[1] - 2 10 | 11 | return torch.randint(0, num_servers, (num_env, 1)).to(states.device) 12 | -------------------------------------------------------------------------------- /scripts/grid_search/ppo/lacie/run.sh: -------------------------------------------------------------------------------- 1 | bash scripts/grid_search/ppo/lacie/config_1.sh && \ 2 | bash scripts/grid_search/ppo/lacie/config_2.sh && \ 3 | bash scripts/grid_search/ppo/lacie/config_3.sh && \ 4 | bash scripts/grid_search/ppo/lacie/config_4.sh && \ 5 | bash scripts/grid_search/ppo/lacie/config_5.sh && \ 6 | bash scripts/grid_search/ppo/lacie/config_6.sh && \ 7 | bash 
scripts/grid_search/ppo/lacie/config_7.sh -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/run.sh: -------------------------------------------------------------------------------- 1 | bash scripts/grid_search/ppo/vanilla/config_1.sh && \ 2 | bash scripts/grid_search/ppo/vanilla/config_2.sh && \ 3 | bash scripts/grid_search/ppo/vanilla/config_3.sh && \ 4 | bash scripts/grid_search/ppo/vanilla/config_4.sh && \ 5 | bash scripts/grid_search/ppo/vanilla/config_5.sh && \ 6 | bash scripts/grid_search/ppo/vanilla/config_6.sh && \ 7 | bash scripts/grid_search/ppo/vanilla/config_7.sh && \ 8 | bash scripts/grid_search/ppo/vanilla/config_8.sh -------------------------------------------------------------------------------- /core/agents/heuristic/load_balance/least_work.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from .base import HeuristicAgent 5 | 6 | 7 | class LeastWorkAgent(HeuristicAgent): 8 | def act(self, states): 9 | """ 10 | Give actions for given states 11 | :param states: torch tensor of shape num_envs x (num_servers+1) 12 | :return: np.array of shape num_env x 1 13 | """ 14 | return torch.argmin(states[:, :-2], dim=1, keepdims=True).to(states.device) 15 | -------------------------------------------------------------------------------- /core/agents/heuristic/load_balance/earliest_completion_time.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .shortest_processing_time import ShortestProcessingTimeAgent 4 | 5 | 6 | class EarliestCompletionTimeAgent(ShortestProcessingTimeAgent): 7 | def act(self, states): 8 | """ 9 | Give actions for given states 10 | :param states: torch tensor of shape num_envs x (num_servers+1) 11 | :return: np.array of shape num_env x 1 12 | """ 13 | processing_time = states[:, - 14 | 2].reshape(-1, 1) / self.service_rates.to(states.device) 15 | completion_time = states[:, :-2] + processing_time 16 | 17 | return torch.argmin(completion_time, dim=1, keepdims=True) 18 | -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/config_7.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo ppo \ 4 | --clip-param 0.3\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.00025\ 13 | --num-mini-batch 4\ 14 | --num-process 16 --num-steps 1000 --log-interval 5 \ 15 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 16 | --log-dir ppo_7 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/config_3.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo ppo \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0005\ 13 | --num-mini-batch 4\ 14 | 
--num-process 16 --num-steps 1000 --log-interval 5 \ 15 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 16 | --log-dir ppo_3 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/config_4.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo ppo \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0001\ 13 | --num-mini-batch 4\ 14 | --num-process 16 --num-steps 1000 --log-interval 5 \ 15 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 16 | --log-dir ppo_4 -------------------------------------------------------------------------------- /core/agents/heuristic/load_balance/shortest_processing_time.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from .base import HeuristicAgent 5 | 6 | 7 | class ShortestProcessingTimeAgent(HeuristicAgent): 8 | def __init__(self, service_rates=[0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95, 1.05]): 9 | self.service_rates = torch.tensor(service_rates).reshape(1, -1) 10 | 11 | def act(self, states): 12 | """ 13 | Give actions for given states 14 | :param states: torch tensor of shape num_envs x (num_servers+1) 15 | :return: np.array of shape num_env x 1 16 | """ 17 | processing_time = states[:, - 18 | 2].reshape(-1, 1) / self.service_rates.to(states.device) 19 | 20 | return torch.argmin(processing_time, dim=1, keepdims=True) 21 | -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/config_8.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo ppo \ 4 | --clip-param 0.2\ 5 | --ppo-epoch 8\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.000125\ 13 | --num-mini-batch 4\ 14 | --num-process 16 --num-steps 1000 --log-interval 5 \ 15 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 16 | --log-dir ppo_8 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/config_2.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo ppo \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 8\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.000125\ 13 | --num-mini-batch 4\ 14 | --num-process 16 --num-steps 1000 --log-interval 5 \ 15 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 16 | --log-dir ppo_2 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/config_5.sh: 
-------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo ppo \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.001\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.00025\ 13 | --num-mini-batch 4\ 14 | --num-process 16 --num-steps 1000 --log-interval 5 \ 15 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 16 | --log-dir ppo_5 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/config_1.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo ppo \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.00025\ 13 | --num-mini-batch 4\ 14 | --num-process 16 --num-steps 1000 --log-interval 5 \ 15 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 16 | --log-dir ppo_1 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/vanilla/config_6.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo ppo \ 4 | --clip-param 0.2\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.00025\ 13 | --num-mini-batch 4\ 14 | --num-process 16 --num-steps 1000 --log-interval 5 \ 15 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 16 | --log-dir ppo_6 -------------------------------------------------------------------------------- /scripts/grid_search/a2c/vanilla/config_2.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo a2c \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.00075\ 11 | --num-mini-batch 32\ 12 | --adapt-lr 1e-3\ 13 | --num-inner-steps 5\ 14 | --lacie-batch-size 64\ 15 | --lacie-buffer-size 400\ 16 | --lacie-num-iter 40\ 17 | --num-process 16 --num-steps 1000 --log-interval 5 \ 18 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 19 | --log-dir a2c/vanilla/config_2 -------------------------------------------------------------------------------- /scripts/grid_search/a2c/vanilla/config_1.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo a2c \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 
0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.001\ 11 | --num-mini-batch 32\ 12 | --adapt-lr 1e-3\ 13 | --num-inner-steps 5\ 14 | --lacie-batch-size 64\ 15 | --lacie-buffer-size 400\ 16 | --lacie-num-iter 40\ 17 | --num-process 16 --num-steps 1000 --log-interval 5 \ 18 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 19 | --log-dir a2c/vanilla/config_1 20 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/vanilla/config_3.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo a2c \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.0005\ 11 | --num-mini-batch 32\ 12 | --adapt-lr 1e-3\ 13 | --num-inner-steps 5\ 14 | --lacie-batch-size 64\ 15 | --lacie-buffer-size 400\ 16 | --lacie-num-iter 40\ 17 | --num-process 16 --num-steps 1000 --log-interval 5 \ 18 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 19 | --log-dir a2c/vanilla/config_3 20 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/vanilla/config_4.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo a2c \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.002\ 11 | --num-mini-batch 32\ 12 | --adapt-lr 1e-3\ 13 | --num-inner-steps 5\ 14 | --lacie-batch-size 64\ 15 | --lacie-buffer-size 400\ 16 | --lacie-num-iter 40\ 17 | --num-process 16 --num-steps 1000 --log-interval 5 \ 18 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 19 | --log-dir a2c/vanilla/config_4 20 | -------------------------------------------------------------------------------- /scripts/lacie_a2c_load_balance.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_a2c_memory \ 4 | --num-env-steps 200000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.001\ 11 | --num-mini-batch 32\ 12 | --eval-interval 100\ 13 | --adapt-lr 1e-3\ 14 | --num-inner-steps 5\ 15 | --lacie-batch-size 64\ 16 | --lacie-buffer-size 400\ 17 | --lacie-num-iter 40\ 18 | --num-process 16 --num-steps 1000 --log-interval 5 \ 19 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 20 | --log-dir lacie_a2c 21 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/lacie/config_1.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_a2c_memory \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 
0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.001\ 11 | --cpc-lr 0.001\ 12 | --num-mini-batch 32\ 13 | --adapt-lr 1e-3\ 14 | --num-inner-steps 5\ 15 | --lacie-batch-size 64\ 16 | --lacie-buffer-size 400\ 17 | --lacie-num-iter 40\ 18 | --num-process 16 --num-steps 1000 --log-interval 5 \ 19 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 20 | --log-dir lacie_a2c_1 21 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/lacie/config_2.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_a2c_memory \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.00075\ 11 | --cpc-lr 0.001\ 12 | --num-mini-batch 32\ 13 | --adapt-lr 1e-3\ 14 | --num-inner-steps 5\ 15 | --lacie-batch-size 64\ 16 | --lacie-buffer-size 400\ 17 | --lacie-num-iter 40\ 18 | --num-process 16 --num-steps 1000 --log-interval 5 \ 19 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 20 | --log-dir lacie_a2c_2 21 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/lacie/config_3.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_a2c_memory \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.0005\ 11 | --cpc-lr 0.001\ 12 | --num-mini-batch 32\ 13 | --adapt-lr 1e-3\ 14 | --num-inner-steps 5\ 15 | --lacie-batch-size 64\ 16 | --lacie-buffer-size 400\ 17 | --lacie-num-iter 40\ 18 | --num-process 16 --num-steps 1000 --log-interval 5 \ 19 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 20 | --log-dir lacie_a2c_3 21 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/lacie/config_4.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_a2c_memory \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.002\ 11 | --cpc-lr 0.001\ 12 | --num-mini-batch 32\ 13 | --adapt-lr 1e-3\ 14 | --num-inner-steps 5\ 15 | --lacie-batch-size 64\ 16 | --lacie-buffer-size 400\ 17 | --lacie-num-iter 40\ 18 | --num-process 16 --num-steps 1000 --log-interval 5 \ 19 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 20 | --log-dir lacie_a2c_4 21 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/lacie/config_5.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_a2c_memory \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | 
--regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.001\ 11 | --cpc-lr 0.00075\ 12 | --num-mini-batch 32\ 13 | --adapt-lr 1e-3\ 14 | --num-inner-steps 5\ 15 | --lacie-batch-size 64\ 16 | --lacie-buffer-size 400\ 17 | --lacie-num-iter 40\ 18 | --num-process 16 --num-steps 1000 --log-interval 5 \ 19 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 20 | --log-dir lacie_a2c_5 21 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/lacie/config_6.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_a2c_memory \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.001\ 11 | --cpc-lr 0.002\ 12 | --num-mini-batch 32\ 13 | --adapt-lr 1e-3\ 14 | --num-inner-steps 5\ 15 | --lacie-batch-size 64\ 16 | --lacie-buffer-size 400\ 17 | --lacie-num-iter 40\ 18 | --num-process 16 --num-steps 1000 --log-interval 5 \ 19 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 20 | --log-dir lacie_a2c_6 21 | -------------------------------------------------------------------------------- /scripts/grid_search/a2c/lacie/config_7.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_a2c_memory \ 4 | --num-env-steps 50000000\ 5 | --gamma 1\ 6 | --entropy-coef 0.01\ 7 | --regularize-coef 1\ 8 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 9 | --reward-norm-factor 10000\ 10 | --lr 0.001\ 11 | --cpc-lr 0.0005\ 12 | --num-mini-batch 32\ 13 | --adapt-lr 1e-3\ 14 | --num-inner-steps 5\ 15 | --lacie-batch-size 64\ 16 | --lacie-buffer-size 400\ 17 | --lacie-num-iter 40\ 18 | --num-process 16 --num-steps 1000 --log-interval 5 \ 19 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 20 | --log-dir lacie_a2c_7 21 | -------------------------------------------------------------------------------- /scripts/grid_search/ppo/lacie/config_1.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_ppo_memory \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0001\ 13 | --cpc-lr 0.001\ 14 | --num-mini-batch 4\ 15 | --adapt-lr 1e-3\ 16 | --num-inner-steps 5\ 17 | --lacie-batch-size 64\ 18 | --lacie-buffer-size 400\ 19 | --lacie-num-iter 40\ 20 | --num-process 16 --num-steps 1000 --log-interval 5 \ 21 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 22 | --log-dir lacie_ppo_1 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/lacie/config_2.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 
\ 3 | --algo lacie_ppo_memory \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0001\ 13 | --cpc-lr 0.001\ 14 | --num-mini-batch 4\ 15 | --adapt-lr 1e-3\ 16 | --num-inner-steps 5\ 17 | --lacie-batch-size 32\ 18 | --lacie-buffer-size 400\ 19 | --lacie-num-iter 40\ 20 | --num-process 16 --num-steps 1000 --log-interval 5 \ 21 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 22 | --log-dir lacie_ppo_2 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/lacie/config_3.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_ppo_memory \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0001\ 13 | --cpc-lr 0.001\ 14 | --num-mini-batch 4\ 15 | --adapt-lr 1e-3\ 16 | --num-inner-steps 5\ 17 | --lacie-batch-size 16\ 18 | --lacie-buffer-size 400\ 19 | --lacie-num-iter 40\ 20 | --num-process 16 --num-steps 1000 --log-interval 5 \ 21 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 22 | --log-dir lacie_ppo_3 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/lacie/config_6.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_ppo_memory \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0001\ 13 | --cpc-lr 0.001\ 14 | --num-mini-batch 4\ 15 | --adapt-lr 1e-3\ 16 | --num-inner-steps 5\ 17 | --lacie-batch-size 16\ 18 | --lacie-buffer-size 400\ 19 | --lacie-num-iter 25\ 20 | --num-process 16 --num-steps 1000 --log-interval 5 \ 21 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 22 | --log-dir lacie_ppo_6 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/lacie/config_7.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_ppo_memory \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0001\ 13 | --cpc-lr 0.001\ 14 | --num-mini-batch 4\ 15 | --adapt-lr 1e-3\ 16 | --num-inner-steps 5\ 17 | --lacie-batch-size 16\ 18 | --lacie-buffer-size 400\ 19 | --lacie-num-iter 75\ 20 | --num-process 16 --num-steps 1000 --log-interval 5 \ 21 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 22 | --log-dir lacie_ppo_7 -------------------------------------------------------------------------------- 
/scripts/lacie_ppo_load_balance.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_ppo_memory \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 200000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.00025\ 13 | --num-mini-batch 4\ 14 | --eval-interval 100\ 15 | --adapt-lr 1e-3\ 16 | --num-inner-steps 5\ 17 | --lacie-batch-size 64\ 18 | --lacie-buffer-size 400\ 19 | --lacie-num-iter 40\ 20 | --num-process 16 --num-steps 1000 --log-interval 5 \ 21 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 22 | --log-dir lacie_ppo -------------------------------------------------------------------------------- /scripts/grid_search/ppo/lacie/config_4.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_ppo_memory \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0001\ 13 | --cpc-lr 0.0005\ 14 | --num-mini-batch 4\ 15 | --adapt-lr 1e-3\ 16 | --num-inner-steps 5\ 17 | --lacie-batch-size 32\ 18 | --lacie-buffer-size 400\ 19 | --lacie-num-iter 40\ 20 | --num-process 16 --num-steps 1000 --log-interval 5 \ 21 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 22 | --log-dir lacie_ppo_4 -------------------------------------------------------------------------------- /scripts/grid_search/ppo/lacie/config_5.sh: -------------------------------------------------------------------------------- 1 | python main.py --num-stream-jobs 1000 --num-stream-jobs-factor 1.05\ 2 | --num-curriculum-time 1 \ 3 | --algo lacie_ppo_memory \ 4 | --clip-param 0.1\ 5 | --ppo-epoch 4\ 6 | --num-env-steps 50000000\ 7 | --gamma 1\ 8 | --entropy-coef 0.01\ 9 | --regularize-coef 1\ 10 | --load-balance-service-rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 \ 11 | --reward-norm-factor 10000\ 12 | --lr 0.0001\ 13 | --cpc-lr 0.0001\ 14 | --num-mini-batch 4\ 15 | --adapt-lr 1e-3\ 16 | --num-inner-steps 5\ 17 | --lacie-batch-size 32\ 18 | --lacie-buffer-size 400\ 19 | --lacie-num-iter 40\ 20 | --num-process 16 --num-steps 1000 --log-interval 5 \ 21 | --seed 100 --use-memory-to-pred-weights --use-linear-lr-decay\ 22 | --log-dir lacie_ppo_5 -------------------------------------------------------------------------------- /core/agents/models/cnn_base.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from core.agents.models.base import NNBase, Flatten 3 | from core.utils import init 4 | 5 | class CNNBase(NNBase): 6 | def __init__(self, num_inputs, recurrent=False, hidden_size=512): 7 | super(CNNBase, self).__init__(recurrent, hidden_size, hidden_size) 8 | 9 | init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init. 
10 | constant_(x, 0), nn.init.calculate_gain('relu')) 11 | 12 | self.main = nn.Sequential( 13 | init_(nn.Conv2d(num_inputs, 32, 8, stride=4)), nn.ReLU(), 14 | init_(nn.Conv2d(32, 64, 4, stride=2)), nn.ReLU(), 15 | init_(nn.Conv2d(64, 32, 3, stride=1)), nn.ReLU(), Flatten(), 16 | init_(nn.Linear(32 * 7 * 7, hidden_size)), nn.ReLU()) 17 | 18 | init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init. 19 | constant_(x, 0)) 20 | 21 | self.critic_linear = init_(nn.Linear(hidden_size, 1)) 22 | 23 | self.train() 24 | 25 | def forward(self, inputs, rnn_hxs, masks): 26 | x = self.main(inputs / 255.0) 27 | 28 | if self.is_recurrent: 29 | x, rnn_hxs = self._forward_gru(x, rnn_hxs, masks) 30 | 31 | return self.critic_linear(x), x, rnn_hxs 32 | 33 | -------------------------------------------------------------------------------- /core/agents/models/mlp_base.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch import nn 3 | from core.agents.models.base import NNBase 4 | from core.utils import init 5 | 6 | 7 | class MLPBase(NNBase): 8 | def __init__(self, num_inputs, recurrent=False, hidden_size=64): 9 | super(MLPBase, self).__init__(recurrent, num_inputs, hidden_size) 10 | 11 | def init_(m): return init(m, nn.init.orthogonal_, lambda x: nn.init. 12 | constant_(x, 0), np.sqrt(2)) 13 | 14 | if recurrent: 15 | num_inputs = hidden_size 16 | 17 | self.actor = nn.Sequential( 18 | init_(nn.Linear(num_inputs, hidden_size)), nn.Tanh(), 19 | init_(nn.Linear(hidden_size, hidden_size)), nn.Tanh(), 20 | ) 21 | 22 | self.critic = nn.Sequential( 23 | init_(nn.Linear(num_inputs, hidden_size)), nn.Tanh(), 24 | init_(nn.Linear(hidden_size, hidden_size)), nn.Tanh(), 25 | init_(nn.Linear(hidden_size, hidden_size)), nn.Tanh(), 26 | init_(nn.Linear(hidden_size, 1))) 27 | 28 | self.train() 29 | 30 | def forward(self, inputs, rnn_hxs, masks): 31 | x = inputs 32 | 33 | if self.is_recurrent: 34 | x, rnn_hxs = self._forward_gru(x, rnn_hxs, masks) 35 | 36 | value = self.critic(x) 37 | hidden_actor = self.actor(x) 38 | 39 | return value, hidden_actor, rnn_hxs 40 | -------------------------------------------------------------------------------- /core/algorithms/base_algo.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from torch import optim, nn 3 | 4 | import torch 5 | 6 | 7 | class BaseAlgo(ABC): 8 | IL_DECAY_RATE = 0.995 # decay factor of imitation learning 9 | ENTROPY_DECAY_RATE = 0.999 10 | MIN_ENTROPY_COEF = 0.0001 11 | 12 | def __init__(self, 13 | actor_critic, 14 | lr, 15 | value_coef, 16 | entropy_coef, 17 | expert=None, 18 | il_coef=1): 19 | self.actor_critic = actor_critic 20 | self.optimizer = optim.Adam(actor_critic.parameters(), lr) 21 | 22 | self.value_coef = value_coef 23 | self.entropy_coef = entropy_coef 24 | 25 | self.il_coef = il_coef 26 | self.expert = expert 27 | self.il_criterion = nn.CrossEntropyLoss() 28 | 29 | def update(self, rollouts): 30 | pass 31 | 32 | def imitation_learning(self, inputs, rnn_hxs, masks, expert): 33 | """ 34 | Imitation learning loss 35 | 36 | :param inputs: state observations 37 | 38 | :param rnn_hxs: rnn hidden state 39 | 40 | :param masks: mask the final state with 0 value 41 | 42 | :param expert: a trained or heuristic agent 43 | 44 | :return: log probability of expert's actions 45 | """ 46 | _, actor_features, _ = self.actor_critic.base(inputs, rnn_hxs, masks) 47 | dist = self.actor_critic.dist(actor_features) 48 | 49 | expert_actions = 
expert.act(inputs) 50 | 51 | il_loss = self.il_criterion(dist.probs, expert_actions.reshape(-1)) 52 | accuracy = (torch.argmax(dist.probs, dim=1) == 53 | expert_actions.reshape(-1)).float().sum()/expert_actions.shape[0] 54 | 55 | return il_loss, accuracy 56 | 57 | def after_update(self): 58 | self.il_coef *= self.IL_DECAY_RATE 59 | self.entropy_coef = max( 60 | self.entropy_coef * self.ENTROPY_DECAY_RATE, self.MIN_ENTROPY_COEF) 61 | -------------------------------------------------------------------------------- /core/utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import json 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | from core.envs.park_envs import VecNormalize 9 | 10 | 11 | # Get a render function 12 | def get_render_func(venv): 13 | if hasattr(venv, 'envs'): 14 | return venv.envs[0].render 15 | elif hasattr(venv, 'venv'): 16 | return get_render_func(venv.venv) 17 | elif hasattr(venv, 'env'): 18 | return get_render_func(venv.env) 19 | 20 | return None 21 | 22 | 23 | def get_vec_normalize(venv): 24 | if isinstance(venv, VecNormalize): 25 | return venv 26 | elif hasattr(venv, 'venv'): 27 | return get_vec_normalize(venv.venv) 28 | 29 | return None 30 | 31 | 32 | # Necessary for my KFAC implementation. 33 | class AddBias(nn.Module): 34 | def __init__(self, bias): 35 | super(AddBias, self).__init__() 36 | self._bias = nn.Parameter(bias.unsqueeze(1)) 37 | 38 | def forward(self, x): 39 | if x.dim() == 2: 40 | bias = self._bias.t().view(1, -1) 41 | else: 42 | bias = self._bias.t().view(1, -1, 1, 1) 43 | 44 | return x + bias 45 | 46 | 47 | def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr): 48 | """Decreases the learning rate linearly""" 49 | lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs))) 50 | for param_group in optimizer.param_groups: 51 | param_group['lr'] = lr 52 | return lr 53 | 54 | 55 | def init(module, weight_init, bias_init, gain=1): 56 | weight_init(module.weight.data, gain=gain) 57 | if hasattr(module, 'bias') and (not module.bias is None): 58 | bias_init(module.bias.data) 59 | return module 60 | 61 | 62 | def cleanup_log_dir(log_dir): 63 | try: 64 | os.makedirs(log_dir) 65 | except OSError: 66 | files = glob.glob(os.path.join(log_dir, '*.monitor.csv')) 67 | for f in files: 68 | os.remove(f) 69 | 70 | 71 | def dump_config(argparser, output_path): 72 | with open(output_path, 'w') as f: 73 | json.dump(argparser.__dict__, f, indent=2) 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Learning to Assign Credit in Input-driven Environment (LACIE) reduces the variance of advantage estimates in noisy MDPs by using a hindsight distribution. 2 | 3 | ## Input-driven MDP 4 | Input-driven MDPs are Markov decision processes governed not only by the agent's actions but also by stochastic, exogenous input processes [1]. These environments are inherently high-variance, which makes it hard to learn an optimal policy. 5 | 6 | This repository implements: 7 | 8 | + Input-dependent baselines, as proposed in [1]. 9 | 10 | + **Lacie** - an algorithm that learns to weight the advantages of each rollout in hindsight with respect to future input sequences. 11 | 12 | ## Install Dependencies 13 | 14 | 1. Install Pytorch 15 | 16 | ```bash 17 | pip install torch torchvision 18 | ``` 19 | 20 | 2.
Install Tensorflow 2 21 | 22 | ```bash 23 | pip install tensorflow==2.2 24 | ``` 25 | or 26 | ```bash 27 | pip install tensorflow-gpu==2.2 28 | ``` 29 | 30 | 3. Install [OpenAI baseline](https://github.com/openai/baselines/tree/tf2) (Tensorflow 2 version) 31 | ```bash 32 | git clone https://github.com/openai/baselines.git -b tf2 && \ 33 | cd baselines && \ 34 | pip install -e . 35 | ``` 36 | 37 | **Note**: I haven't tested the code on Tensorflow 1 yet, but it should work as well. 38 | 39 | 4. Install [Park Platform](https://github.com/park-project/park). I modified the platform slightly to make it compatible with OpenAI's baseline. 40 | ```bash 41 | git clone https://github.com/lehduong/park &&\ 42 | cd park && \ 43 | pip install -e . 44 | ``` 45 | 46 | ## Run experiments 47 | See `scripts` for examples. 48 | 49 | ## Results 50 | **Reward** of A2C+Lacie (yellow) vs A2C (blue) 51 | ![reward](assets/reward.svg) 52 | 53 | **Value loss** of A2C+Lacie (yellow) vs A2C (blue) during training: 54 | ![train-value-loss](assets/train_value_loss.svg) 55 | 56 | ## Reference 57 | 58 | [1] [Variance Reduction for Reinforcement Learning in Input-Driven Environments](https://openreview.net/forum?id=Hyg1G2AqtQ). 59 | 60 | ## Acknowledgement 61 | The starter code is based on [ikostrikov's repository](https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail). 62 | 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | ex/ 131 | ex_eval/ 132 | trained_models/ 133 | .DS_Store 134 | .idea/ 135 | .vscode/ 136 | logs/ -------------------------------------------------------------------------------- /core/agents/models/bncnn.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from core.agents.models.base import NNBase, Flatten 3 | from core.utils import init 4 | 5 | class BNCNN(NNBase): 6 | def __init__(self, num_inputs, recurrent=False, hidden_size=512): 7 | super().__init__(recurrent, hidden_size, hidden_size) 8 | 9 | # shared weight that encode state to vector 10 | self.main = nn.Sequential( 11 | self.init_weight(nn.Conv2d(num_inputs, 32, 4, stride=2, bias=False)), 12 | self.init_weight(nn.BatchNorm2d(32)), # 32x41x41 13 | nn.ReLU(), 14 | self.init_weight(nn.Conv2d(32, 64, 5, stride=2, bias=False)), 15 | self.init_weight(nn.BatchNorm2d(64)), # 64x19x19 16 | nn.ReLU(), 17 | self.init_weight(nn.Conv2d(64, 128, 3, stride=2, bias=False)), 18 | self.init_weight(nn.BatchNorm2d(128)), # 128x9x9 19 | nn.ReLU(), 20 | self.init_weight(nn.Conv2d(128, 256, 5, stride=2, bias=False)), 21 | self.init_weight(nn.BatchNorm2d(256)), 22 | nn.ReLU(), 23 | Flatten(), # 256x3x3 24 | self.init_weight(nn.Linear(256*3*3, hidden_size)), 25 | nn.ReLU()) 26 | 27 | self.critic_linear = nn.Sequential( 28 | self.init_weight(nn.Linear(hidden_size, hidden_size)), 29 | nn.ReLU(), 30 | self.init_weight(nn.Linear(hidden_size, hidden_size)), 31 | nn.ReLU(), 32 | self.init_weight(nn.Linear(hidden_size, 1)) 33 | ) 34 | 35 | # encoder for learning contrastive predictive objective 36 | self.contrastive_encoder = nn.Sequential( 37 | self.init_weight(nn.Linear(hidden_size, hidden_size)), 38 | nn.ReLU(), 39 | self.init_weight(nn.Linear(hidden_size, hidden_size)), 40 | nn.ReLU(), 41 | self.init_weight(nn.Linear(hidden_size, hidden_size)) 42 | ) 43 | self.train() 44 | 45 | def init_weight(self, layer): 46 | init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init. 
47 | constant_(x, 0), nn.init.calculate_gain('relu')) 48 | if isinstance(layer, nn.Conv2d) or isinstance(layer, nn.Linear): 49 | return init_(layer) 50 | elif isinstance(layer, nn.BatchNorm2d): 51 | layer.weight.data.fill_(1) 52 | if hasattr(layer, 'bias'): 53 | layer.bias.data.zero_() 54 | return layer 55 | 56 | def forward(self, inputs, rnn_hxs, masks): 57 | x = self.main(inputs / 255.0) 58 | 59 | if self.is_recurrent: 60 | x, rnn_hxs = self._forward_gru(x, rnn_hxs, masks) 61 | 62 | return self.critic_linear(x), self.contrastive_encoder(x), rnn_hxs 63 | -------------------------------------------------------------------------------- /core/agents/pg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | from core.distributions import Bernoulli, Categorical, DiagGaussian 6 | from core.utils import init 7 | from core.agents.models import CNNBase, MLPBase 8 | 9 | 10 | class Policy(nn.Module): 11 | def __init__(self, obs_shape, action_space, base=None, base_kwargs=None): 12 | super(Policy, self).__init__() 13 | # create base network for acting 14 | if base_kwargs is None: 15 | base_kwargs = {} 16 | if base is None: 17 | if len(obs_shape) == 3: 18 | base = CNNBase 19 | elif len(obs_shape) == 1: 20 | base = MLPBase 21 | else: 22 | raise NotImplementedError( 23 | "Expect the observation's shape to be either 3 or 1 but got {}".format(len(obs_shape))) 24 | 25 | self.base = base(obs_shape[0], **base_kwargs) 26 | 27 | # action sampling mechanism 28 | if action_space.__class__.__name__ == "Discrete": 29 | num_outputs = action_space.n 30 | self.dist = Categorical(self.base.output_size, num_outputs) 31 | elif action_space.__class__.__name__ == "Box": 32 | num_outputs = action_space.shape[0] 33 | self.dist = DiagGaussian(self.base.output_size, num_outputs) 34 | elif action_space.__class__.__name__ == "MultiBinary": 35 | num_outputs = action_space.shape[0] 36 | self.dist = Bernoulli(self.base.output_size, num_outputs) 37 | else: 38 | raise NotImplementedError 39 | 40 | self.obs_shape = obs_shape 41 | self.action_space = action_space 42 | 43 | @property 44 | def is_recurrent(self): 45 | return self.base.is_recurrent 46 | 47 | @property 48 | def recurrent_hidden_state_size(self): 49 | """Size of rnn_hx.""" 50 | return self.base.recurrent_hidden_state_size 51 | 52 | def forward(self, inputs, rnn_hxs, masks): 53 | raise NotImplementedError 54 | 55 | def act(self, inputs, rnn_hxs, masks, deterministic=False): 56 | value, actor_features, rnn_hxs = self.base(inputs, rnn_hxs, masks) 57 | dist = self.dist(actor_features) 58 | 59 | # if deterministic greedily choose the most optimal solution otherwise sampling with probability proportional to cummulate reward 60 | if deterministic: 61 | action = dist.mode() 62 | else: 63 | action = dist.sample() 64 | 65 | action_log_probs = dist.log_probs(action) 66 | dist_entropy = dist.entropy().mean() 67 | 68 | return value, action, action_log_probs, rnn_hxs 69 | 70 | def get_value(self, inputs, rnn_hxs, masks): 71 | value, _, _ = self.base(inputs, rnn_hxs, masks) 72 | return value 73 | 74 | def evaluate_actions(self, inputs, rnn_hxs, masks, action): 75 | value, actor_features, rnn_hxs = self.base(inputs, rnn_hxs, masks) 76 | dist = self.dist(actor_features) 77 | 78 | action_log_probs = dist.log_probs(action) 79 | dist_entropy = dist.entropy().mean() 80 | 81 | return value, action_log_probs, dist_entropy, rnn_hxs 82 | 
-------------------------------------------------------------------------------- /core/envs/load_balance_wrappers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | import numpy as np 4 | 5 | 6 | class ProcessLoadBalanceObservation(gym.ObservationWrapper): 7 | """ 8 | Normalize and clip the observation of LoadBalance environment 9 | :param job_size_norm_factor: float - divide job_size by this factor 10 | :param highest_server_obs: float - clip the server (in observation) having load higher than this value 11 | :param highest_job_obs: float - clip the job (in observation) having size greater than this value 12 | """ 13 | 14 | def __init__(self, 15 | env, 16 | job_size_norm_factor, 17 | server_load_norm_factor, 18 | highest_server_obs, 19 | highest_job_obs, 20 | elapsed_time_norm_factor, 21 | highest_elapsed_time): 22 | super().__init__(env) 23 | self.job_size_norm_factor = job_size_norm_factor 24 | self.server_load_norm_factor = server_load_norm_factor 25 | self.elapsed_time_norm_factor = elapsed_time_norm_factor 26 | self.highest_server_obs = highest_server_obs 27 | self.highest_job_obs = highest_job_obs 28 | self.highest_elapsed_time = highest_elapsed_time 29 | 30 | # compute clip threshold 31 | num_server = len(env.servers) 32 | self.threshold = np.array( 33 | [self.highest_server_obs] * num_server + 34 | [self.highest_job_obs] + 35 | [self.highest_elapsed_time]) 36 | # compute the normalize vector 37 | self.norm_vec = np.array( 38 | [self.server_load_norm_factor] * num_server + 39 | [self.job_size_norm_factor] + 40 | [self.elapsed_time_norm_factor]) 41 | 42 | def observation(self, observation): 43 | # normalized 44 | observation = observation/self.norm_vec 45 | return np.minimum(observation, self.threshold) 46 | 47 | 48 | class LoadBalanceRandomReset(gym.Wrapper): 49 | def __init__(self, env, max_random_steps=50): 50 | """Sample initial states by taking random number of no-ops on reset. 
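:param max_random_steps: exclusive upper bound on the number of random warm-up actions (uniform over servers) taken after each reset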
51 | """ 52 | super().__init__(env) 53 | self.max_random_steps = max_random_steps 54 | 55 | def reset(self, **kwargs): 56 | """ Do no-op action for a number of steps in [1, noop_max].""" 57 | obs = self.env.reset(**kwargs) 58 | 59 | # stochastically change number of random steps each time resetting the env 60 | num_random_steps = np.random.randint(0, self.max_random_steps) 61 | 62 | for _ in range(num_random_steps): 63 | obs, _, done, _ = self.env.step( 64 | random.randint(0, len(self.env.servers)-1)) 65 | if done: 66 | obs = self.env.reset(**kwargs) 67 | return obs 68 | 69 | 70 | class RewardNormalize(gym.RewardWrapper): 71 | """ 72 | Divide the reward by a fixed value 73 | """ 74 | 75 | def __init__(self, env, norm_factor): 76 | super().__init__(env) 77 | self.norm_factor = norm_factor 78 | 79 | def reward(self, reward): 80 | return reward/self.norm_factor 81 | 82 | 83 | class FixJobSequence(gym.Wrapper): 84 | """ 85 | Set the random seed of environment to a fixed value every time it reset\ 86 | thus, the job arrival sequence would be unchanged 87 | """ 88 | 89 | def __init__(self, env, seed=0): 90 | super().__init__(env) 91 | self.random_seed = seed 92 | 93 | def reset(self): 94 | self.env.seed(self.random_seed) 95 | return self.env.reset() 96 | -------------------------------------------------------------------------------- /core/algorithms/input_dependent_baseline/mib_a2c.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .base_meta_critic import ActorMetaCriticAlgo 5 | 6 | DECAY_RATE = 0.995 7 | 8 | 9 | class MIB_A2C(ActorMetaCriticAlgo): 10 | """ 11 | Meta Input-dependent Baseline A2C. \ 12 | This A2C class leverages input-dependent baseline, which is learned with meta learning, \ 13 | to reduce variance when updating parameters 14 | """ 15 | 16 | def __init__(self, 17 | actor_critic, 18 | entropy_coef, 19 | lr=1e-3, 20 | adapt_lr=1e-3, 21 | num_inner_steps=5, 22 | max_grad_norm=None, 23 | expert=None, 24 | il_coef=10): 25 | super().__init__(actor_critic=actor_critic, 26 | lr=lr, 27 | adapt_lr=adapt_lr, 28 | num_inner_steps=num_inner_steps) 29 | self.entropy_coef = entropy_coef 30 | self.max_grad_norm = max_grad_norm 31 | 32 | self.expert = expert 33 | self.il_coef = il_coef 34 | 35 | def update(self, rollouts): 36 | obs_shape = rollouts.obs.size()[2:] 37 | action_shape = rollouts.actions.size()[-1] 38 | num_steps, num_processes, _ = rollouts.rewards.size() 39 | 40 | # action loss + entropy loss 41 | values, value_loss = self.train_meta_critic_and_predict_values( 42 | rollouts) 43 | _, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 44 | rollouts.obs[:-1].view(-1, *obs_shape), 45 | rollouts.recurrent_hidden_states[0].view( 46 | -1, self.actor_critic.recurrent_hidden_state_size), 47 | rollouts.masks[:-1].view(-1, 1), 48 | rollouts.actions.view(-1, action_shape)) 49 | 50 | values = values.view(num_steps, num_processes, 1) 51 | action_log_probs = action_log_probs.view(num_steps, num_processes, 1) 52 | 53 | advantages = rollouts.returns[:-1] - values 54 | # Normalize advantages? 
55 | advantages = (advantages - advantages.mean())/(advantages.std() + 1e-5) 56 | 57 | action_loss = -(advantages.detach() * action_log_probs).mean() 58 | 59 | # imitation learning 60 | imitation_loss, accuracy = torch.tensor(0).to(rollouts.obs.device), 0 61 | if self.expert: 62 | imitation_loss, accuracy = self.imitation_learning( 63 | rollouts.obs[:-1].view(-1, *obs_shape), 64 | rollouts.recurrent_hidden_states[0].view( 65 | -1, self.actor_critic.recurrent_hidden_state_size), 66 | rollouts.masks[:-1].view(-1, 1), 67 | self.expert) 68 | # ----------------------------------------------------- 69 | 70 | self.optimizer.zero_grad() 71 | 72 | # total loss 73 | loss = action_loss + self.il_coef * \ 74 | imitation_loss - self.entropy_coef * dist_entropy 75 | loss.backward() 76 | 77 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 78 | self.max_grad_norm) 79 | 80 | self.optimizer.step() 81 | 82 | # reduce the weight of imitation learning during training process 83 | self.il_coef = self.il_coef * DECAY_RATE 84 | 85 | return { 86 | 'value loss': value_loss, 87 | 'action loss': action_loss.item(), 88 | 'entropy loss': dist_entropy.item(), 89 | 'imitation loss': imitation_loss.item(), 90 | 'accuracy': accuracy 91 | } 92 | -------------------------------------------------------------------------------- /core/distributions.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from core.utils import AddBias, init 8 | 9 | """ 10 | Modify standard PyTorch distributions so they are compatible with this code. 11 | """ 12 | 13 | # 14 | # Standardize distribution interfaces 15 | # 16 | 17 | # Categorical 18 | 19 | 20 | class FixedCategorical(torch.distributions.Categorical): 21 | def sample(self): 22 | return super().sample().unsqueeze(-1) 23 | 24 | def log_probs(self, actions): 25 | return ( 26 | super() 27 | .log_prob(actions.squeeze(-1)) 28 | .view(actions.size(0), -1) 29 | .sum(-1) 30 | .unsqueeze(-1) 31 | ) 32 | 33 | def mode(self): 34 | return self.probs.argmax(dim=-1, keepdim=True) 35 | 36 | 37 | # Normal 38 | class FixedNormal(torch.distributions.Normal): 39 | def log_probs(self, actions): 40 | return super().log_prob(actions).sum(-1, keepdim=True) 41 | 42 | def entropy(self): 43 | return super().entropy().sum(-1) 44 | 45 | def mode(self): 46 | return self.mean 47 | 48 | 49 | # Bernoulli 50 | class FixedBernoulli(torch.distributions.Bernoulli): 51 | def log_probs(self, actions): 52 | return super().log_prob(actions).view(actions.size(0), -1).sum(-1).unsqueeze(-1) 53 | 54 | def entropy(self): 55 | return super().entropy().sum(-1) 56 | 57 | def mode(self): 58 | return torch.gt(self.probs, 0.5).float() 59 | 60 | 61 | class Categorical(nn.Module): 62 | def __init__(self, num_inputs, num_outputs): 63 | super(Categorical, self).__init__() 64 | 65 | def init_(m): return init( 66 | m, 67 | nn.init.orthogonal_, 68 | lambda x: nn.init.constant_(x, 0), 69 | gain=0.01) 70 | 71 | self.linear = nn.Sequential( 72 | init_(nn.Linear(num_inputs, num_outputs)) 73 | ) 74 | 75 | # hack 76 | # The weight of last layer will be initialized 100 times smaller than regular layer 77 | # according to Sec 3.2 https://arxiv.org/abs/2006.05990 78 | self.linear[0].weight.data = self.linear[0].weight.data*0.01 79 | 80 | def forward(self, x): 81 | x = self.linear(x) 82 | return FixedCategorical(logits=x) 83 | 84 | 85 | class DiagGaussian(nn.Module): 86 | def __init__(self, num_inputs,
num_outputs): 87 | super(DiagGaussian, self).__init__() 88 | 89 | def init_(m): return init(m, nn.init.orthogonal_, lambda x: nn.init. 90 | constant_(x, 0)) 91 | 92 | self.fc_mean = init_(nn.Linear(num_inputs, num_outputs)) 93 | self.logstd = AddBias(torch.zeros(num_outputs)) 94 | 95 | def forward(self, x): 96 | action_mean = self.fc_mean(x) 97 | 98 | # An ugly hack for my KFAC implementation. 99 | zeros = torch.zeros(action_mean.size()) 100 | if x.is_cuda: 101 | zeros = zeros.cuda() 102 | 103 | action_logstd = self.logstd(zeros) 104 | return FixedNormal(action_mean, action_logstd.exp()) 105 | 106 | 107 | class Bernoulli(nn.Module): 108 | def __init__(self, num_inputs, num_outputs): 109 | super(Bernoulli, self).__init__() 110 | 111 | def init_(m): return init(m, nn.init.orthogonal_, lambda x: nn.init. 112 | constant_(x, 0)) 113 | 114 | self.linear = init_(nn.Linear(num_inputs, num_outputs)) 115 | 116 | def forward(self, x): 117 | x = self.linear(x) 118 | return FixedBernoulli(logits=x) 119 | -------------------------------------------------------------------------------- /core/agents/models/base.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | class Flatten(nn.Module): 5 | def forward(self, x): 6 | return x.view(x.size(0), -1) 7 | 8 | 9 | class NNBase(nn.Module): 10 | def __init__(self, recurrent, recurrent_input_size, hidden_size): 11 | super(NNBase, self).__init__() 12 | 13 | self._hidden_size = hidden_size 14 | self._recurrent = recurrent 15 | 16 | if recurrent: 17 | self.gru = nn.GRU(recurrent_input_size, hidden_size) 18 | 19 | @property 20 | def is_recurrent(self): 21 | return self._recurrent 22 | 23 | @property 24 | def recurrent_hidden_state_size(self): 25 | if self._recurrent: 26 | return self._hidden_size 27 | return 1 28 | 29 | @property 30 | def output_size(self): 31 | return self._hidden_size 32 | 33 | def _forward_gru(self, x, hxs, masks): 34 | if x.size(0) == hxs.size(0): 35 | x, hxs = self.gru(x.unsqueeze(0), (hxs * masks).unsqueeze(0)) 36 | x = x.squeeze(0) 37 | hxs = hxs.squeeze(0) 38 | else: 39 | # x is a (T, N, -1) tensor that has been flatten to (T * N, -1) 40 | N = hxs.size(0) 41 | T = int(x.size(0) / N) 42 | 43 | # unflatten 44 | x = x.view(T, N, x.size(1)) 45 | 46 | # Same deal with masks 47 | masks = masks.view(T, N) 48 | 49 | # Let's figure out which steps in the sequence have a zero for any agent 50 | # We will always assume t=0 has a zero in it as that makes the logic cleaner 51 | has_zeros = ((masks[1:] == 0.0) \ 52 | .any(dim=-1) 53 | .nonzero() 54 | .squeeze() 55 | .cpu()) 56 | 57 | # +1 to correct the masks[1:] 58 | if has_zeros.dim() == 0: 59 | # Deal with scalar 60 | has_zeros = [has_zeros.item() + 1] 61 | else: 62 | has_zeros = (has_zeros + 1).numpy().tolist() 63 | 64 | # add t=0 and t=T to the list 65 | has_zeros = [0] + has_zeros + [T] 66 | 67 | hxs = hxs.unsqueeze(0) 68 | outputs = [] 69 | self.gru.flatten_parameters() 70 | for i in range(len(has_zeros) - 1): 71 | # We can now process steps that don't have any zeros in masks together! 
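                # Illustrative example: with T = 5 and an episode boundary at t = 3 in some
                # process, has_zeros = [0, 3, 5]; the GRU is then called once for steps 0..2
                # and once for steps 3..4, with the hidden state multiplied by masks[3]
                # (zeros at the boundary) before the second segment, so no recurrent state
                # leaks across episodes.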
72 | # This is much faster 73 | start_idx = has_zeros[i] 74 | end_idx = has_zeros[i + 1] 75 | 76 | rnn_scores, hxs = self.gru( 77 | x[start_idx:end_idx], 78 | hxs * masks[start_idx].view(1, -1, 1)) 79 | 80 | outputs.append(rnn_scores) 81 | 82 | # assert len(outputs) == T 83 | # x is a (T, N, -1) tensor 84 | x = torch.cat(outputs, dim=0) 85 | # flatten 86 | x = x.view(T * N, -1) 87 | hxs = hxs.squeeze(0) 88 | 89 | return x, hxs 90 | 91 | def init_weight(self, layer): 92 | init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init. 93 | constant_(x, 0), nn.init.calculate_gain('relu')) 94 | if isinstance(layer, nn.Conv2d) or isinstance(layer, nn.Linear): 95 | return init_(layer) 96 | elif isinstance(layer, nn.BatchNorm2d): 97 | layer.weight.data.fill_(1) 98 | if hasattr(layer, 'bias'): 99 | layer.bias.data.zero_() 100 | return layer 101 | 102 | -------------------------------------------------------------------------------- /core/algorithms/a2c_acktr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | 5 | from core.algorithms.kfac import KFACOptimizer 6 | from core.algorithms.base_algo import BaseAlgo 7 | 8 | 9 | class A2C_ACKTR(BaseAlgo): 10 | def __init__(self, 11 | actor_critic, 12 | value_loss_coef, 13 | entropy_coef, 14 | lr=None, 15 | eps=None, 16 | alpha=None, 17 | max_grad_norm=None, 18 | acktr=False, 19 | expert=None, 20 | il_coef=1): 21 | super().__init__(actor_critic, lr, value_loss_coef, entropy_coef, expert, il_coef) 22 | self.acktr = acktr 23 | 24 | self.max_grad_norm = max_grad_norm 25 | 26 | if acktr: 27 | self.optimizer = KFACOptimizer(actor_critic) 28 | 29 | def update(self, rollouts): 30 | obs_shape = rollouts.obs.size()[2:] 31 | action_shape = rollouts.actions.size()[-1] 32 | num_steps, num_processes, _ = rollouts.rewards.size() 33 | 34 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 35 | rollouts.obs[:-1].view(-1, *obs_shape), 36 | rollouts.recurrent_hidden_states[:-1].view( 37 | -1, self.actor_critic.recurrent_hidden_state_size), 38 | rollouts.masks[:-1].view(-1, 1), 39 | rollouts.actions.view(-1, action_shape)) 40 | 41 | values = values.view(num_steps, num_processes, 1) 42 | action_log_probs = action_log_probs.view(num_steps, num_processes, 1) 43 | 44 | advantages = rollouts.returns[:-1] - values 45 | value_loss = advantages.pow(2).mean() 46 | 47 | action_loss = -(advantages.detach() * action_log_probs).mean() 48 | 49 | # imitation learning 50 | imitation_loss, accuracy = torch.tensor(0).to(rollouts.obs.device), 0 51 | if self.expert: 52 | imitation_loss, accuracy = self.imitation_learning( 53 | rollouts.obs[:-1].view(-1, *obs_shape), 54 | rollouts.recurrent_hidden_states[0].view( 55 | -1, self.actor_critic.recurrent_hidden_state_size), 56 | rollouts.masks[:-1].view(-1, 1), 57 | self.expert) 58 | 59 | if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0: 60 | # Compute fisher, see Martens 2014 61 | self.actor_critic.zero_grad() 62 | pg_fisher_loss = -action_log_probs.mean() 63 | 64 | value_noise = torch.randn(values.size()) 65 | if values.is_cuda: 66 | value_noise = value_noise.cuda() 67 | 68 | sample_values = values + value_noise 69 | vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean() 70 | 71 | fisher_loss = pg_fisher_loss + vf_fisher_loss 72 | self.optimizer.acc_stats = True 73 | fisher_loss.backward(retain_graph=True) 74 | self.optimizer.acc_stats = False 75 | 76 | self.optimizer.zero_grad() 
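        # The step below optimizes a single combined objective: imitation loss (scaled by
        # il_coef, only non-zero when an expert is provided) + value_coef * value loss
        # + policy-gradient action loss - entropy_coef * entropy bonus.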
77 | (imitation_loss * self.il_coef + value_loss * self.value_coef + action_loss - 78 | dist_entropy * self.entropy_coef).backward() 79 | 80 | if self.acktr == False: 81 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 82 | self.max_grad_norm) 83 | 84 | self.optimizer.step() 85 | self.after_update() 86 | 87 | return { 88 | 'value loss': value_loss.item(), 89 | 'action loss': action_loss.item(), 90 | 'entropy loss': dist_entropy.item(), 91 | 'imitation loss': imitation_loss.item(), 92 | 'accuracy': accuracy 93 | } 94 | -------------------------------------------------------------------------------- /core/storage/lacie_storage.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class LacieStorage(object): 6 | def __init__(self, num_steps, obs_shape, action_space, 7 | max_size=10000, 8 | batch_size=64, 9 | n_processes=16): 10 | # obs 11 | self.obs = torch.zeros(max_size, num_steps + 1, * obs_shape) 12 | 13 | # action 14 | if action_space.__class__.__name__ == 'Discrete': 15 | action_shape = 1 16 | else: 17 | action_shape = action_space.shape[0] 18 | self.actions = torch.zeros(max_size, num_steps, action_shape) 19 | if action_space.__class__.__name__ == 'Discrete': 20 | self.actions = self.actions.long() 21 | 22 | # mask 23 | self.masks = torch.ones(max_size, num_steps + 1, 1) 24 | 25 | # advantages 26 | self.advantages = torch.zeros(max_size, num_steps, 1) 27 | 28 | self.ptr, self.size, self.max_size = 0, 0, max_size 29 | 30 | self.batch_size = batch_size 31 | self.n_processes = n_processes 32 | 33 | def to(self, device): 34 | self.obs = self.obs.to(device) 35 | self.actions = self.actions.to(device) 36 | self.masks = self.masks.to(device) 37 | self.advantages = self.advantages.to(device) 38 | 39 | def insert(self, rollouts, advantages): 40 | """ 41 | Update the buffer with new rollouts from Storages mem 42 | :param obs: torch.Tensor of shape (num_steps + 1, n_processes, obs_shape) 43 | :param actions: torch.Tensor of shape (num_steps, n_processes, action_shape) 44 | :param masks: torch.Tensor of shape (num_steps + 1, n_processes, 1) 45 | :param advantages: torch.Tensor of shape (num_steps + 1, n_processes, 1) 46 | """ 47 | obs = rollouts.obs.permute(1, 0, 2) 48 | actions = rollouts.actions.permute(1, 0, 2) 49 | masks = rollouts.masks.permute(1, 0, 2) 50 | advantages = advantages.permute(1, 0, 2) 51 | n = obs.shape[0] 52 | 53 | idxs = np.arange(self.ptr, self.ptr + n) % self.max_size 54 | self.obs[idxs] = obs 55 | self.actions[idxs] = actions 56 | self.masks[idxs] = masks 57 | self.advantages[idxs] = advantages 58 | self.ptr = (self.ptr + n) % self.max_size 59 | 60 | self.size = min(self.size + n, self.max_size) 61 | 62 | def sample(self): 63 | idxs = np.random.choice( 64 | self.size, min(self.batch_size, self.size)) 65 | batch = dict(obs=self.obs[idxs], 66 | actions=self.actions[idxs], 67 | advantages=self.advantages[idxs], 68 | masks=self.masks[idxs]) 69 | 70 | # permute tensor to shape n_steps x batch_size x shape 71 | return {k: v.permute(1, 0, 2) for k, v in batch.items()} 72 | 73 | def sample_most_recent(self): 74 | if self.size < self.batch_size: 75 | idxs = np.arange(0, self.size) 76 | else: 77 | idxs = np.arange(self.ptr - self.batch_size, 78 | self.ptr) % self.max_size 79 | # the first n_procecsses indexes will be used to storage current rollout 80 | # the rest are most recent rollouts 81 | idxs = np.concatenate( 82 | [ 83 | idxs[-self.n_processes:], 84 | idxs[:-self.n_processes] 85 | ] 86 
| ) 87 | batch = dict(obs=self.obs[idxs], 88 | actions=self.actions[idxs], 89 | advantages=self.advantages[idxs], 90 | masks=self.masks[idxs]) 91 | 92 | # permute tensor to shape n_steps x batch_size x shape 93 | return {k: v.permute(1, 0, 2) for k, v in batch.items()} 94 | -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import copy 4 | 5 | from core import utils 6 | from core.envs import make_vec_envs 7 | from core.agents.heuristic.load_balance import LeastWorkAgent, \ 8 | ShortestProcessingTimeAgent, RandomAllocateAgent, EarliestCompletionTimeAgent 9 | 10 | NUM_EVAL_EPISODES = 64 11 | 12 | 13 | def evaluate(actor_critic, env_name, seed, num_processes, eval_log_dir, 14 | device, env_args=None): 15 | seed = seed if env_args.fix_job_sequence else seed + num_processes 16 | num_processes = 1 if env_args.fix_job_sequence else num_processes 17 | 18 | returns = benchmark_heuristic([LeastWorkAgent(), 19 | RandomAllocateAgent(), 20 | EarliestCompletionTimeAgent( 21 | env_args.load_balance_service_rates)], 22 | env_name=env_name, 23 | seed=seed, 24 | num_processes=num_processes, 25 | log_dir=eval_log_dir, 26 | device=device, 27 | args=env_args) 28 | # benchmark heuristic 29 | # least_work 30 | eval_envs = make_vec_envs(env_name=env_name, 31 | seed=seed, 32 | num_processes=num_processes, 33 | log_dir=eval_log_dir, 34 | device=device, 35 | allow_early_resets=True, 36 | train=False, 37 | args=env_args) 38 | eval_episode_rewards = [] 39 | 40 | obs = eval_envs.reset() 41 | eval_recurrent_hidden_states = torch.zeros( 42 | num_processes, actor_critic.recurrent_hidden_state_size, device=device) 43 | eval_masks = torch.zeros(num_processes, 1, device=device) 44 | 45 | while len(eval_episode_rewards) < NUM_EVAL_EPISODES: 46 | with torch.no_grad(): 47 | _, action, _, eval_recurrent_hidden_states = actor_critic.act( 48 | obs, 49 | eval_recurrent_hidden_states, 50 | eval_masks, 51 | deterministic=True) 52 | 53 | # Obser reward and next obs 54 | # FIXME: debug why actions must be moved to cpu? 
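        # A possible explanation (unverified): VecPyTorch.step_async only squeezes the
        # action dimension when isinstance(actions, torch.LongTensor) is True, and that
        # check does not match CUDA LongTensors, so a GPU tensor could reach the wrapped
        # env with an extra dimension; calling .cpu() here avoids that path.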
55 | obs, _, done, infos = eval_envs.step(action.cpu()) 56 | 57 | eval_masks = torch.tensor( 58 | [[0.0] if done_ else [1.0] for done_ in done], 59 | dtype=torch.float32, 60 | device=device) 61 | 62 | for info in infos: 63 | if 'episode' in info.keys(): 64 | eval_episode_rewards.append(info['episode']['r']) 65 | 66 | eval_envs.close() 67 | returns['RLAgent'] = eval_episode_rewards 68 | 69 | # print out the result 70 | for k, v in returns.items(): 71 | print(" => Evaluate {} using {} episodes: mean reward {:.5f}".format( 72 | k, len(v), np.mean(v))) 73 | return returns 74 | 75 | 76 | def benchmark_single_heuristic(agent, eval_envs): 77 | """ 78 | Compute return of a single heuristic agent 79 | """ 80 | obs = eval_envs.reset() 81 | eval_episode_rewards = [] 82 | 83 | while len(eval_episode_rewards) < NUM_EVAL_EPISODES: 84 | action = agent.act(obs) 85 | # Obser reward and next obs 86 | 87 | obs, _, done, infos = eval_envs.step(action.cpu()) 88 | 89 | for info in infos: 90 | if 'episode' in info.keys(): 91 | eval_episode_rewards.append(info['episode']['r']) 92 | 93 | eval_envs.close() 94 | 95 | return eval_episode_rewards 96 | 97 | 98 | def benchmark_heuristic(agents, **kwargs): 99 | """ 100 | Compute return of all heuristics 101 | """ 102 | ret = {} 103 | for agent in agents: 104 | envs = make_vec_envs(env_name=kwargs['env_name'], 105 | seed=kwargs['seed'], 106 | num_processes=kwargs['num_processes'], 107 | log_dir=kwargs['log_dir'], 108 | device=kwargs['device'], 109 | allow_early_resets=True, 110 | train=False, 111 | args=kwargs['args']) 112 | 113 | eval_episode_rewards = benchmark_single_heuristic(agent, envs) 114 | # append the result to return dictionary 115 | ret[agent.__class__.__name__] = eval_episode_rewards 116 | 117 | return ret 118 | -------------------------------------------------------------------------------- /core/algorithms/input_dependent_baseline/mib_ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .base_meta_critic import ActorMetaCriticAlgo 5 | 6 | DECAY_RATE = 0.995 7 | 8 | 9 | class MIB_PPO(ActorMetaCriticAlgo): 10 | def __init__(self, 11 | actor_critic, 12 | clip_param, 13 | ppo_epoch, 14 | num_mini_batch, 15 | entropy_coef, 16 | lr=None, 17 | adapt_lr=None, 18 | num_inner_steps=5, 19 | max_grad_norm=None, 20 | expert=None, 21 | il_coef=10): 22 | 23 | super().__init__(actor_critic, lr, adapt_lr, num_inner_steps) 24 | 25 | # PPO Args 26 | self.clip_param = clip_param 27 | self.ppo_epoch = ppo_epoch 28 | self.num_mini_batch = num_mini_batch 29 | 30 | self.entropy_coef = entropy_coef 31 | self.max_grad_norm = max_grad_norm 32 | 33 | self.expert = expert 34 | self.il_coef = il_coef 35 | 36 | def update(self, rollouts): 37 | obs_shape = rollouts.obs.size()[2:] 38 | action_shape = rollouts.actions.size()[-1] 39 | num_steps, num_processes, _ = rollouts.rewards.size() 40 | 41 | # action loss + entropy loss 42 | value_preds, value_loss = self.train_meta_critic_and_predict_values( 43 | rollouts) 44 | _, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 45 | rollouts.obs[:-1].view(-1, *obs_shape), 46 | rollouts.recurrent_hidden_states[0].view( 47 | -1, self.actor_critic.recurrent_hidden_state_size), 48 | rollouts.masks[:-1].view(-1, 1), 49 | rollouts.actions.view(-1, action_shape)) 50 | 51 | value_preds = value_preds.view(num_steps, num_processes, 1) 52 | 53 | advantages = rollouts.returns[:-1] - value_preds 54 | advantages = (advantages - advantages.mean()) / ( 
55 | advantages.std() + 1e-5) 56 | 57 | advantages = advantages.detach() 58 | 59 | action_loss_epoch = 0 60 | dist_entropy_epoch = 0 61 | imitation_loss_epoch = 0 62 | accuracy_epoch = 0 63 | 64 | for _ in range(self.ppo_epoch): 65 | if self.actor_critic.is_recurrent: 66 | data_generator = rollouts.recurrent_generator( 67 | advantages, self.num_mini_batch) 68 | else: 69 | data_generator = rollouts.feed_forward_generator( 70 | advantages, self.num_mini_batch) 71 | 72 | for sample in data_generator: 73 | obs_batch, recurrent_hidden_states_batch, actions_batch, \ 74 | _, _, masks_batch, old_action_log_probs_batch, \ 75 | adv_targ = sample 76 | 77 | # Reshape to do in a single forward pass for all steps 78 | value_preds, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 79 | obs_batch, recurrent_hidden_states_batch, masks_batch, 80 | actions_batch) 81 | 82 | ratio = torch.exp(action_log_probs - 83 | old_action_log_probs_batch) 84 | surr1 = ratio * adv_targ 85 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 86 | 1.0 + self.clip_param) * adv_targ 87 | action_loss = -torch.min(surr1, surr2).mean() 88 | 89 | # imitation learning 90 | imitation_loss, accuracy = torch.tensor( 91 | 0).to(action_loss.device), 0 92 | if self.expert: 93 | imitation_loss, accuracy = self.imitation_learning( 94 | rollouts.obs[:-1].view(-1, *obs_shape), 95 | rollouts.recurrent_hidden_states[0].view( 96 | -1, self.actor_critic.recurrent_hidden_state_size), 97 | rollouts.masks[:-1].view(-1, 1), 98 | self.expert) 99 | 100 | self.optimizer.zero_grad() 101 | (imitation_loss * self.il_coef + action_loss - 102 | dist_entropy * self.entropy_coef).backward() 103 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 104 | self.max_grad_norm) 105 | self.optimizer.step() 106 | 107 | action_loss_epoch += action_loss.item() 108 | dist_entropy_epoch += dist_entropy.item() 109 | imitation_loss_epoch += imitation_loss.item() 110 | accuracy_epoch += accuracy 111 | 112 | num_updates = self.ppo_epoch * self.num_mini_batch 113 | 114 | action_loss_epoch /= num_updates 115 | dist_entropy_epoch /= num_updates 116 | imitation_loss_epoch /= num_updates 117 | accuracy_epoch /= num_updates 118 | 119 | self.il_coef *= DECAY_RATE 120 | 121 | return { 122 | "value loss": value_loss, 123 | "action loss": action_loss_epoch, 124 | "imitation loss": imitation_loss_epoch, 125 | "accuracy": accuracy_epoch, 126 | "entropy loss": dist_entropy_epoch 127 | } 128 | -------------------------------------------------------------------------------- /core/algorithms/ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | 6 | from .base_algo import BaseAlgo 7 | 8 | 9 | class PPO(BaseAlgo): 10 | def __init__(self, 11 | actor_critic, 12 | clip_param, 13 | ppo_epoch, 14 | num_mini_batch, 15 | value_loss_coef, 16 | entropy_coef, 17 | lr=None, 18 | eps=None, 19 | max_grad_norm=None, 20 | use_clipped_value_loss=True, 21 | expert=None, 22 | il_coef=1): 23 | super().__init__(actor_critic, lr, value_loss_coef, entropy_coef, expert, il_coef) 24 | 25 | self.clip_param = clip_param 26 | self.ppo_epoch = ppo_epoch 27 | self.num_mini_batch = num_mini_batch 28 | 29 | self.max_grad_norm = max_grad_norm 30 | self.use_clipped_value_loss = use_clipped_value_loss 31 | 32 | def update(self, rollouts): 33 | obs_shape = rollouts.obs.size()[2:] 34 | advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] 35 | 
advantages = (advantages - advantages.mean()) / ( 36 | advantages.std() + 1e-5) 37 | 38 | value_loss_epoch = 0 39 | action_loss_epoch = 0 40 | dist_entropy_epoch = 0 41 | imitation_loss_epoch = 0 42 | accuracy_epoch = 0 43 | 44 | for e in range(self.ppo_epoch): 45 | if self.actor_critic.is_recurrent: 46 | data_generator = rollouts.recurrent_generator( 47 | advantages, self.num_mini_batch) 48 | else: 49 | data_generator = rollouts.feed_forward_generator( 50 | advantages, self.num_mini_batch) 51 | 52 | for sample in data_generator: 53 | obs_batch, recurrent_hidden_states_batch, actions_batch, \ 54 | value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, \ 55 | adv_targ = sample 56 | 57 | # Reshape to do in a single forward pass for all steps 58 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 59 | obs_batch, recurrent_hidden_states_batch, masks_batch, 60 | actions_batch) 61 | 62 | ratio = torch.exp(action_log_probs - 63 | old_action_log_probs_batch) 64 | surr1 = ratio * adv_targ 65 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 66 | 1.0 + self.clip_param) * adv_targ 67 | action_loss = -torch.min(surr1, surr2).mean() 68 | 69 | if self.use_clipped_value_loss: 70 | value_pred_clipped = value_preds_batch + \ 71 | (values - value_preds_batch).clamp(-self.clip_param, 72 | self.clip_param) 73 | value_losses = (values - return_batch).pow(2) 74 | value_losses_clipped = ( 75 | value_pred_clipped - return_batch).pow(2) 76 | value_loss = 0.5 * torch.max(value_losses, 77 | value_losses_clipped).mean() 78 | else: 79 | value_loss = 0.5 * (return_batch - values).pow(2).mean() 80 | 81 | # imitation learning 82 | imitation_loss, accuracy = torch.tensor( 83 | 0).to(action_loss.device), 0 84 | if self.expert: 85 | imitation_loss, accuracy = self.imitation_learning( 86 | rollouts.obs[:-1].view(-1, *obs_shape), 87 | rollouts.recurrent_hidden_states[0].view( 88 | -1, self.actor_critic.recurrent_hidden_state_size), 89 | rollouts.masks[:-1].view(-1, 1), 90 | self.expert) 91 | 92 | self.optimizer.zero_grad() 93 | (imitation_loss * self.il_coef * self.value_coef + action_loss - 94 | dist_entropy * self.entropy_coef).backward() 95 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 96 | self.max_grad_norm) 97 | self.optimizer.step() 98 | 99 | value_loss_epoch += value_loss.item() 100 | action_loss_epoch += action_loss.item() 101 | dist_entropy_epoch += dist_entropy.item() 102 | imitation_loss_epoch += imitation_loss.item() 103 | accuracy_epoch += accuracy 104 | 105 | num_updates = self.ppo_epoch * self.num_mini_batch 106 | 107 | value_loss_epoch /= num_updates 108 | action_loss_epoch /= num_updates 109 | dist_entropy_epoch /= num_updates 110 | imitation_loss_epoch /= num_updates 111 | accuracy_epoch /= num_updates 112 | 113 | self.after_update() 114 | 115 | return { 116 | "value loss": value_loss_epoch, 117 | "action loss": action_loss_epoch, 118 | "entropy loss": dist_entropy_epoch, 119 | "imitation loss": imitation_loss_epoch, 120 | "accuracy": accuracy_epoch 121 | } 122 | -------------------------------------------------------------------------------- /core/algorithms/kfac.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | 8 | from core.utils import AddBias 9 | 10 | # TODO: In order to make this code faster: 11 | # 1) Implement _extract_patches as a single cuda kernel 12 | # 2) Compute 
QR decomposition in a separate process 13 | # 3) Actually make a general KFAC optimizer so it fits PyTorch 14 | 15 | 16 | def _extract_patches(x, kernel_size, stride, padding): 17 | if padding[0] + padding[1] > 0: 18 | x = F.pad(x, (padding[1], padding[1], padding[0], 19 | padding[0])).data # Actually check dims 20 | x = x.unfold(2, kernel_size[0], stride[0]) 21 | x = x.unfold(3, kernel_size[1], stride[1]) 22 | x = x.transpose_(1, 2).transpose_(2, 3).contiguous() 23 | x = x.view( 24 | x.size(0), x.size(1), x.size(2), 25 | x.size(3) * x.size(4) * x.size(5)) 26 | return x 27 | 28 | 29 | def compute_cov_a(a, classname, layer_info, fast_cnn): 30 | batch_size = a.size(0) 31 | 32 | if classname == 'Conv2d': 33 | if fast_cnn: 34 | a = _extract_patches(a, *layer_info) 35 | a = a.view(a.size(0), -1, a.size(-1)) 36 | a = a.mean(1) 37 | else: 38 | a = _extract_patches(a, *layer_info) 39 | a = a.view(-1, a.size(-1)).div_(a.size(1)).div_(a.size(2)) 40 | elif classname == 'AddBias': 41 | is_cuda = a.is_cuda 42 | a = torch.ones(a.size(0), 1) 43 | if is_cuda: 44 | a = a.cuda() 45 | 46 | return a.t() @ (a / batch_size) 47 | 48 | 49 | def compute_cov_g(g, classname, layer_info, fast_cnn): 50 | batch_size = g.size(0) 51 | 52 | if classname == 'Conv2d': 53 | if fast_cnn: 54 | g = g.view(g.size(0), g.size(1), -1) 55 | g = g.sum(-1) 56 | else: 57 | g = g.transpose(1, 2).transpose(2, 3).contiguous() 58 | g = g.view(-1, g.size(-1)).mul_(g.size(1)).mul_(g.size(2)) 59 | elif classname == 'AddBias': 60 | g = g.view(g.size(0), g.size(1), -1) 61 | g = g.sum(-1) 62 | 63 | g_ = g * batch_size 64 | return g_.t() @ (g_ / g.size(0)) 65 | 66 | 67 | def update_running_stat(aa, m_aa, momentum): 68 | # Do the trick to keep aa unchanged and not create any additional tensors 69 | m_aa *= momentum / (1 - momentum) 70 | m_aa += aa 71 | m_aa *= (1 - momentum) 72 | 73 | 74 | class SplitBias(nn.Module): 75 | def __init__(self, module): 76 | super(SplitBias, self).__init__() 77 | self.module = module 78 | self.add_bias = AddBias(module.bias.data) 79 | self.module.bias = None 80 | 81 | def forward(self, input): 82 | x = self.module(input) 83 | x = self.add_bias(x) 84 | return x 85 | 86 | 87 | class KFACOptimizer(optim.Optimizer): 88 | def __init__(self, 89 | model, 90 | lr=0.25, 91 | momentum=0.9, 92 | stat_decay=0.99, 93 | kl_clip=0.001, 94 | damping=1e-2, 95 | weight_decay=0, 96 | fast_cnn=False, 97 | Ts=1, 98 | Tf=10): 99 | defaults = dict() 100 | 101 | def split_bias(module): 102 | for mname, child in module.named_children(): 103 | if hasattr(child, 'bias') and child.bias is not None: 104 | module._modules[mname] = SplitBias(child) 105 | else: 106 | split_bias(child) 107 | 108 | split_bias(model) 109 | 110 | super(KFACOptimizer, self).__init__(model.parameters(), defaults) 111 | 112 | self.known_modules = {'Linear', 'Conv2d', 'AddBias'} 113 | 114 | self.modules = [] 115 | self.grad_outputs = {} 116 | 117 | self.model = model 118 | self._prepare_model() 119 | 120 | self.steps = 0 121 | 122 | self.m_aa, self.m_gg = {}, {} 123 | self.Q_a, self.Q_g = {}, {} 124 | self.d_a, self.d_g = {}, {} 125 | 126 | self.momentum = momentum 127 | self.stat_decay = stat_decay 128 | 129 | self.lr = lr 130 | self.kl_clip = kl_clip 131 | self.damping = damping 132 | self.weight_decay = weight_decay 133 | 134 | self.fast_cnn = fast_cnn 135 | 136 | self.Ts = Ts 137 | self.Tf = Tf 138 | 139 | self.optim = optim.SGD( 140 | model.parameters(), 141 | lr=self.lr * (1 - self.momentum), 142 | momentum=self.momentum) 143 | 144 | def _save_input(self, module, 
input): 145 | if torch.is_grad_enabled() and self.steps % self.Ts == 0: 146 | classname = module.__class__.__name__ 147 | layer_info = None 148 | if classname == 'Conv2d': 149 | layer_info = (module.kernel_size, module.stride, 150 | module.padding) 151 | 152 | aa = compute_cov_a(input[0].data, classname, layer_info, 153 | self.fast_cnn) 154 | 155 | # Initialize buffers 156 | if self.steps == 0: 157 | self.m_aa[module] = aa.clone() 158 | 159 | update_running_stat(aa, self.m_aa[module], self.stat_decay) 160 | 161 | def _save_grad_output(self, module, grad_input, grad_output): 162 | # Accumulate statistics for Fisher matrices 163 | if self.acc_stats: 164 | classname = module.__class__.__name__ 165 | layer_info = None 166 | if classname == 'Conv2d': 167 | layer_info = (module.kernel_size, module.stride, 168 | module.padding) 169 | 170 | gg = compute_cov_g(grad_output[0].data, classname, layer_info, 171 | self.fast_cnn) 172 | 173 | # Initialize buffers 174 | if self.steps == 0: 175 | self.m_gg[module] = gg.clone() 176 | 177 | update_running_stat(gg, self.m_gg[module], self.stat_decay) 178 | 179 | def _prepare_model(self): 180 | for module in self.model.modules(): 181 | classname = module.__class__.__name__ 182 | if classname in self.known_modules: 183 | assert not ((classname in ['Linear', 'Conv2d']) and module.bias is not None), \ 184 | "You must have a bias as a separate layer" 185 | 186 | self.modules.append(module) 187 | module.register_forward_pre_hook(self._save_input) 188 | module.register_backward_hook(self._save_grad_output) 189 | 190 | def step(self): 191 | # Add weight decay 192 | if self.weight_decay > 0: 193 | for p in self.model.parameters(): 194 | p.grad.data.add_(self.weight_decay, p.data) 195 | 196 | updates = {} 197 | for i, m in enumerate(self.modules): 198 | assert len(list(m.parameters()) 199 | ) == 1, "Can handle only one parameter at the moment" 200 | classname = m.__class__.__name__ 201 | p = next(m.parameters()) 202 | 203 | la = self.damping + self.weight_decay 204 | 205 | if self.steps % self.Tf == 0: 206 | # My asynchronous implementation exists, I will add it later. 207 | # Experimenting with different ways to this in PyTorch. 
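                # Eigendecompose the running Kronecker factors: A (activation covariance,
                # m_aa) and G (output-gradient covariance, m_gg). The update below then
                # computes V = Q_g^T dW Q_a, divides elementwise by (d_g d_a^T + damping),
                # and maps back with Q_g V Q_a^T, i.e. an approximate (G kron A)^{-1}
                # natural-gradient step without ever forming the full Fisher matrix.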
208 | self.d_a[m], self.Q_a[m] = torch.symeig( 209 | self.m_aa[m], eigenvectors=True) 210 | self.d_g[m], self.Q_g[m] = torch.symeig( 211 | self.m_gg[m], eigenvectors=True) 212 | 213 | self.d_a[m].mul_((self.d_a[m] > 1e-6).float()) 214 | self.d_g[m].mul_((self.d_g[m] > 1e-6).float()) 215 | 216 | if classname == 'Conv2d': 217 | p_grad_mat = p.grad.data.view(p.grad.data.size(0), -1) 218 | else: 219 | p_grad_mat = p.grad.data 220 | 221 | v1 = self.Q_g[m].t() @ p_grad_mat @ self.Q_a[m] 222 | v2 = v1 / ( 223 | self.d_g[m].unsqueeze(1) * self.d_a[m].unsqueeze(0) + la) 224 | v = self.Q_g[m] @ v2 @ self.Q_a[m].t() 225 | 226 | v = v.view(p.grad.data.size()) 227 | updates[p] = v 228 | 229 | vg_sum = 0 230 | for p in self.model.parameters(): 231 | v = updates[p] 232 | vg_sum += (v * p.grad.data * self.lr * self.lr).sum() 233 | 234 | nu = min(1, math.sqrt(self.kl_clip / vg_sum)) 235 | 236 | for p in self.model.parameters(): 237 | v = updates[p] 238 | p.grad.data.copy_(v) 239 | p.grad.data.mul_(nu) 240 | 241 | self.optim.step() 242 | self.steps += 1 -------------------------------------------------------------------------------- /core/algorithms/input_dependent_baseline/base_meta_critic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | 8 | from torch.nn import L1Loss, MSELoss 9 | from itertools import chain 10 | 11 | 12 | class ActorMetaCriticAlgo: 13 | """ 14 | Base class for algorithm (A2C, PPO, etc) which supports adapt meta critic to new \ 15 | input sequences 16 | """ 17 | 18 | def __init__(self, 19 | actor_critic, 20 | lr=7e-4, 21 | adapt_lr=1e-3, 22 | num_inner_steps=5, 23 | adapt_criterion=MSELoss): 24 | self.actor_critic = actor_critic 25 | self.lr = lr 26 | self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=self.lr) 27 | 28 | # meta critic args 29 | self.adapt_lr = adapt_lr 30 | self.num_inner_steps = num_inner_steps 31 | self.adapt_criterion = adapt_criterion() 32 | 33 | # imitation learning 34 | self.il_criterion = nn.CrossEntropyLoss() 35 | 36 | def adapt_and_predict(self, task_inputs, task_labels, meta_inputs, meta_labels): 37 | """ 38 | Adapt the meta critic to new input-sequence and predict the values of new observation \ 39 | (with same input sequence) 40 | For simplicity, we adopt the **First-order MAML** as in https://arxiv.org/abs/1803.02999 \ 41 | 42 | :param task_inputs: tuple of (obs, rnn_hxs, masks) - training inputs of roll out with same input sequence 43 | 44 | :param task_labels: array of shape (num_steps, num_envs, 1) - the Monte Carlo approximation of values 45 | 46 | :param meta_inputs: tuple of (obs, rnn_hxs, masks) - testing inputs of roll out with same input sequence as training 47 | 48 | :param meta_labels: array of shape (num_steps, num_envs, 1) - the Monte Carlo approximation of values 49 | 50 | :return: value prediction of meta_inputs and meta gradient of critic 51 | """ 52 | # create new net and exclusively update this network 53 | fast_net = copy.deepcopy(self.actor_critic.base) 54 | task_optimizer = optim.Adam( 55 | fast_net.parameters(), lr=self.lr) 56 | 57 | task_obs, task_rnn_hxs, task_masks = task_inputs 58 | 59 | # Adapt to new task 60 | for _ in range(self.num_inner_steps): 61 | task_preds, _, _ = fast_net(task_obs, task_rnn_hxs, task_masks) 62 | task_loss = self.adapt_criterion(task_preds, task_labels) 63 | 64 | # update the fast-adapted network 65 | task_optimizer.zero_grad() 66 | 
task_loss.backward() 67 | task_optimizer.step() 68 | 69 | # compute meta grad 70 | meta_obs, meta_rnn_hxs, meta_masks = meta_inputs 71 | meta_preds, _, _ = fast_net(meta_obs, meta_rnn_hxs, meta_masks) 72 | meta_loss = self.adapt_criterion(meta_preds, meta_labels) 73 | grads = torch.autograd.grad( 74 | meta_loss, fast_net.parameters(), allow_unused=True) 75 | 76 | # create dictionary contains gradient of meta critic 77 | meta_grads = {name: g if g is not None else torch.zeros_like(weight) 78 | for ((name, weight), g) 79 | in zip(fast_net.named_parameters(), 80 | grads)} 81 | 82 | return meta_preds, meta_grads 83 | 84 | def train_meta_critic_and_predict_values(self, rollouts): 85 | """ 86 | Train the meta critic with rollout experience and return the predicted values \ 87 | The adapted algorithm is described in Algorithm 1 of the paper \ 88 | https://arxiv.org/abs/1807.02264. We split the rollout into 2 half, \ 89 | the critic parameters adapting to the first half will give the prediction \ 90 | for second half. The critic parameters adapting to the second half will \ 91 | give the prediction for first half. 92 | 93 | :param rollouts: RolloutStorage's instance 94 | 95 | :return: input-dependent values 96 | """ 97 | obs_shape = rollouts.obs.size()[2:] 98 | _, num_processes, _ = rollouts.rewards.size() 99 | 100 | # prepare input and output of meta learner 101 | # ie splitting them into 2 102 | task_pt = int(num_processes/2) 103 | # first half rollouts 104 | # num_steps * num_processes * input_shape 105 | first_obs = rollouts.obs[:-1, :task_pt, ...].reshape(-1, *obs_shape) 106 | first_rnn_hxs = rollouts.recurrent_hidden_states[0, :task_pt].reshape( 107 | -1, self.actor_critic.recurrent_hidden_state_size) 108 | first_mask = rollouts.masks[:-1, :task_pt].reshape(-1, 1) 109 | first_inputs = (first_obs, first_rnn_hxs, first_mask) 110 | first_labels = rollouts.returns[:-1, :task_pt, ...].reshape(-1, 1) 111 | # second half rollouts 112 | # num_steps * num_processes * input_shape 113 | second_obs = rollouts.obs[:-1, task_pt:, ...].reshape(-1, *obs_shape) 114 | second_rnn_hxs = rollouts.recurrent_hidden_states[0, task_pt:].reshape( 115 | -1, self.actor_critic.recurrent_hidden_state_size) 116 | second_mask = rollouts.masks[:-1, task_pt:].reshape(-1, 1) 117 | second_inputs = (second_obs, second_rnn_hxs, second_mask) 118 | second_labels = rollouts.returns[:-1, task_pt:, ...].reshape(-1, 1) 119 | 120 | # train meta network 121 | # the actor critic object must be instance of MetaCritic class 122 | second_values, second_meta_grads = self.adapt_and_predict( 123 | first_inputs, first_labels, second_inputs, second_labels) 124 | first_values, first_meta_grads = self.adapt_and_predict( 125 | second_inputs, second_labels, first_inputs, first_labels) 126 | values = torch.cat((first_values, second_values), dim=0) 127 | 128 | # update the meta critic 129 | self.update_meta_grads( 130 | [first_meta_grads, second_meta_grads], first_inputs, first_labels) 131 | 132 | # compute value loss 133 | value_loss = self.adapt_criterion( 134 | values, rollouts.returns[:-1].view(-1, 1)) 135 | 136 | return values, value_loss.item() 137 | 138 | def update_meta_grads(self, grads, dummy_inputs, dummy_labels): 139 | """ 140 | Set the gradient values from grads (dict) to actor_critic parameters and update 141 | 142 | :param grads: list of OrderedDict - each element is the gradient from a task 143 | 144 | :param dummy_inputs: dummy inputs to activate the gradient of meta network 145 | 146 | :param dummy_labels: dummy labels to activate 
the gradient of meta network 147 | """ 148 | keys = grads[0].keys() 149 | # multiple loss with value_loss_coef equivalent to multiple this coef with grad 150 | gradients = {k: sum(grad[k] for grad in grads) for k in keys} 151 | 152 | # compute dummy loss 153 | value_pred, _, _ = self.actor_critic.base(*dummy_inputs) 154 | loss = self.adapt_criterion(value_pred, dummy_labels) 155 | 156 | hooks = [] 157 | for (k, v) in self.actor_critic.base.named_parameters(): 158 | def get_closure(): 159 | key = k 160 | 161 | def replace_grad(grad): 162 | return gradients[key] 163 | return replace_grad 164 | hooks.append(v.register_hook(get_closure())) 165 | 166 | # compute grad for curr step 167 | self.optimizer.zero_grad() 168 | loss.backward() 169 | # nn.utils.clip_grad_norm_(self.actor_critic.base.critic.parameters(), self.max_grad_norm) 170 | self.optimizer.step() 171 | 172 | for h in hooks: 173 | h.remove() 174 | 175 | def imitation_learning(self, inputs, rnn_hxs, masks, expert): 176 | """ 177 | Imitation learning loss 178 | 179 | :param inputs: state observations 180 | 181 | :param rnn_hxs: rnn hidden state 182 | 183 | :param masks: mask the final state with 0 value 184 | 185 | :param expert: a trained or heuristic agent 186 | 187 | :return: log probability of expert's actions 188 | """ 189 | _, actor_features, _ = self.actor_critic.base(inputs, rnn_hxs, masks) 190 | dist = self.actor_critic.dist(actor_features) 191 | 192 | expert_actions = expert.act(inputs) 193 | 194 | il_loss = self.il_criterion(dist.probs, expert_actions.reshape(-1)) 195 | accuracy = (torch.argmax(dist.probs, dim=1) == 196 | expert_actions.reshape(-1)).float().sum()/expert_actions.shape[0] 197 | 198 | return il_loss, accuracy 199 | -------------------------------------------------------------------------------- /core/storage/base_storage.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 3 | 4 | 5 | def _flatten_helper(T, N, _tensor): 6 | return _tensor.view(T * N, *_tensor.size()[2:]) 7 | 8 | 9 | class RolloutStorage(object): 10 | def __init__(self, num_steps, num_processes, obs_shape, action_space, 11 | recurrent_hidden_state_size): 12 | self.obs = torch.zeros(num_steps + 1, num_processes, *obs_shape) 13 | self.recurrent_hidden_states = torch.zeros( 14 | num_steps + 1, num_processes, recurrent_hidden_state_size) 15 | self.rewards = torch.zeros(num_steps, num_processes, 1) 16 | self.value_preds = torch.zeros(num_steps + 1, num_processes, 1) 17 | self.returns = torch.zeros(num_steps + 1, num_processes, 1) 18 | self.action_log_probs = torch.zeros(num_steps, num_processes, 1) 19 | if action_space.__class__.__name__ == 'Discrete': 20 | action_shape = 1 21 | else: 22 | action_shape = action_space.shape[0] 23 | self.actions = torch.zeros(num_steps, num_processes, action_shape) 24 | if action_space.__class__.__name__ == 'Discrete': 25 | self.actions = self.actions.long() 26 | self.masks = torch.ones(num_steps + 1, num_processes, 1) 27 | 28 | # Masks that indicate whether it's a true terminal state 29 | # or time limit end state 30 | self.bad_masks = torch.ones(num_steps + 1, num_processes, 1) 31 | 32 | self.num_steps = num_steps 33 | self.step = 0 34 | 35 | def to(self, device): 36 | self.obs = self.obs.to(device) 37 | self.recurrent_hidden_states = self.recurrent_hidden_states.to(device) 38 | self.rewards = self.rewards.to(device) 39 | self.value_preds = self.value_preds.to(device) 40 | self.returns = 
self.returns.to(device) 41 | self.action_log_probs = self.action_log_probs.to(device) 42 | self.actions = self.actions.to(device) 43 | self.masks = self.masks.to(device) 44 | self.bad_masks = self.bad_masks.to(device) 45 | 46 | def insert(self, obs, recurrent_hidden_states, actions, action_log_probs, 47 | value_preds, rewards, masks, bad_masks): 48 | self.obs[self.step + 1].copy_(obs) 49 | self.recurrent_hidden_states[self.step + 50 | 1].copy_(recurrent_hidden_states) 51 | self.actions[self.step].copy_(actions) 52 | self.action_log_probs[self.step].copy_(action_log_probs) 53 | self.value_preds[self.step].copy_(value_preds) 54 | self.rewards[self.step].copy_(rewards) 55 | self.masks[self.step + 1].copy_(masks) 56 | self.bad_masks[self.step + 1].copy_(bad_masks) 57 | 58 | self.step = (self.step + 1) % self.num_steps 59 | 60 | def after_update(self): 61 | self.obs[0].copy_(self.obs[-1]) 62 | self.recurrent_hidden_states[0].copy_(self.recurrent_hidden_states[-1]) 63 | self.masks[0].copy_(self.masks[-1]) 64 | self.bad_masks[0].copy_(self.bad_masks[-1]) 65 | 66 | def compute_returns(self, 67 | next_value, 68 | use_gae, 69 | gamma, 70 | gae_lambda, 71 | use_proper_time_limits=True): 72 | if use_proper_time_limits: 73 | if use_gae: 74 | self.value_preds[-1] = next_value 75 | gae = 0 76 | for step in reversed(range(self.rewards.size(0))): 77 | delta = self.rewards[step] + gamma * self.value_preds[ 78 | step + 1] * self.masks[step + 79 | 1] - self.value_preds[step] 80 | gae = delta + gamma * gae_lambda * self.masks[step + 81 | 1] * gae 82 | gae = gae * self.bad_masks[step + 1] 83 | self.returns[step] = gae + self.value_preds[step] 84 | else: 85 | self.returns[-1] = next_value 86 | for step in reversed(range(self.rewards.size(0))): 87 | self.returns[step] = (self.returns[step + 1] * 88 | gamma * self.masks[step + 1] + self.rewards[step]) * self.bad_masks[step + 1] \ 89 | + (1 - self.bad_masks[step + 1] 90 | ) * self.value_preds[step] 91 | else: 92 | if use_gae: 93 | self.value_preds[-1] = next_value 94 | gae = 0 95 | for step in reversed(range(self.rewards.size(0))): 96 | delta = self.rewards[step] + gamma * self.value_preds[ 97 | step + 1] * self.masks[step + 98 | 1] - self.value_preds[step] 99 | gae = delta + gamma * gae_lambda * self.masks[step + 100 | 1] * gae 101 | self.returns[step] = gae + self.value_preds[step] 102 | else: 103 | self.returns[-1] = next_value 104 | for step in reversed(range(self.rewards.size(0))): 105 | self.returns[step] = self.returns[step + 1] * \ 106 | gamma * self.masks[step + 1] + self.rewards[step] 107 | 108 | def feed_forward_generator(self, 109 | advantages, 110 | num_mini_batch=None, 111 | mini_batch_size=None): 112 | num_steps, num_processes = self.rewards.size()[0:2] 113 | batch_size = num_processes * num_steps 114 | 115 | if mini_batch_size is None: 116 | assert batch_size >= num_mini_batch, ( 117 | "PPO requires the number of processes ({}) " 118 | "* number of steps ({}) = {} " 119 | "to be greater than or equal to the number of PPO mini batches ({})." 
120 | "".format(num_processes, num_steps, num_processes * num_steps, 121 | num_mini_batch)) 122 | mini_batch_size = batch_size // num_mini_batch 123 | sampler = BatchSampler( 124 | SubsetRandomSampler(range(batch_size)), 125 | mini_batch_size, 126 | drop_last=True) 127 | for indices in sampler: 128 | obs_batch = self.obs[:-1].view(-1, *self.obs.size()[2:])[indices] 129 | recurrent_hidden_states_batch = self.recurrent_hidden_states[:-1].view( 130 | -1, self.recurrent_hidden_states.size(-1))[indices] 131 | actions_batch = self.actions.view(-1, 132 | self.actions.size(-1))[indices] 133 | value_preds_batch = self.value_preds[:-1].view(-1, 1)[indices] 134 | return_batch = self.returns[:-1].view(-1, 1)[indices] 135 | masks_batch = self.masks[:-1].view(-1, 1)[indices] 136 | old_action_log_probs_batch = self.action_log_probs.view(-1, 137 | 1)[indices] 138 | if advantages is None: 139 | adv_targ = None 140 | else: 141 | adv_targ = advantages.reshape(-1, 1)[indices] 142 | 143 | yield obs_batch, recurrent_hidden_states_batch, actions_batch, \ 144 | value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, adv_targ 145 | 146 | def recurrent_generator(self, advantages, num_mini_batch): 147 | num_processes = self.rewards.size(1) 148 | assert num_processes >= num_mini_batch, ( 149 | "PPO requires the number of processes ({}) " 150 | "to be greater than or equal to the number of " 151 | "PPO mini batches ({}).".format(num_processes, num_mini_batch)) 152 | num_envs_per_batch = num_processes // num_mini_batch 153 | perm = torch.randperm(num_processes) 154 | for start_ind in range(0, num_processes, num_envs_per_batch): 155 | obs_batch = [] 156 | recurrent_hidden_states_batch = [] 157 | actions_batch = [] 158 | value_preds_batch = [] 159 | return_batch = [] 160 | masks_batch = [] 161 | old_action_log_probs_batch = [] 162 | adv_targ = [] 163 | 164 | for offset in range(num_envs_per_batch): 165 | ind = perm[start_ind + offset] 166 | obs_batch.append(self.obs[:-1, ind]) 167 | recurrent_hidden_states_batch.append( 168 | self.recurrent_hidden_states[0:1, ind]) 169 | actions_batch.append(self.actions[:, ind]) 170 | value_preds_batch.append(self.value_preds[:-1, ind]) 171 | return_batch.append(self.returns[:-1, ind]) 172 | masks_batch.append(self.masks[:-1, ind]) 173 | old_action_log_probs_batch.append( 174 | self.action_log_probs[:, ind]) 175 | adv_targ.append(advantages[:, ind]) 176 | 177 | T, N = self.num_steps, num_envs_per_batch 178 | # These are all tensors of size (T, N, -1) 179 | obs_batch = torch.stack(obs_batch, 1) 180 | actions_batch = torch.stack(actions_batch, 1) 181 | value_preds_batch = torch.stack(value_preds_batch, 1) 182 | return_batch = torch.stack(return_batch, 1) 183 | masks_batch = torch.stack(masks_batch, 1) 184 | old_action_log_probs_batch = torch.stack( 185 | old_action_log_probs_batch, 1) 186 | adv_targ = torch.stack(adv_targ, 1) 187 | 188 | # States is just a (N, -1) tensor 189 | recurrent_hidden_states_batch = torch.stack( 190 | recurrent_hidden_states_batch, 1).view(N, -1) 191 | 192 | # Flatten the (T, N, ...) tensors to (T * N, ...) 
193 | obs_batch = _flatten_helper(T, N, obs_batch) 194 | actions_batch = _flatten_helper(T, N, actions_batch) 195 | value_preds_batch = _flatten_helper(T, N, value_preds_batch) 196 | return_batch = _flatten_helper(T, N, return_batch) 197 | masks_batch = _flatten_helper(T, N, masks_batch) 198 | old_action_log_probs_batch = _flatten_helper(T, N, 199 | old_action_log_probs_batch) 200 | adv_targ = _flatten_helper(T, N, adv_targ) 201 | 202 | yield obs_batch, recurrent_hidden_states_batch, actions_batch, \ 203 | value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, adv_targ 204 | -------------------------------------------------------------------------------- /core/envs/park_envs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Make environments of Park Platform 3 | """ 4 | import torch 5 | import numpy as np 6 | import os 7 | import park 8 | import gym 9 | import random 10 | 11 | from park.spaces.box import Box 12 | from baselines import bench, logger 13 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 14 | from baselines.common.vec_env import VecEnvWrapper 15 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 16 | from baselines.common.vec_env.shmem_vec_env import ShmemVecEnv 17 | from baselines.common.wrappers import TimeLimit 18 | from baselines.common.vec_env.vec_normalize import \ 19 | VecNormalize as VecNormalize_ 20 | 21 | from .load_balance_wrappers import ProcessLoadBalanceObservation, \ 22 | LoadBalanceRandomReset, RewardNormalize, FixJobSequence 23 | 24 | 25 | PARK_ENV_LIST = ['spark', 'spark_sim', 26 | 'load_balance'] 27 | 28 | 29 | def make_env(env_id, 30 | seed, 31 | rank, 32 | log_dir, 33 | allow_early_resets, 34 | max_episode_steps=None, 35 | args=None, 36 | train=True): 37 | def _thunk(): 38 | if env_id not in PARK_ENV_LIST: 39 | raise ValueError("Unsupported environment, expect the environment to be one of " 40 | + str(PARK_ENV_LIST)+" but got: "+str(env_id)) 41 | elif env_id == 'load_balance': 42 | # arrange the number of stream jobs 43 | env = park.make(env_id, 44 | num_stream_jobs=args.num_stream_jobs, 45 | service_rates=args.load_balance_service_rates) 46 | 47 | # random act after resetting to diversify the state 48 | # only use when training 49 | if train: 50 | env = LoadBalanceRandomReset( 51 | env, args.max_random_init_steps) 52 | 53 | # if using load balance, clip and normalize the observation with this wrapper 54 | if args is not None: 55 | env = ProcessLoadBalanceObservation(env, 56 | args.job_size_norm_factor, 57 | args.server_load_norm_factor, 58 | args.highest_server_obs, 59 | args.highest_job_obs, 60 | args.elapsed_time_norm_factor, 61 | args.highest_elapsed_time 62 | ) 63 | # normalize reward 64 | env = RewardNormalize(env, args.reward_norm_factor) 65 | 66 | if args.fix_job_sequence: 67 | # fix job sequence 68 | env = FixJobSequence(env, seed) 69 | 70 | if max_episode_steps: 71 | env = TimeLimit(env, max_episode_steps) 72 | # adding information to env for computing return 73 | env = TimeLimitMask(env) 74 | 75 | # IMPORTANT: all environments used same random seed to repeat the input-process 76 | if train and args.algo.startswith('mib'): 77 | env.seed(seed) 78 | else: 79 | env.seed(seed + rank) 80 | 81 | if log_dir is not None: 82 | env = bench.Monitor( 83 | env, 84 | os.path.join(log_dir, str(rank)), 85 | allow_early_resets=allow_early_resets) 86 | 87 | return env 88 | 89 | return _thunk 90 | 91 | 92 | def make_vec_envs(env_name, 93 | seed, 94 | num_processes, 
95 | log_dir, 96 | device, 97 | allow_early_resets, 98 | max_episode_steps=None, 99 | args=None, 100 | train=True): 101 | """ 102 | Make vectorized environments 103 | :param env_name: str - name of environment 104 | :param seed: int - random seed of environment 105 | :num_process: int - number of parallel environment 106 | :param log_dir: str - path to log directory 107 | :param device: str - 'cuda' or 'cpu' 108 | :param allow_early_reset: bool - if apply TimeLimitMask on environments, set this param to True 109 | :param max_episode_steps: int - maximum number of action in 1 episode 110 | :param args: ArgsParser - use to specifiy environment args 111 | :param train: bool - determine if we are using created to train or evaluate 112 | if we're training, all environment share same random seed to repeat input sequence 113 | otherwise, we diversify the random seed 114 | """ 115 | envs = [ 116 | make_env(env_id=env_name, seed=seed, rank=i, log_dir=log_dir, 117 | allow_early_resets=allow_early_resets, 118 | max_episode_steps=max_episode_steps, args=args, train=train) 119 | for i in range(num_processes) 120 | ] 121 | 122 | if len(envs) > 1: 123 | envs = ShmemVecEnv(envs, context='fork') 124 | else: 125 | envs = DummyVecEnv(envs) 126 | 127 | envs = VecPyTorch(envs, device) 128 | 129 | return envs 130 | 131 | 132 | def load_balance_states_to_inputs(states): 133 | """ 134 | Transform states of LoadBalance Env to inputs sequences 135 | :param states: torch.Tensor of shape T x N_processes x (Num_servers + 2) 136 | :return: torch.Tensor of shape T x N_processes x 2 137 | """ 138 | return states[:, :, -2:] 139 | 140 | 141 | # Checks whether done was caused my timit limits or not 142 | class TimeLimitMask(gym.Wrapper): 143 | def step(self, action): 144 | obs, rew, done, info = self.env.step(action) 145 | if done and self.env._max_episode_steps == self.env._elapsed_steps: 146 | info['bad_transition'] = True 147 | 148 | return obs, rew, done, info 149 | 150 | def reset(self, **kwargs): 151 | return self.env.reset(**kwargs) 152 | 153 | 154 | # Can be used to test recurrent policies for Reacher-v2 155 | class MaskGoal(gym.ObservationWrapper): 156 | def observation(self, observation): 157 | if self.env._elapsed_steps > 0: 158 | observation[-2:] = 0 159 | return observation 160 | 161 | 162 | class TransposeObs(gym.ObservationWrapper): 163 | def __init__(self, env=None): 164 | """ 165 | Transpose observation space (base class) 166 | """ 167 | super(TransposeObs, self).__init__(env) 168 | 169 | 170 | class TransposeImage(TransposeObs): 171 | def __init__(self, env=None, op=[2, 0, 1]): 172 | """ 173 | Transpose observation space for images 174 | """ 175 | super(TransposeImage, self).__init__(env) 176 | assert len(op) == 3, "Error: Operation, " + str(op) + ", must be dim3" 177 | self.op = op 178 | obs_shape = self.observation_space.shape 179 | self.observation_space = Box( 180 | self.observation_space.low[0, 0, 0], 181 | self.observation_space.high[0, 0, 0], [ 182 | obs_shape[self.op[0]], obs_shape[self.op[1]], 183 | obs_shape[self.op[2]] 184 | ], 185 | dtype=self.observation_space.dtype) 186 | 187 | def observation(self, ob): 188 | return ob.transpose(self.op[0], self.op[1], self.op[2]) 189 | 190 | 191 | class VecPyTorch(VecEnvWrapper): 192 | def __init__(self, venv, device): 193 | """Return only every `skip`-th frame""" 194 | super(VecPyTorch, self).__init__(venv) 195 | self.device = device 196 | # TODO: Fix data types 197 | 198 | def reset(self): 199 | obs = self.venv.reset() 200 | obs = 
torch.from_numpy(obs).float().to(self.device) 201 | return obs 202 | 203 | def step_async(self, actions): 204 | if isinstance(actions, torch.LongTensor): 205 | # Squeeze the dimension for discrete actions 206 | actions = actions.squeeze(1) 207 | actions = actions.cpu().numpy() 208 | self.venv.step_async(actions) 209 | 210 | def step_wait(self): 211 | obs, reward, done, info = self.venv.step_wait() 212 | obs = torch.from_numpy(obs).float().to(self.device) 213 | reward = torch.from_numpy(reward).unsqueeze(dim=1).float() 214 | return obs, reward, done, info 215 | 216 | 217 | class VecNormalize(VecNormalize_): 218 | def __init__(self, *args, **kwargs): 219 | super(VecNormalize, self).__init__(*args, **kwargs) 220 | self.training = True 221 | 222 | def _obfilt(self, obs, update=True): 223 | if self.ob_rms: 224 | if self.training and update: 225 | self.ob_rms.update(obs) 226 | obs = np.clip((obs - self.ob_rms.mean) / 227 | np.sqrt(self.ob_rms.var + self.epsilon), 228 | -self.clipob, self.clipob) 229 | return obs 230 | else: 231 | return obs 232 | 233 | def train(self): 234 | self.training = True 235 | 236 | def eval(self): 237 | self.training = False 238 | 239 | 240 | # Derived from 241 | # https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_frame_stack.py 242 | class VecPyTorchFrameStack(VecEnvWrapper): 243 | def __init__(self, venv, nstack, device=None): 244 | self.venv = venv 245 | self.nstack = nstack 246 | 247 | wos = venv.observation_space # wrapped ob space 248 | self.shape_dim0 = wos.shape[0] 249 | 250 | low = np.repeat(wos.low, self.nstack, axis=0) 251 | high = np.repeat(wos.high, self.nstack, axis=0) 252 | 253 | if device is None: 254 | device = torch.device('cpu') 255 | self.stacked_obs = torch.zeros((venv.num_envs, ) + 256 | low.shape).to(device) 257 | 258 | observation_space = gym.spaces.Box( 259 | low=low, high=high, dtype=venv.observation_space.dtype) 260 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 261 | 262 | def step_wait(self): 263 | obs, rews, news, infos = self.venv.step_wait() 264 | self.stacked_obs[:, :-self.shape_dim0] = \ 265 | self.stacked_obs[:, self.shape_dim0:].clone() 266 | for (i, new) in enumerate(news): 267 | if new: 268 | self.stacked_obs[i] = 0 269 | self.stacked_obs[:, -self.shape_dim0:] = obs 270 | return self.stacked_obs, rews, news, infos 271 | 272 | def reset(self): 273 | obs = self.venv.reset() 274 | if torch.backends.cudnn.deterministic: 275 | self.stacked_obs = torch.zeros(self.stacked_obs.shape) 276 | else: 277 | self.stacked_obs.zero_() 278 | self.stacked_obs[:, -self.shape_dim0:] = obs 279 | return self.stacked_obs 280 | 281 | def close(self): 282 | self.venv.close() 283 | -------------------------------------------------------------------------------- /core/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser(description='RL') 8 | parser.add_argument( 9 | '--algo', default='lacie_a2c_memory', help='algorithm to use: a2c | ppo | acktr') 10 | parser.add_argument( 11 | '--lr', type=float, default=7e-4, help='learning rate (default: 7e-4)') 12 | parser.add_argument( 13 | '--cpc-lr', type=float, default=0.001, help='learning rate for contrastive module (default: 1e-3)') 14 | parser.add_argument( 15 | '--critic-lr', type=float, default=1e-3, help='learning rate of critic (default: 1e-3)') 16 | parser.add_argument( 17 | '--actor-lr', type=float, 
default=1e-3, help='learning rate of actor (default: 1e-3)') 18 | parser.add_argument( 19 | '--eps', 20 | type=float, 21 | default=1e-5, 22 | help='RMSprop optimizer epsilon (default: 1e-5)') 23 | parser.add_argument( 24 | '--alpha', 25 | type=float, 26 | default=0.99, 27 | help='RMSprop optimizer apha (default: 0.99)') 28 | parser.add_argument( 29 | '--gamma', 30 | type=float, 31 | default=0.99, 32 | help='discount factor for rewards (default: 0.99)') 33 | parser.add_argument( 34 | '--use-gae', 35 | action='store_true', 36 | default=False, 37 | help='use generalized advantage estimation') 38 | parser.add_argument( 39 | '--gae-lambda', 40 | type=float, 41 | default=0.95, 42 | help='gae lambda parameter (default: 0.95)') 43 | parser.add_argument( 44 | '--entropy-coef', 45 | type=float, 46 | default=0.01, 47 | help='entropy term coefficient (default: 0.01)') 48 | parser.add_argument( 49 | '--value-loss-coef', 50 | type=float, 51 | default=0.5, 52 | help='value loss coefficient (default: 0.5)') 53 | parser.add_argument( 54 | '--regularize-coef', 55 | type=float, 56 | default=0.05, 57 | help='cpc regularize loss coefficient (default: 0.05)') 58 | parser.add_argument( 59 | '--max-grad-norm', 60 | type=float, 61 | default=0.5, 62 | help='max norm of gradients (default: 0.5)') 63 | parser.add_argument( 64 | '--seed', type=int, default=1, help='random seed (default: 1)') 65 | parser.add_argument( 66 | '--cuda-deterministic', 67 | action='store_true', 68 | default=False, 69 | help="sets flags for determinism when using CUDA (potentially slow!)") 70 | parser.add_argument( 71 | '--num-processes', 72 | type=int, 73 | default=16, 74 | help='how many training CPU processes to use (default: 16)') 75 | parser.add_argument( 76 | '--num-steps', 77 | type=int, 78 | default=100, 79 | help='number of forward steps in A2C (default: 100)') 80 | parser.add_argument( 81 | '--ppo-epoch', 82 | type=int, 83 | default=4, 84 | help='number of ppo epochs (default: 4)') 85 | parser.add_argument( 86 | '--num-mini-batch', 87 | type=int, 88 | default=32, 89 | help='number of batches for ppo (default: 32)') 90 | parser.add_argument( 91 | '--clip-param', 92 | type=float, 93 | default=0.2, 94 | help='ppo clip parameter (default: 0.2)') 95 | parser.add_argument( 96 | '--log-interval', 97 | type=int, 98 | default=10, 99 | help='log interval, one log per n updates (default: 10)') 100 | parser.add_argument( 101 | '--save-interval', 102 | type=int, 103 | default=100, 104 | help='save interval, one save per n updates (default: 100)') 105 | parser.add_argument( 106 | '--eval-interval', 107 | type=int, 108 | default=None, 109 | help='eval interval, one eval per n updates (default: None)') 110 | parser.add_argument( 111 | '--num-env-steps', 112 | type=int, 113 | default=10e6, 114 | help='number of environment steps to train (default: 10e6)') 115 | parser.add_argument( 116 | '--env-name', 117 | default='load_balance', 118 | help='environment to train on (default: load_balance)') 119 | parser.add_argument( 120 | '--log-dir', 121 | default='logs', 122 | help='directory to save agent logs (default: logs)') 123 | parser.add_argument( 124 | '--save-dir', 125 | default='./trained_models/', 126 | help='directory to save agent logs (default: ./trained_models/)') 127 | parser.add_argument( 128 | '--resume-dir', 129 | default=None, 130 | type=str, 131 | help='directory to trained agent for resuming (default: None)') 132 | parser.add_argument( 133 | '--no-cuda', 134 | action='store_true', 135 | default=False, 136 | help='disables CUDA training') 
137 | parser.add_argument( 138 | '--use-proper-time-limits', 139 | action='store_true', 140 | default=False, 141 | help='compute returns taking into account time limits') 142 | parser.add_argument( 143 | '--max-episode-steps', 144 | default=1000, 145 | type=int, 146 | help='maximum number of steps per episode of environment (default: 1000)') 147 | parser.add_argument( 148 | '--num-frame-stack', 149 | default=1, 150 | help='number of observation that will be grouped together (default: 4)') 151 | parser.add_argument( 152 | '--recurrent-policy', 153 | action='store_true', 154 | default=False, 155 | help='use a recurrent policy') 156 | parser.add_argument( 157 | '--use-linear-lr-decay', 158 | action='store_true', 159 | default=False, 160 | help='use a linear schedule on the learning rate') 161 | 162 | # IMITATION LEARNING 163 | parser.add_argument( 164 | '--use-imitation-learning', 165 | action='store_true', 166 | default=False, 167 | help='if True then apply imitation learning during training') 168 | parser.add_argument( 169 | '--il-coef', 170 | type=float, 171 | default=10, 172 | help='coefficient of imitation learning (default: 10)') 173 | 174 | # META INPUT-DEPENDENT BASELINE 175 | parser.add_argument( 176 | '--fix-job-sequence', 177 | action='store_true', 178 | default=False, 179 | help='if True then jobs arriving to servers will be fixed for every episode') 180 | parser.add_argument( 181 | '--num-inner-steps', 182 | type=int, 183 | default=4, 184 | help='number of gradient steps for adapting to new input sequences (default: 4)') 185 | parser.add_argument( 186 | '--adapt-lr', 187 | type=float, 188 | default=5e-3, 189 | help='learning rate of innerloop when adapting to new input sequences (default: 2e-3)') 190 | parser.add_argument( 191 | '--use-memory-to-pred-weights', 192 | action='store_true', 193 | default=False, 194 | help='if True then use memory in storage to predict weights of advantages') 195 | 196 | # LACIE 197 | parser.add_argument( 198 | '--lacie-buffer-size', 199 | type=int, 200 | default=400, 201 | help='Size of buffer contains obs, actions for learning hindsight ratio (default: 400)') 202 | parser.add_argument( 203 | '--lacie-batch-size', 204 | type=int, 205 | default=64, 206 | help='Batch size of every update to learn hindsight ratio via contrastive loss (default: 64)') 207 | parser.add_argument( 208 | '--lacie-num-iter', 209 | type=int, 210 | default=10, 211 | help='Number of iterations to learn hindsight ratio each update (default: 10)') 212 | 213 | # LOAD BALANCE ENVIRONMENT 214 | parser.add_argument( 215 | '--load-balance-service-rates', 216 | default=[0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95, 1.05], 217 | nargs='+', 218 | type=float, 219 | help='Service rates of each servers of load balance environment') 220 | parser.add_argument( 221 | '--num-curriculum-time', 222 | default=65, 223 | type=int, 224 | help='number of time we would like to increase the num-stream-jobs in load balance env (default: 65)') 225 | parser.add_argument( 226 | '--num-stream-jobs-factor', 227 | default=1.1, 228 | type=float, 229 | help='exponentially increase the number of stream jobs in environment after some interval (default: 1.1)') 230 | parser.add_argument( 231 | '--job-size-norm-factor', 232 | default=1000, 233 | type=float, 234 | help='normalize factor of job size in load balance env (default: 10)') 235 | parser.add_argument( 236 | '--server-load-norm-factor', 237 | default=5000, 238 | type=float, 239 | help='normalize factor of server load in load balance env (default: 50)') 
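    # Illustrative note (assumed behaviour, not shown in this file): the *-norm-factor flags in
    # this section rescale the raw load-balance observations and rewards before they reach the
    # agent, roughly along the lines of
    #     job_size    /= args.job_size_norm_factor
    #     server_load /= args.server_load_norm_factor
    #     reward      /= args.reward_norm_factor
    # while the highest-* flags further below clip the rescaled values. The actual wiring is
    # expected to live in the environment wrappers (core/envs/load_balance_wrappers.py).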
240 | parser.add_argument( 241 | '--elapsed-time-norm-factor', 242 | default=55, 243 | type=float, 244 | help='normalize factor of elapsed time between 2 consecutive events in load balance env (default: 55)') 245 | parser.add_argument( 246 | '--highest-server-obs', 247 | default=20, 248 | type=float, 249 | help='Clip server having higher load than this value in load balance environment (default: 2000)') 250 | parser.add_argument( 251 | '--highest-job-obs', 252 | default=10, 253 | type=float, 254 | help='Clip job having greater size than this value in load balance environment (default: 1000)') 255 | parser.add_argument( 256 | '--highest-elapsed-time', 257 | default=10, 258 | type=float, 259 | help='Clip elapsed time longer than this value in load balance environment (default: 1000)') 260 | parser.add_argument( 261 | '--reward-norm-factor', 262 | default=10000, 263 | type=float, 264 | help='normalize factor of reward in training (default: 1000)') 265 | parser.add_argument( 266 | '--max-random-init-steps', 267 | default=1, 268 | type=int, 269 | help='maximum number of random initial steps after resetting (default: 50)') 270 | parser.add_argument( 271 | '--num-stream-jobs', 272 | default=1000, 273 | type=int, 274 | help='number of stream jobs of load balance env in training (default: 1000)') 275 | 276 | args = parser.parse_args() 277 | 278 | args.cuda = not args.no_cuda and torch.cuda.is_available() 279 | 280 | assert args.algo in ['a2c', 'ppo', 'acktr', 281 | 'mib_a2c', 'mib_ppo', 'lacie_a2c', 'lacie_ppo', 'lacie_a2c_memory', 'lacie_ppo_memory'] 282 | if args.recurrent_policy: 283 | assert args.algo in ['a2c', 'ppo', 'mib_a2c', 'mib_ppo', 'lacie_a2c', 'lacie_ppo', 'lacie_a2c_memory', 'lacie_ppo_memory'], \ 284 | 'Recurrent policy is not implemented for ACKTR' 285 | 286 | return args 287 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /core/algorithms/lacie/lacie_a2c.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from itertools import chain 5 | from torch import optim 6 | from core.algorithms.lacie.base_lacie import LacieAlgo 7 | from core.storage import LacieStorage 8 | 9 | 10 | class LACIE_A2C(LacieAlgo): 11 | """ 12 | Meta Input-dependent Baseline A2C. 
\ 13 | This A2C class leverages input-dependent baseline, which is learned with meta learning, \ 14 | to reduce variance when updating parameters 15 | """ 16 | 17 | def __init__(self, 18 | actor_critic, 19 | value_coef, 20 | entropy_coef, 21 | regularize_coef, 22 | eps=None, 23 | alpha=None, 24 | state_to_input_seq=None, 25 | lr=1e-3, 26 | max_grad_norm=None, 27 | expert=None, 28 | il_coef=1, 29 | num_cpc_steps=10, 30 | cpc_lr=1e-3): 31 | super().__init__(actor_critic=actor_critic, 32 | lr=lr, 33 | value_coef=value_coef, 34 | entropy_coef=entropy_coef, 35 | regularize_coef=regularize_coef, 36 | state_to_input_seq=state_to_input_seq, 37 | expert=expert, 38 | il_coef=il_coef, 39 | num_cpc_steps=num_cpc_steps, 40 | cpc_lr=cpc_lr) 41 | self.max_grad_norm = max_grad_norm 42 | 43 | def update(self, rollouts): 44 | obs_shape = rollouts.obs.size()[2:] 45 | action_shape = rollouts.actions.size()[-1] 46 | num_steps, num_processes, _ = rollouts.rewards.size() 47 | 48 | # Estimate baseline 49 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 50 | rollouts.obs[:-1].view(-1, *obs_shape), 51 | rollouts.recurrent_hidden_states[:-1].view( 52 | -1, self.actor_critic.recurrent_hidden_state_size), 53 | rollouts.masks[:-1].view(-1, 1), 54 | rollouts.actions.view(-1, action_shape)) 55 | values = values.view(num_steps, num_processes, 1) 56 | action_log_probs = action_log_probs.view(num_steps, num_processes, 1) 57 | 58 | advantages = rollouts.returns[:-1] - values 59 | returns = rollouts.returns[:-1] 60 | 61 | # Value loss for updating Critic Net 62 | value_loss = advantages.pow(2).mean() 63 | 64 | # LEARNING CONTRASTIVE PREDICTIVE MODEL 65 | # compute contrastive loss and accuracy 66 | contrastive_loss, contrastive_accuracy, regularize_loss = self.compute_contrastive_loss( 67 | rollouts.obs, rollouts.actions, rollouts.masks, returns) 68 | contrastive_loss = contrastive_loss.item() 69 | regularize_loss = regularize_loss.item() 70 | # computed weighted advantage according to its dependency with input sequences 71 | 72 | # learn cpc model for n steps 73 | for _ in range(self.num_cpc_steps): 74 | cpc_loss, _, cpc_regularize_loss = self.compute_contrastive_loss( 75 | rollouts.obs, rollouts.actions, rollouts.masks, returns) 76 | 77 | self.cpc_optimizer.zero_grad() 78 | (cpc_loss + self.regularize_coef * cpc_regularize_loss).backward() 79 | 80 | # nn.utils.clip_grad_norm_(chain(self.advantage_encoder.parameters(), 81 | # self.input_seq_encoder.parameters(), 82 | # self.state_encoder.parameters(), 83 | # self.condition_encoder.parameters(), 84 | # self.action_encoder.parameters()), 85 | # self.max_grad_norm) 86 | 87 | self.cpc_optimizer.step() 88 | 89 | # IMPORTANCE: we need to compute the weighted before learn cpc model 90 | # FIXME: Move to training to top to verify if the model can estimate density ratio 91 | weighted_advantages = self.compute_weighted_advantages( 92 | rollouts.obs, rollouts.actions, rollouts.masks, returns) - values 93 | 94 | # Action loss of Actor Net 95 | action_loss = -(weighted_advantages.detach() * action_log_probs).mean() 96 | 97 | # IMITATION LEARNING 98 | imitation_loss, imitation_accuracy = torch.tensor( 99 | 0).to(rollouts.obs.device), 0 100 | if self.expert: 101 | imitation_loss, imitation_accuracy = self.imitation_learning( 102 | rollouts.obs[:-1].view(-1, *obs_shape), 103 | rollouts.recurrent_hidden_states[0].view( 104 | -1, self.actor_critic.recurrent_hidden_state_size), 105 | rollouts.masks[:-1].view(-1, 1), 106 | self.expert) 107 | 108 | 
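        # Joint actor-critic step: the imitation loss (only if an expert is supplied), the value
        # loss, the policy-gradient loss built from the hindsight-weighted advantages above, and
        # the entropy bonus are combined into a single objective and back-propagated through the
        # actor-critic parameters only; the CPC encoders were already updated via cpc_optimizer.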
self.optimizer.zero_grad() 109 | 110 | (imitation_loss * self.il_coef + value_loss * self.value_coef + action_loss - 111 | dist_entropy * self.entropy_coef).backward() 112 | 113 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 114 | self.max_grad_norm) 115 | 116 | self.optimizer.step() 117 | self.after_update() 118 | 119 | return { 120 | 'value loss': value_loss.item(), 121 | 'action loss': action_loss.item(), 122 | 'entropy loss': dist_entropy.item(), 123 | 'imitation loss': imitation_loss.item(), 124 | 'imitation accuracy': imitation_accuracy, 125 | 'contrastive loss': contrastive_loss, 126 | 'contrastive accuracy': contrastive_accuracy, 127 | 'regularize loss': regularize_loss 128 | } 129 | 130 | 131 | class LACIE_A2C_Memory(LACIE_A2C): 132 | def __init__(self, 133 | actor_critic, 134 | value_coef, 135 | entropy_coef, 136 | regularize_coef, 137 | eps=None, 138 | alpha=None, 139 | state_to_input_seq=None, 140 | lr=1e-3, 141 | max_grad_norm=None, 142 | expert=None, 143 | il_coef=1, 144 | num_cpc_steps=10, 145 | lacie_batch_size=64, 146 | lacie_buffer=None, 147 | use_memory_to_pred_weights=False, 148 | cpc_lr=1e-3): 149 | super().__init__(actor_critic, 150 | value_coef, 151 | entropy_coef, 152 | regularize_coef, 153 | eps, 154 | alpha, 155 | state_to_input_seq, 156 | lr, 157 | max_grad_norm, 158 | expert, 159 | il_coef, 160 | num_cpc_steps, 161 | cpc_lr) 162 | self.lacie_batch_size = lacie_batch_size 163 | self.lacie_buffer = lacie_buffer 164 | self.use_memory_to_pred_weights = use_memory_to_pred_weights 165 | 166 | def update(self, rollouts): 167 | obs_shape = rollouts.obs.size()[2:] 168 | action_shape = rollouts.actions.size()[-1] 169 | num_steps, num_processes, _ = rollouts.rewards.size() 170 | 171 | # Estimate baseline 172 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 173 | rollouts.obs[:-1].view(-1, *obs_shape), 174 | rollouts.recurrent_hidden_states[:-1].view( 175 | -1, self.actor_critic.recurrent_hidden_state_size), 176 | rollouts.masks[:-1].view(-1, 1), 177 | rollouts.actions.view(-1, action_shape)) 178 | values = values.view(num_steps, num_processes, 1) 179 | action_log_probs = action_log_probs.view(num_steps, num_processes, 1) 180 | 181 | advantages = rollouts.returns[:-1] - values 182 | returns = rollouts.returns[:-1] 183 | 184 | # Value loss for updating Critic Net 185 | value_loss = advantages.pow(2).mean() 186 | 187 | # LEARNING CONTRASTIVE PREDICTIVE MODEL 188 | # update LACIE_Storage 189 | self.lacie_buffer.insert(rollouts, advantages.detach()) 190 | # compute contrastive loss and accuracy 191 | contrastive_loss, contrastive_accuracy, regularize_loss = self.compute_contrastive_loss( 192 | rollouts.obs, rollouts.actions, rollouts.masks, advantages.detach()) 193 | contrastive_loss = contrastive_loss.item() 194 | regularize_loss = regularize_loss.item() 195 | 196 | # computed weighted advantage according to its dependency with input sequences 197 | # learn cpc model for n steps 198 | for _ in range(self.num_cpc_steps): 199 | data = self.lacie_buffer.sample() 200 | obs, actions, masks, sample_advantages = data['obs'], data['actions'], data['masks'], data['advantages'] 201 | cpc_loss, _, cpc_regularize_loss = self.compute_contrastive_loss( 202 | obs, actions, masks, sample_advantages) 203 | 204 | self.cpc_optimizer.zero_grad() 205 | (cpc_loss + self.regularize_coef * cpc_regularize_loss).backward() 206 | 207 | # nn.utils.clip_grad_norm_(chain(self.advantage_encoder.parameters(), 208 | # self.input_seq_encoder.parameters(), 209 | # 
self.state_encoder.parameters(), 210 | # self.condition_encoder.parameters(), 211 | # self.action_encoder.parameters()), 212 | # self.max_grad_norm) 213 | 214 | self.cpc_optimizer.step() 215 | 216 | # IMPORTANCE: we need to compute the weighted before learn cpc model 217 | # FIXME: Move the cpc training on top to verify if it can learn useful estimation 218 | if not self.use_memory_to_pred_weights: 219 | weighted_advantages = self.compute_weighted_advantages( 220 | rollouts.obs, rollouts.actions, rollouts.masks, advantages.detach()) 221 | else: 222 | data = self.lacie_buffer.sample_most_recent() 223 | obs, actions, masks, sample_advantages = data['obs'], data[ 224 | 'actions'], data['masks'], data['advantages'] 225 | weighted_advantages = self.compute_weighted_advantages( 226 | obs, actions, masks, sample_advantages, rollouts.actions.shape[1]) 227 | 228 | # Action loss of Actor Net 229 | action_loss = -(weighted_advantages.detach() * action_log_probs).mean() 230 | 231 | # IMITATION LEARNING 232 | imitation_loss, imitation_accuracy = torch.tensor( 233 | 0).to(rollouts.obs.device), 0 234 | if self.expert: 235 | imitation_loss, imitation_accuracy = self.imitation_learning( 236 | rollouts.obs[:-1].view(-1, *obs_shape), 237 | rollouts.recurrent_hidden_states[0].view( 238 | -1, self.actor_critic.recurrent_hidden_state_size), 239 | rollouts.masks[:-1].view(-1, 1), 240 | self.expert) 241 | 242 | self.optimizer.zero_grad() 243 | 244 | (imitation_loss * self.il_coef + value_loss * self.value_coef + action_loss - 245 | dist_entropy * self.entropy_coef).backward() 246 | 247 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 248 | self.max_grad_norm) 249 | 250 | self.optimizer.step() 251 | self.after_update() 252 | 253 | return { 254 | 'value loss': value_loss.item(), 255 | 'action loss': action_loss.item(), 256 | 'entropy loss': dist_entropy.item(), 257 | 'imitation loss': imitation_loss.item(), 258 | 'imitation accuracy': imitation_accuracy, 259 | 'contrastive loss': contrastive_loss, 260 | 'contrastive accuracy': contrastive_accuracy, 261 | 'regularize loss': regularize_loss 262 | } 263 | -------------------------------------------------------------------------------- /utils/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # switch backend in driver file 4 | import matplotlib 5 | import matplotlib.pyplot as plt 6 | 7 | import os 8 | import os.path as osp 9 | import glob 10 | from scipy.signal import medfilt 11 | 12 | 13 | def smooth_reward_curve(x, y): 14 | # Halfwidth of our smoothing convolution 15 | halfwidth = min(31, int(np.ceil(len(x) / 30))) 16 | k = halfwidth 17 | xsmoo = x[k:-k] 18 | ysmoo = np.convolve(y, np.ones(2 * k + 1), mode='valid') / \ 19 | np.convolve(np.ones_like(y), np.ones(2 * k + 1), mode='valid') 20 | downsample = max(int(np.floor(len(xsmoo) / 1e3)), 1) 21 | return xsmoo[::downsample], ysmoo[::downsample] 22 | 23 | 24 | def fix_point(x, y, interval): 25 | np.insert(x, 0, 0) 26 | np.insert(y, 0, 0) 27 | 28 | fx, fy = [], [] 29 | pointer = 0 30 | 31 | ninterval = int(max(x) / interval + 1) 32 | 33 | for i in range(ninterval): 34 | tmpx = interval * i 35 | 36 | while pointer + 1 < len(x) and tmpx > x[pointer + 1]: 37 | pointer += 1 38 | 39 | if pointer + 1 < len(x): 40 | alpha = (y[pointer + 1] - y[pointer]) / \ 41 | (x[pointer + 1] - x[pointer]) 42 | tmpy = y[pointer] + alpha * (tmpx - x[pointer]) 43 | fx.append(tmpx) 44 | fy.append(tmpy) 45 | 46 | return fx, fy 47 | 48 | 49 | def 
load_reward_data(indir, smooth, bin_size): 50 | datas = [] 51 | infiles = glob.glob(os.path.join(indir, '*.monitor.csv')) 52 | 53 | for inf in infiles: 54 | with open(inf, 'r') as f: 55 | f.readline() 56 | f.readline() 57 | for line in f: 58 | tmp = line.split(',') 59 | t_time = float(tmp[2]) 60 | tmp = [t_time, int(tmp[1]), float(tmp[0])] 61 | datas.append(tmp) 62 | 63 | datas = sorted(datas, key=lambda d_entry: d_entry[0]) 64 | result = [] 65 | timesteps = 0 66 | for i in range(len(datas)): 67 | result.append([timesteps, datas[i][-1]]) 68 | timesteps += datas[i][1] 69 | 70 | if len(result) < bin_size: 71 | return [None, None] 72 | 73 | x, y = np.array(result)[:, 0], np.array(result)[:, 1] 74 | 75 | if smooth == 1: 76 | x, y = smooth_reward_curve(x, y) 77 | 78 | if smooth == 2: 79 | y = medfilt(y, kernel_size=9) 80 | 81 | x, y = fix_point(x, y, bin_size) 82 | return [x, y] 83 | 84 | # TODO: only works for Experience Replay style training for now 85 | 86 | 87 | def load_custom_data(indir, stat_file, smooth, bin_size): 88 | datas = [] 89 | infiles = glob.glob(os.path.join(indir, stat_file)) 90 | 91 | for inf in infiles: # should be 1 92 | with open(inf, 'r') as f: 93 | for line in f: 94 | tmp = line.split(',') 95 | tmp = [int(tmp[0]), float(tmp[1])] 96 | datas.append(tmp) 97 | 98 | datas = sorted(datas, key=lambda d_entry: d_entry[0]) 99 | result = [] 100 | for i in range(len(datas)): 101 | result.append([datas[i][0], datas[i][1]]) 102 | 103 | if len(result) < bin_size: 104 | return [None, None] 105 | 106 | x, y = np.array(result)[:, 0], np.array(result)[:, 1] 107 | 108 | if smooth == 1: 109 | x, y = smooth_reward_curve(x, y) 110 | 111 | if smooth == 2: 112 | y = medfilt(y, kernel_size=9) 113 | 114 | x, y = fix_point(x, y, bin_size) 115 | return [x, y] 116 | 117 | # TODO: only works for Experience Replay style training for now 118 | 119 | 120 | def load_action_data(indir, smooth, bin_size): 121 | datas = [] 122 | infiles = glob.glob(os.path.join(indir, 'action_log.csv')) 123 | 124 | for inf in infiles: # should be 1 125 | with open(inf, 'r') as f: 126 | for line in f: 127 | tmp = line.split(',') 128 | tmp = [int(tmp[0])] + [float(tmp[i]) 129 | for i in range(1, len(tmp))] 130 | datas.append(tmp) 131 | 132 | datas = sorted(datas, key=lambda d_entry: d_entry[0]) 133 | result = datas 134 | # for i in range(len(datas)): 135 | # result.append([datas[i][0], datas[i][1]]) 136 | 137 | if len(result) < bin_size: 138 | return [None, None] 139 | 140 | x, y = np.array(result)[:, 0], np.array(result)[:, 1:] 141 | 142 | '''if smooth == 1: 143 | x, y = smooth_reward_curve(x, y) 144 | 145 | if smooth == 2: 146 | y = medfilt(y, kernel_size=9) 147 | 148 | x, y = fix_point(x, y, bin_size)''' 149 | return [x, np.transpose(y)] 150 | 151 | 152 | def visdom_plot(viz, win, folder, game, name, num_steps, bin_size=100, smooth=1): 153 | tx, ty = load_reward_data(folder, smooth, bin_size) 154 | if tx is None or ty is None: 155 | return win 156 | 157 | fig = plt.figure() 158 | plt.plot(tx, ty, label="{}".format(name)) 159 | 160 | tick_fractions = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1.0]) 161 | ticks = tick_fractions * num_steps 162 | tick_names = ["{:.0e}".format(tick) for tick in ticks] 163 | plt.xticks(ticks, tick_names) 164 | plt.xlim(0, num_steps * 1.01) 165 | 166 | plt.xlabel('Number of Timesteps') 167 | plt.ylabel('Rewards') 168 | 169 | plt.title(game) 170 | plt.legend(loc=4) 171 | plt.show() 172 | plt.draw() 173 | 174 | image = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') 175 | image = 
image.reshape(fig.canvas.get_width_height()[::-1] + (3, )) 176 | plt.close(fig) 177 | 178 | # Show it in visdom 179 | image = np.transpose(image, (2, 0, 1)) 180 | 181 | return viz.image(image, win=win) 182 | 183 | 184 | def plot(folder, game, name, num_steps, bin_size=100, smooth=1): 185 | matplotlib.rcParams.update({'font.size': 20}) 186 | tx, ty = load_reward_data(folder, smooth, bin_size) 187 | 188 | if tx is None or ty is None: 189 | return 190 | 191 | fig = plt.figure(figsize=(20, 5)) 192 | plt.plot(tx, ty, label="{}".format(name)) 193 | 194 | tick_fractions = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1.0]) 195 | ticks = tick_fractions * num_steps 196 | tick_names = ["{:.0e}".format(tick) for tick in ticks] 197 | plt.xticks(ticks, tick_names) 198 | plt.xlim(0, num_steps * 1.01) 199 | 200 | plt.xlabel('Number of Timesteps') 201 | plt.ylabel('Rewards') 202 | 203 | plt.title(game) 204 | plt.legend(loc=4) 205 | plt.savefig(osp.join(folder, 'plot.png')) 206 | plt.close() 207 | # plt.show() 208 | 209 | 210 | def make_patch_spines_invisible(ax): 211 | ax.set_frame_on(True) 212 | ax.patch.set_visible(False) 213 | for sp in ax.spines.values(): 214 | sp.set_visible(False) 215 | 216 | 217 | def plot_all_data(folder, game, name, num_steps, bin_size=(10, 100, 100, 1), smooth=1, time=None, save_filename='results.png', ipynb=False): 218 | matplotlib.rcParams.update({'font.size': 20}) 219 | params = { 220 | 'xtick.labelsize': 20, 221 | 'ytick.labelsize': 15, 222 | 'legend.fontsize': 15 223 | } 224 | plt.rcParams.update(params) 225 | 226 | tx, ty = load_reward_data(folder, smooth, bin_size[0]) 227 | 228 | if tx is None or ty is None: 229 | return 230 | 231 | if time is not None: 232 | title = 'Avg. Last 10 Rewards: ' + \ 233 | str(np.round(np.mean(ty[-10]))) + ' || ' + \ 234 | game + ' || Elapsed Time: ' + str(time) 235 | else: 236 | title = 'Avg. Last 10 Rewards: ' + \ 237 | str(np.round(np.mean(ty[-10]))) + ' || ' + game 238 | 239 | tick_fractions = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1.0]) 240 | ticks = tick_fractions * num_steps 241 | tick_names = ["{:.0e}".format(tick) for tick in ticks] 242 | 243 | fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(20, 15), subplot_kw=dict( 244 | xticks=ticks, xlim=(0, num_steps*1.15), xlabel='Timestep', title=title)) 245 | ax1.set_xticklabels(tick_names) 246 | ax2.set_xticklabels(tick_names) 247 | ax3.set_xticklabels(tick_names) 248 | 249 | ax1.set_ylabel('Reward') 250 | 251 | p1, = ax1.plot(tx, ty, label="Reward") 252 | #lines = [p1] 253 | 254 | ax1.yaxis.label.set_color(p1.get_color()) 255 | ax1.tick_params(axis='y', colors=p1.get_color()) 256 | 257 | ax1.legend([p1], [p1.get_label()], loc=4) 258 | 259 | # Load td data if it exists 260 | tx, ty = load_custom_data(folder, 'td.csv', smooth, bin_size[1]) 261 | 262 | ax2.set_title('Loss vs Timestep') 263 | 264 | if tx is not None or ty is not None: 265 | ax2.set_ylabel('Avg .Temporal Difference') 266 | p2, = ax2.plot(tx, ty, 'r-', label='Avg. TD') 267 | g2_lines = [p2] 268 | 269 | ax2.yaxis.label.set_color(p2.get_color()) 270 | ax2.tick_params(axis='y', colors=p2.get_color()) 271 | 272 | ax2.legend(g2_lines, [l.get_label() for l in g2_lines], loc=4) 273 | 274 | # Load Sigma Parameter Data if it exists 275 | tx, ty = load_custom_data(folder, 'sig_param_mag.csv', smooth, bin_size[2]) 276 | 277 | if tx is not None or ty is not None: 278 | # need to update g2 title if sig data will be included 279 | ax2.set_title('Loss/Avg. Sigma Parameter Magnitude vs Timestep') 280 | 281 | ax4 = ax2.twinx() 282 | 283 | ax4.set_ylabel('Avg. 
Sigma Parameter Mag.') 284 | p4, = ax4.plot(tx, ty, 'g-', label='Avg. Sigma Mag.') 285 | g2_lines += [p4] 286 | 287 | ax4.yaxis.label.set_color(p4.get_color()) 288 | ax4.tick_params(axis='y', colors=p4.get_color()) 289 | 290 | #ax4.spines["right"].set_position(("axes", 1.05)) 291 | # make_patch_spines_invisible(ax4) 292 | # ax4.spines["right"].set_visible(True) 293 | 294 | # remake g2 legend because we have a new line 295 | ax2.legend(g2_lines, [l.get_label() for l in g2_lines], loc=4) 296 | 297 | # Load action selection data if it exists 298 | tx, ty = load_action_data(folder, smooth, bin_size[3]) 299 | 300 | ax3.set_title('Action Selection Frequency(%) vs Timestep') 301 | 302 | if tx is not None or ty is not None: 303 | ax3.set_ylabel('Action Selection Frequency(%)') 304 | labels = ['Action {}'.format(i) for i in range(ty.shape[0])] 305 | p3 = ax3.stackplot(tx, ty, labels=labels) 306 | 307 | base = 0.0 308 | for percent, index in zip(ty, range(ty.shape[0])): 309 | offset = base + percent[-1]/3.0 310 | ax3.annotate(str('{:.2f}'.format( 311 | ty[index][-1])), xy=(tx[-1], offset), color=p3[index].get_facecolor().ravel()) 312 | base += percent[-1] 313 | 314 | # ax3.yaxis.label.set_color(p3.get_color()) 315 | #ax3.tick_params(axis='y', colors=p3.get_color()) 316 | 317 | ax3.legend(loc=4) # remake g2 legend because we have a new line 318 | 319 | plt.tight_layout() # prevent label cutoff 320 | 321 | if ipynb: 322 | plt.show() 323 | else: 324 | plt.savefig(save_filename) 325 | plt.clf() 326 | plt.close() 327 | 328 | # return np.round(np.mean(ty[-10:])) 329 | 330 | 331 | def plot_reward(folder, game, name, num_steps, bin_size=10, smooth=1, time=None, save_filename='results.png', ipynb=False): 332 | matplotlib.rcParams.update({'font.size': 20}) 333 | tx, ty = load_reward_data(folder, smooth, bin_size) 334 | 335 | if tx is None or ty is None: 336 | return 337 | 338 | fig = plt.figure(figsize=(20, 5)) 339 | plt.plot(tx, ty, label="{}".format(name)) 340 | 341 | tick_fractions = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1.0]) 342 | ticks = tick_fractions * num_steps 343 | tick_names = ["{:.0e}".format(tick) for tick in ticks] 344 | plt.xticks(ticks, tick_names) 345 | plt.xlim(0, num_steps * 1.01) 346 | 347 | plt.xlabel('Number of Timesteps') 348 | plt.ylabel('Rewards') 349 | 350 | if time is not None: 351 | plt.title(game + ' || Last 10: ' + 352 | str(np.round(np.mean(ty[-10]))) + ' || Elapsed Time: ' + str(time)) 353 | else: 354 | plt.title(game + ' || Last 10: ' + str(np.round(np.mean(ty[-10])))) 355 | plt.legend(loc=4) 356 | if ipynb: 357 | plt.show() 358 | else: 359 | plt.savefig(save_filename) 360 | plt.clf() 361 | plt.close() 362 | 363 | return np.round(np.mean(ty[-10])) 364 | -------------------------------------------------------------------------------- /core/algorithms/lacie/base_lacie.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implement the algorithm `Learning to Assign Credit in Input-Driven Environment' 3 | """ 4 | from core.algorithms.base_algo import BaseAlgo 5 | from torch import optim 6 | from itertools import chain 7 | 8 | import torch 9 | import random 10 | import torch.nn as nn 11 | 12 | 13 | class LacieAlgo(BaseAlgo): 14 | """ 15 | Base class for LACIE algorithm. Support `cpc` (contrastive predictive coding) to estimate \ 16 | the independent between input-process and future states. 
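    In practice the `cpc` head scores how well an encoded (advantage, state, action) triple predicts \
    the realized input sequence; the per-step density ratio obtained from these scores is inverted, \
    clamped, and used to re-weight advantages (see `compute_weighted_advantages` below).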
17 | :param actor_critic: nn.Module - the actor critic object 18 | :param entropy_coef: float - weight of entropy loss 19 | :param max_grad_norm: float - maximum value of gradient 20 | :param n_steps: int - n-steps advantage estimation with hindsight 21 | :param state_to_input_seq: function - a function object that decompose input-processes from states\ 22 | the signature of function should be: foo(states) where states is torch.Tensor of shape \ 23 | T x N_processes x Obs_shape 24 | """ 25 | UPPER_BOUND_CLIP_THRESHOLD = 4 26 | LOWER_BOUND_CLIP_THRESHOLD = 1/100 27 | WEIGHT_CLIP_GROWTH_FACTOR = 1.002 28 | WEIGHT_CLIP_DECAY_FACTOR = 0.998 29 | CPC_HIDDEN_DIM = 96 30 | ADVANTAGE_ENC_DIM = CPC_HIDDEN_DIM//3 31 | INPUT_ENC_DIM = 32 32 | 33 | def __init__(self, 34 | actor_critic, 35 | lr, 36 | value_coef, 37 | entropy_coef, 38 | regularize_coef=0.05, 39 | state_to_input_seq=None, 40 | expert=None, 41 | il_coef=1, 42 | num_cpc_steps=10, 43 | cpc_lr=0.001): 44 | super().__init__(actor_critic, lr, value_coef, entropy_coef, expert, il_coef) 45 | self.regularize_coef = regularize_coef 46 | self.state_to_input_seq = state_to_input_seq 47 | self.num_cpc_steps = num_cpc_steps 48 | 49 | self.device = next(self.actor_critic.parameters()).device 50 | 51 | # encoder for advantages 52 | self.advantage_encoder = nn.Sequential( 53 | nn.Linear(self.ADVANTAGE_ENC_DIM, self.CPC_HIDDEN_DIM//3, bias=True), 54 | nn.LeakyReLU(inplace=True), 55 | nn.Linear(self.CPC_HIDDEN_DIM//3, 56 | self.CPC_HIDDEN_DIM//3, bias=True) 57 | ).to(self.device) 58 | 59 | # encoder for states 60 | # FIXME: hard code for 1D env 61 | self.state_encoder = nn.Sequential( 62 | nn.Linear(self.actor_critic.obs_shape[0], 63 | self.CPC_HIDDEN_DIM//3, bias=True), 64 | nn.LeakyReLU(inplace=True), 65 | nn.Linear(self.CPC_HIDDEN_DIM//3, self.CPC_HIDDEN_DIM//3) 66 | ).to(self.device) 67 | 68 | # encoder for action 69 | self.action_encoder = nn.Sequential( 70 | nn.Embedding(self.actor_critic.action_space.n, 71 | self.CPC_HIDDEN_DIM//3), 72 | nn.LeakyReLU(inplace=True), 73 | nn.Linear(self.CPC_HIDDEN_DIM//3, self.CPC_HIDDEN_DIM//3) 74 | ).to(self.device) 75 | 76 | # encoding conditions (i.e. 
advantages + states + actions) 77 | self.condition_encoder = nn.Sequential( 78 | nn.LeakyReLU(inplace=True), 79 | nn.Linear(self.CPC_HIDDEN_DIM, self.CPC_HIDDEN_DIM, bias=True), 80 | nn.LeakyReLU(inplace=True), 81 | nn.Linear(self.CPC_HIDDEN_DIM, self.CPC_HIDDEN_DIM) 82 | ).to(self.device) 83 | 84 | # input sequence encoder 85 | self.input_seq_encoder = nn.GRU( 86 | self.INPUT_ENC_DIM, self.CPC_HIDDEN_DIM, 1).to(self.device) 87 | 88 | # optimizer to learn the parameters for cpc loss 89 | self.cpc_optimizer = optim.Adam( 90 | chain( 91 | self.advantage_encoder.parameters(), 92 | self.input_seq_encoder.parameters(), 93 | self.state_encoder.parameters(), 94 | self.action_encoder.parameters(), 95 | self.condition_encoder.parameters() 96 | ), 97 | lr=cpc_lr 98 | ) 99 | 100 | self.softmax = nn.Softmax(dim=-1) 101 | self.log_softmax = nn.LogSoftmax(dim=-1) 102 | self.cpc_criterion = nn.CrossEntropyLoss() 103 | self.regularization_criterion = nn.L1Loss() 104 | 105 | self.upper_bound_clip_threshold = 1 106 | self.lower_bound_clip_threshold = 1 107 | 108 | # Initialize weights 109 | def _weights_init(m): 110 | if isinstance(m, nn.Linear): 111 | nn.init.kaiming_normal_( 112 | m.weight, mode='fan_out', nonlinearity='relu') 113 | if isinstance(m, nn.Conv1d): 114 | nn.init.kaiming_normal_( 115 | m.weight, mode='fan_out', nonlinearity='relu') 116 | elif isinstance(m, nn.BatchNorm1d): 117 | nn.init.constant_(m.weight, 1) 118 | nn.init.constant_(m.bias, 0) 119 | 120 | # initialize gru 121 | for layer_p in self.input_seq_encoder._all_weights: 122 | for p in layer_p: 123 | if 'weight' in p: 124 | nn.init.kaiming_normal_(self.input_seq_encoder.__getattr__( 125 | p), mode='fan_out', nonlinearity='relu') 126 | 127 | self.condition_encoder.apply(_weights_init) 128 | self.state_encoder.apply(_weights_init) 129 | self.action_encoder.apply(_weights_init) 130 | self.advantage_encoder.apply(_weights_init) 131 | 132 | def _encode_input_sequences(self, obs, masks): 133 | num_steps, n_processes, _ = obs.shape 134 | # obs is tensor of shape (n_steps + 1, n_processes, obs_shape) 135 | num_steps -= 1 136 | # INPUT SEQUENCES AND MASKS 137 | # the stochastic input will be defined by last 2 scalar 138 | input_seq = obs[1:, :, -2:] 139 | 140 | # transform input_seq with fourier features 141 | jobs, intervals = input_seq[:, :, 0].reshape(-1, 1), input_seq[:, :, 1].reshape(-1, 1) 142 | jobs, intervals = self.encode_fourier_features(jobs, self.INPUT_ENC_DIM//2), self.encode_fourier_features(intervals, self.INPUT_ENC_DIM//2) 143 | jobs = jobs.reshape(num_steps, n_processes, self.INPUT_ENC_DIM//2) 144 | intervals = intervals.reshape(num_steps, n_processes, self.INPUT_ENC_DIM//2) 145 | input_seq = torch.cat([jobs, intervals], dim=-1) 146 | 147 | masks = masks[1:].reshape(num_steps, n_processes) 148 | # reverse the input seq order since we want to compute from right to left 149 | input_seq = torch.flip(input_seq, [0]) 150 | masks = torch.flip(masks, [0]) 151 | # encode the input sequence 152 | # Let's figure out which steps in the sequence have a zero for any agent 153 | has_zeros = ((masks[1:-1] == 0.0) 154 | .any(dim=-1) 155 | .nonzero() 156 | .squeeze() 157 | .cpu()) 158 | 159 | # +1 to correct the masks[1:] 160 | if has_zeros.dim() == 0: 161 | # Deal with scalar 162 | has_zeros = [has_zeros.item() + 1] 163 | else: 164 | has_zeros = (has_zeros + 1).numpy().tolist() 165 | 166 | # add t=0 and t=T to the list 167 | has_zeros = [-1] + has_zeros + [num_steps - 1] 168 | 169 | outputs = [] 170 | 171 | for i in range(len(has_zeros) - 1): 
172 | # We can now process steps that don't have any zeros in masks together! 173 | # This is much faster 174 | start_idx = has_zeros[i] 175 | end_idx = has_zeros[i + 1] 176 | 177 | output, hxs = self.input_seq_encoder( 178 | input_seq[start_idx + 1: end_idx + 1], 179 | hxs * masks[start_idx].view(1, -1, 1) if start_idx > -1 else None) 180 | 181 | outputs.append(output) 182 | 183 | # x is a (T, N, -1) tensor 184 | input_seq = torch.cat(outputs, dim=0) 185 | assert len(input_seq) == num_steps 186 | # reverse back 187 | input_seq = torch.flip(input_seq, [0]) 188 | 189 | return input_seq 190 | 191 | def _encode_advantages(self, advantages): 192 | # FIXME: only compatible with 1D observation 193 | num_steps, n_processes, _ = advantages.shape 194 | # ADVANTAGES 195 | # encode 196 | # n_steps x n_process x hidden_dim/2 197 | advantages = advantages.reshape(-1, 1) 198 | advantages = self.encode_fourier_features(advantages, self.ADVANTAGE_ENC_DIM) 199 | advantages = self.advantage_encoder(advantages).reshape(num_steps, n_processes, -1) 200 | 201 | return advantages 202 | 203 | def _encode_states(self, obs): 204 | num_steps, n_processes, _ = obs.shape 205 | num_steps -= 1 206 | # STATES 207 | # encode 208 | # n_steps x n_process x hidden_dim/2 209 | states = obs[:-1] 210 | # FIXME: hard code for 1D env 211 | states_shape = states.shape[2:][0] 212 | states = self.state_encoder( 213 | states.reshape(-1, states_shape)).reshape(num_steps, n_processes, -1) 214 | 215 | return states 216 | 217 | def _encode_actions(self, actions): 218 | num_steps, n_processes, _ = actions.shape 219 | # ACTION 220 | # encode 221 | # n_steps x n_process x 1 222 | actions = self.action_encoder( 223 | actions.reshape(-1)).reshape(num_steps, n_processes, -1) 224 | 225 | return actions 226 | 227 | def _encode_conditions(self, conditions): 228 | num_steps, n_processes, hidden_dim = conditions.shape 229 | # ACTION 230 | # encode 231 | # n_steps x n_process x 1 232 | conditions = self.condition_encoder( 233 | conditions.reshape(-1, hidden_dim)).reshape(num_steps, n_processes, -1) 234 | 235 | return conditions 236 | 237 | def compute_contrastive_loss(self, obs, actions, masks, advantages): 238 | """ 239 | Contrastive Predictive Coding for learning representation and density ratio 240 | :param rollouts: Storage's instance 241 | :param advantage: tensor of shape: (timestep, n_processes, 1) 242 | """ 243 | # FIXME: only compatible with 1D observation 244 | num_steps, n_processes, _ = advantages.shape 245 | 246 | # encoded all the input 247 | encoded_input_seq = self._encode_input_sequences(obs, masks) 248 | encoded_advantages = self._encode_advantages(advantages) 249 | encoded_states = self._encode_states(obs) 250 | encoded_actions = self._encode_actions(actions) 251 | 252 | # condition = STATE + ADVANTAGE + ACTIONS 253 | conditions = torch.cat( 254 | [encoded_advantages, encoded_states, encoded_actions], dim=-1) 255 | conditions = self._encode_conditions(conditions) 256 | # reshape to n_steps x hidden_dim x n_processes 257 | encoded_input_seq = encoded_input_seq.permute(0, 2, 1) 258 | 259 | # compute nce 260 | # create label mask 261 | label = torch.tensor(torch.arange( 262 | 0, n_processes).tolist() * num_steps).to(self.device) 263 | 264 | # broadcast compute matmul 265 | f_value = torch.bmm( 266 | conditions, encoded_input_seq).reshape(-1, n_processes) 267 | 268 | # compute accuracy 269 | correct = torch.sum(torch.eq(torch.argmax( 270 | self.softmax(f_value), dim=1), label)) 271 | accuracy = correct.item()/(n_processes*num_steps) 
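        # InfoNCE view: for every timestep, f_value is an (n_processes x n_processes) score matrix
        # between the encoded conditions (advantage, state, action) and the encoded input sequences;
        # the diagonal entries are the matching "positive" pairs and the remaining entries in each
        # row act as negatives, so the cross-entropy below is the standard InfoNCE objective and
        # softmax(f_value) approximates the density ratio p(input_seq | condition) / p(input_seq).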
272 | 273 | # compute loss 274 | contrastive_loss = self.cpc_criterion(f_value, label) 275 | regularization_loss = self.regularization_criterion( 276 | self.softmax(f_value) * n_processes, torch.ones_like(f_value)) 277 | 278 | return contrastive_loss, accuracy, regularization_loss 279 | 280 | def compute_weighted_advantages(self, obs, actions, masks, advantages, n_envs=None): 281 | """ 282 | Compute return for rollout experience with trained contrastive module 283 | """ 284 | with torch.no_grad(): 285 | # FIXME: only compatible with 1D observation 286 | num_steps, batch_size, _ = advantages.shape 287 | 288 | input_seq = self._encode_input_sequences( 289 | obs, masks) 290 | encoded_advantages = self._encode_advantages(advantages) 291 | encoded_states = self._encode_states(obs) 292 | encoded_actions = self._encode_actions(actions) 293 | 294 | # condition = STATE + ADVANTAGE 295 | conditions = torch.cat( 296 | [encoded_advantages, encoded_states, encoded_actions], dim=-1) 297 | conditions = self._encode_conditions(conditions) 298 | 299 | # reshape to n_steps x hidden_dim x n_processes 300 | input_seq = input_seq.permute(0, 2, 1) 301 | 302 | # weight of each advantage score 303 | weights = torch.zeros((num_steps, n_envs if n_envs else batch_size, 1)).to( 304 | self.device) 305 | 306 | for i in range(num_steps): 307 | # n_steps x n_steps 308 | density_ratio = self.softmax( 309 | torch.mm(conditions[i], input_seq[i])) 310 | if n_envs: 311 | # N is not None => used memory for predicting weights 312 | density_ratio = density_ratio[:n_envs, :n_envs] 313 | # take the diag element 314 | density_ratio = density_ratio.diag().reshape( 315 | n_envs if n_envs else batch_size, 1) 316 | 317 | weights[i] = density_ratio 318 | 319 | weights *= batch_size 320 | weights = 1/(weights+1e-5) 321 | weights = torch.clamp( 322 | weights, 323 | self.lower_bound_clip_threshold, 324 | self.upper_bound_clip_threshold 325 | ) 326 | 327 | if random.randint(0, 9) == 0: 328 | print('weights mean: ', weights.mean()) 329 | print('weights max: ', weights.max()) 330 | print('weights min: ', weights.min()) 331 | weighted_advantages = advantages[:, :n_envs] * \ 332 | weights if n_envs else advantages*weights 333 | 334 | return weighted_advantages 335 | 336 | def update_weight_clip_threshold(self): 337 | self.upper_bound_clip_threshold = min( 338 | self.upper_bound_clip_threshold * self.WEIGHT_CLIP_GROWTH_FACTOR, 339 | self.UPPER_BOUND_CLIP_THRESHOLD 340 | ) 341 | self.lower_bound_clip_threshold = max( 342 | self.lower_bound_clip_threshold * self.WEIGHT_CLIP_DECAY_FACTOR, 343 | self.LOWER_BOUND_CLIP_THRESHOLD 344 | ) 345 | 346 | def after_update(self): 347 | super().after_update() 348 | self.update_weight_clip_threshold() 349 | 350 | def encode_fourier_features(self, x, d=10): 351 | """ 352 | Encode input with fourier features according to https://arxiv.org/abs/2006.10739 353 | :param x: torch.Tensor of shape Nx1 354 | :param d: int - encoded dimension 355 | """ 356 | if (d//2)*2-d != 0: 357 | raise ValueError("Dimension must be even number...") 358 | N = x.shape[0] 359 | 360 | position_enc = torch.zeros(N, d).to(self.device) 361 | idx = torch.arange(d//2).reshape(1, -1).to(self.device) 362 | 363 | position_enc[:, 0::2] = torch.sin(x*2**idx) 364 | position_enc[:, 1::2] = torch.cos(x*2**idx) 365 | 366 | return position_enc 367 | -------------------------------------------------------------------------------- /core/algorithms/lacie/lacie_ppo.py: -------------------------------------------------------------------------------- 1 | 
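# ---------------------------------------------------------------------------
# Illustrative sketch: a minimal, self-contained toy version of the hindsight
# re-weighting performed by LacieAlgo.compute_weighted_advantages in
# base_lacie.py. The function name and the pre-encoded inputs are hypothetical
# stand-ins for the CPC encoders defined there; this is a sketch of the idea,
# not the project's API.


def toy_hindsight_weights(conditions, input_seq, lower=0.01, upper=4.0, eps=1e-5):
    """Turn CPC scores into clamped inverse-density-ratio weights.

    conditions: (T, N, H) encoded (advantage, state, action) triples
    input_seq:  (T, N, H) encoded realized input sequences
    returns:    (T, N, 1) weights, one per advantage entry
    """
    import torch

    num_steps, num_envs, _ = conditions.shape
    weights = torch.zeros(num_steps, num_envs, 1)
    for t in range(num_steps):
        # (N, N) score matrix; row-wise softmax approximates the density ratio
        # p(input_seq | condition) / p(input_seq), as in compute_contrastive_loss
        scores = torch.softmax(conditions[t] @ input_seq[t].t(), dim=-1)
        ratio = scores.diag().reshape(num_envs, 1) * num_envs
        # entries whose outcome is strongly tied to the input process get smaller weights
        weights[t] = torch.clamp(1.0 / (ratio + eps), lower, upper)
    return weights


# Example usage with random tensors:
#     cond, seq = torch.randn(5, 4, 96), torch.randn(5, 4, 96)
#     adv = torch.randn(5, 4, 1)
#     weighted_adv = adv * toy_hindsight_weights(cond, seq)
# ---------------------------------------------------------------------------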
import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | 6 | from itertools import chain 7 | from .base_lacie import LacieAlgo 8 | 9 | 10 | class LACIE_PPO(LacieAlgo): 11 | def __init__(self, 12 | actor_critic, 13 | clip_param, 14 | ppo_epoch, 15 | num_mini_batch, 16 | value_loss_coef, 17 | entropy_coef, 18 | regularize_coef, 19 | state_to_input_seq=None, 20 | lr=None, 21 | eps=None, 22 | max_grad_norm=None, 23 | use_clipped_value_loss=True, 24 | expert=None, 25 | il_coef=1, 26 | num_cpc_steps=10, 27 | cpc_lr=1e-3): 28 | super().__init__(actor_critic=actor_critic, 29 | lr=lr, 30 | value_coef=value_loss_coef, 31 | entropy_coef=entropy_coef, 32 | regularize_coef=regularize_coef, 33 | state_to_input_seq=state_to_input_seq, 34 | expert=expert, 35 | il_coef=il_coef, 36 | num_cpc_steps=num_cpc_steps, 37 | cpc_lr=cpc_lr) 38 | 39 | self.clip_param = clip_param 40 | self.ppo_epoch = ppo_epoch 41 | self.num_mini_batch = num_mini_batch 42 | 43 | self.max_grad_norm = max_grad_norm 44 | self.use_clipped_value_loss = use_clipped_value_loss 45 | 46 | def update(self, rollouts): 47 | obs_shape = rollouts.obs.size()[2:] 48 | advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] 49 | 50 | # contrastive learning loss 51 | contrastive_loss_epoch, contrastive_accuracy_epoch = self.compute_contrastive_loss( 52 | rollouts.obs, rollouts.actions, rollouts.masks, advantages.detach()) 53 | contrastive_loss_epoch = contrastive_loss_epoch.item() 54 | 55 | # weighted advantages 56 | weighted_advantages = self.compute_weighted_advantages( 57 | rollouts.obs, rollouts.actions, rollouts.masks, advantages.detach()) 58 | weighted_advantages = (weighted_advantages - weighted_advantages.mean()) / ( 59 | weighted_advantages.std() + 1e-5) 60 | 61 | value_loss_epoch = 0 62 | action_loss_epoch = 0 63 | dist_entropy_epoch = 0 64 | imitation_loss_epoch = 0 65 | accuracy_epoch = 0 66 | 67 | for e in range(self.ppo_epoch): 68 | if self.actor_critic.is_recurrent: 69 | data_generator = rollouts.recurrent_generator( 70 | weighted_advantages, self.num_mini_batch) 71 | else: 72 | data_generator = rollouts.feed_forward_generator( 73 | weighted_advantages, self.num_mini_batch) 74 | 75 | for sample in data_generator: 76 | obs_batch, recurrent_hidden_states_batch, actions_batch, \ 77 | value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, \ 78 | adv_targ = sample 79 | 80 | # Reshape to do in a single forward pass for all steps 81 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 82 | obs_batch, recurrent_hidden_states_batch, masks_batch, 83 | actions_batch) 84 | 85 | ratio = torch.exp(action_log_probs - 86 | old_action_log_probs_batch) 87 | surr1 = ratio * adv_targ 88 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 89 | 1.0 + self.clip_param) * adv_targ 90 | action_loss = -torch.min(surr1, surr2).mean() 91 | 92 | if self.use_clipped_value_loss: 93 | value_pred_clipped = value_preds_batch + \ 94 | (values - value_preds_batch).clamp(-self.clip_param, 95 | self.clip_param) 96 | value_losses = (values - return_batch).pow(2) 97 | value_losses_clipped = ( 98 | value_pred_clipped - return_batch).pow(2) 99 | value_loss = 0.5 * torch.max(value_losses, 100 | value_losses_clipped).mean() 101 | else: 102 | value_loss = 0.5 * (return_batch - values).pow(2).mean() 103 | 104 | # imitation learning 105 | imitation_loss, accuracy = torch.tensor( 106 | 0).to(action_loss.device), 0 107 | if self.expert: 108 | imitation_loss, accuracy = 
self.imitation_learning( 109 | rollouts.obs[:-1].view(-1, *obs_shape), 110 | rollouts.recurrent_hidden_states[0].view( 111 | -1, self.actor_critic.recurrent_hidden_state_size), 112 | rollouts.masks[:-1].view(-1, 1), 113 | self.expert) 114 | 115 | # contrastive learning density ratio 116 | contrastive_loss, _ = self.compute_contrastive_loss( 117 | rollouts.obs, rollouts.actions, rollouts.masks, advantages) 118 | 119 | self.optimizer.zero_grad() 120 | self.cpc_optimizer.zero_grad() 121 | (imitation_loss * self.il_coef * self.value_coef + action_loss - 122 | dist_entropy * self.entropy_coef + contrastive_loss).backward() 123 | nn.utils.clip_grad_norm_(chain(self.actor_critic.parameters(), 124 | self.input_seq_encoder.parameters(), 125 | self.advantage_encoder.parameters(), 126 | self.state_encoder.parameters(), 127 | self.condition_encoder.parameters(), 128 | self.action_encoder.parameters()), 129 | self.max_grad_norm) 130 | self.optimizer.step() 131 | self.cpc_optimizer.step() 132 | 133 | value_loss_epoch += value_loss.item() 134 | action_loss_epoch += action_loss.item() 135 | dist_entropy_epoch += dist_entropy.item() 136 | imitation_loss_epoch += imitation_loss.item() 137 | accuracy_epoch += accuracy 138 | 139 | num_updates = self.ppo_epoch * self.num_mini_batch 140 | 141 | value_loss_epoch /= num_updates 142 | action_loss_epoch /= num_updates 143 | dist_entropy_epoch /= num_updates 144 | imitation_loss_epoch /= num_updates 145 | accuracy_epoch /= num_updates 146 | 147 | self.after_update() 148 | 149 | return { 150 | "value loss": value_loss_epoch, 151 | "action loss": action_loss_epoch, 152 | "entropy loss": dist_entropy_epoch, 153 | "imitation loss": imitation_loss_epoch, 154 | "accuracy": accuracy_epoch, 155 | "contrastive loss": contrastive_loss_epoch, 156 | "contrastive accuracy": contrastive_accuracy_epoch 157 | } 158 | 159 | 160 | class LACIE_PPO_Memory(LACIE_PPO): 161 | def __init__(self, 162 | actor_critic, 163 | clip_param, 164 | ppo_epoch, 165 | num_mini_batch, 166 | value_loss_coef, 167 | entropy_coef, 168 | regularize_coef, 169 | state_to_input_seq=None, 170 | lr=None, 171 | eps=None, 172 | max_grad_norm=None, 173 | use_clipped_value_loss=True, 174 | expert=None, 175 | il_coef=1, 176 | num_cpc_steps=10, 177 | lacie_buffer=None, 178 | lacie_batch_size=64, 179 | use_memory_to_pred_weights=False, 180 | cpc_lr=1e-3): 181 | super().__init__(actor_critic, 182 | clip_param, 183 | ppo_epoch, 184 | num_mini_batch, 185 | value_loss_coef, 186 | entropy_coef, 187 | regularize_coef, 188 | state_to_input_seq, 189 | lr, 190 | eps, 191 | max_grad_norm, 192 | use_clipped_value_loss, 193 | expert, 194 | il_coef, 195 | num_cpc_steps, 196 | cpc_lr=cpc_lr) 197 | 198 | self.lacie_buffer = lacie_buffer 199 | self.lacie_buffer_size = lacie_batch_size 200 | self.use_memory_to_pred_weights = use_memory_to_pred_weights 201 | 202 | def update(self, rollouts): 203 | obs_shape = rollouts.obs.size()[2:] 204 | advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] 205 | 206 | # update LACIE_Storage 207 | self.lacie_buffer.insert(rollouts, advantages.detach()) 208 | 209 | # contrastive learning loss 210 | contrastive_loss_epoch, contrastive_accuracy_epoch, regularize_loss_epoch = self.compute_contrastive_loss( 211 | rollouts.obs, rollouts.actions, rollouts.masks, advantages.detach()) 212 | contrastive_loss_epoch = contrastive_loss_epoch.item() 213 | regularize_loss_epoch = regularize_loss_epoch.item() 214 | 215 | # --------------------------------------------------------------------------- 216 | # 
learn cpc model for n steps 217 | 218 | for _ in range(self.num_cpc_steps): 219 | data = self.lacie_buffer.sample() 220 | obs, actions, masks, sample_advantages = data['obs'], data['actions'], data['masks'], data['advantages'] 221 | cpc_loss, _, cpc_regularize_loss = self.compute_contrastive_loss( 222 | obs, actions, masks, sample_advantages) 223 | 224 | self.cpc_optimizer.zero_grad() 225 | (cpc_loss + self.regularize_coef * cpc_regularize_loss).backward() 226 | 227 | nn.utils.clip_grad_norm_(chain(self.advantage_encoder.parameters(), 228 | self.input_seq_encoder.parameters(), 229 | self.state_encoder.parameters(), 230 | self.condition_encoder.parameters(), 231 | self.action_encoder.parameters()), 232 | self.max_grad_norm) 233 | 234 | self.cpc_optimizer.step() 235 | 236 | # weighted advantages 237 | if not self.use_memory_to_pred_weights: 238 | weighted_advantages = self.compute_weighted_advantages( 239 | rollouts.obs, rollouts.actions, rollouts.masks, advantages.detach()) 240 | else: 241 | data = self.lacie_buffer.sample_most_recent() 242 | obs, actions, masks, sample_advantages = data['obs'], data[ 243 | 'actions'], data['masks'], data['advantages'] 244 | weighted_advantages = self.compute_weighted_advantages( 245 | obs, actions, masks, sample_advantages, rollouts.actions.shape[1]) 246 | # normalize advantages 247 | # TODO: Conduct Ablation Study to verify if we should normalize the advantages or not 248 | weighted_advantages = (weighted_advantages - weighted_advantages.mean()) / ( 249 | weighted_advantages.std() + 1e-5) 250 | 251 | # --------------------------------------------------------------------------- 252 | # learn actor and critic 253 | 254 | value_loss_epoch = 0 255 | action_loss_epoch = 0 256 | dist_entropy_epoch = 0 257 | imitation_loss_epoch = 0 258 | accuracy_epoch = 0 259 | 260 | for e in range(self.ppo_epoch): 261 | if self.actor_critic.is_recurrent: 262 | data_generator = rollouts.recurrent_generator( 263 | weighted_advantages, self.num_mini_batch) 264 | else: 265 | data_generator = rollouts.feed_forward_generator( 266 | weighted_advantages, self.num_mini_batch) 267 | 268 | for sample in data_generator: 269 | obs_batch, recurrent_hidden_states_batch, actions_batch, \ 270 | value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, \ 271 | adv_targ = sample 272 | 273 | # Reshape to do in a single forward pass for all steps 274 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 275 | obs_batch, recurrent_hidden_states_batch, masks_batch, 276 | actions_batch) 277 | 278 | ratio = torch.exp(action_log_probs - 279 | old_action_log_probs_batch) 280 | surr1 = ratio * adv_targ 281 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 282 | 1.0 + self.clip_param) * adv_targ 283 | action_loss = -torch.min(surr1, surr2).mean() 284 | 285 | if self.use_clipped_value_loss: 286 | value_pred_clipped = value_preds_batch + \ 287 | (values - value_preds_batch).clamp(-self.clip_param, 288 | self.clip_param) 289 | value_losses = (values - return_batch).pow(2) 290 | value_losses_clipped = ( 291 | value_pred_clipped - return_batch).pow(2) 292 | value_loss = 0.5 * torch.max(value_losses, 293 | value_losses_clipped).mean() 294 | else: 295 | value_loss = 0.5 * (return_batch - values).pow(2).mean() 296 | 297 | # imitation learning 298 | imitation_loss, accuracy = torch.tensor( 299 | 0).to(action_loss.device), 0 300 | if self.expert: 301 | imitation_loss, accuracy = self.imitation_learning( 302 | rollouts.obs[:-1].view(-1, *obs_shape), 303 | 
rollouts.recurrent_hidden_states[0].view( 304 | -1, self.actor_critic.recurrent_hidden_state_size), 305 | rollouts.masks[:-1].view(-1, 1), 306 | self.expert) 307 | 308 | self.optimizer.zero_grad() 309 | (imitation_loss * self.il_coef * self.value_coef + action_loss - 310 | dist_entropy * self.entropy_coef).backward() 311 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 312 | self.max_grad_norm) 313 | self.optimizer.step() 314 | 315 | value_loss_epoch += value_loss.item() 316 | action_loss_epoch += action_loss.item() 317 | dist_entropy_epoch += dist_entropy.item() 318 | imitation_loss_epoch += imitation_loss.item() 319 | accuracy_epoch += accuracy 320 | 321 | num_updates = self.ppo_epoch * self.num_mini_batch 322 | 323 | value_loss_epoch /= num_updates 324 | action_loss_epoch /= num_updates 325 | dist_entropy_epoch /= num_updates 326 | imitation_loss_epoch /= num_updates 327 | accuracy_epoch /= num_updates 328 | 329 | self.after_update() 330 | 331 | return { 332 | "value loss": value_loss_epoch, 333 | "action loss": action_loss_epoch, 334 | "entropy loss": dist_entropy_epoch, 335 | "imitation loss": imitation_loss_epoch, 336 | "accuracy": accuracy_epoch, 337 | "contrastive loss": contrastive_loss_epoch, 338 | "contrastive accuracy": contrastive_accuracy_epoch, 339 | "regularization loss": regularize_loss_epoch 340 | } 341 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import numpy as np 4 | import torch 5 | import os.path as osp 6 | 7 | from collections import deque 8 | from core import algorithms, utils 9 | from core.agents import Policy 10 | from core.agents.heuristic.load_balance import ShortestProcessingTimeAgent, \ 11 | EarliestCompletionTimeAgent, LeastWorkAgent 12 | from core.arguments import get_args 13 | from core.envs import make_vec_envs 14 | from core.storage import RolloutStorage, LacieStorage 15 | from evaluation import evaluate 16 | from tensorboardX import SummaryWriter 17 | from utils.plot import plot 18 | 19 | 20 | def main(): 21 | args = get_args() 22 | 23 | torch.manual_seed(args.seed) 24 | torch.cuda.manual_seed_all(args.seed) 25 | 26 | if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: 27 | torch.backends.cudnn.benchmark = False 28 | torch.backends.cudnn.deterministic = True 29 | 30 | base_dir = osp.expanduser(args.log_dir) 31 | log_dir = osp.join(base_dir, 'train_log') 32 | eval_log_dir = osp.join(base_dir, "eval_log") 33 | tensorboard_dir = osp.join(base_dir, "tensorboard_log") 34 | 35 | utils.cleanup_log_dir(log_dir) 36 | utils.cleanup_log_dir(eval_log_dir) 37 | utils.cleanup_log_dir(tensorboard_dir) 38 | utils.dump_config(args, osp.join(base_dir, 'config.txt')) 39 | 40 | torch.set_num_threads(1) 41 | device = torch.device("cuda:0" if args.cuda else "cpu") 42 | writer = SummaryWriter(tensorboard_dir) 43 | 44 | # limited the number of steps for each episode 45 | # IMPORTANT: for load balance / spark-sim we automatically do this by setting 46 | # the number of stream jobs 47 | if not args.use_proper_time_limits: 48 | envs = make_vec_envs(env_name=args.env_name, 49 | seed=args.seed, 50 | num_processes=args.num_processes, 51 | log_dir=log_dir, 52 | device=device, 53 | allow_early_resets=False, 54 | args=args) 55 | else: 56 | envs = make_vec_envs(env_name=args.env_name, 57 | seed=args.seed, 58 | num_processes=args.num_processes, 59 | log_dir=log_dir, 60 | device=device, 61 | 
allow_early_resets=True, 62 | max_episode_steps=args.max_episode_steps, 63 | args=args) 64 | 65 | # create actor critic 66 | actor_critic = Policy( 67 | envs.observation_space.shape, 68 | envs.action_space, 69 | base_kwargs={'recurrent': args.recurrent_policy}) 70 | # if the resume directory is provided, then directly load that checkpoint 71 | if args.resume_dir is not None: 72 | print("=> Resuming from checkpoint: {}".format(args.resume_dir)) 73 | actor_critic = torch.load(args.resume_dir, map_location='cpu')[0] 74 | actor_critic.to(device) 75 | 76 | # expert for imitation learning 77 | if args.use_imitation_learning: 78 | expert = LeastWorkAgent() 79 | else: 80 | expert = None 81 | 82 | if args.algo == 'a2c': 83 | agent = algorithms.A2C_ACKTR( 84 | actor_critic, 85 | args.value_loss_coef, 86 | args.entropy_coef, 87 | lr=args.lr, 88 | eps=args.eps, 89 | alpha=args.alpha, 90 | max_grad_norm=args.max_grad_norm, 91 | expert=expert, 92 | il_coef=args.il_coef) 93 | elif args.algo == 'ppo': 94 | agent = algorithms.PPO( 95 | actor_critic, 96 | args.clip_param, 97 | args.ppo_epoch, 98 | args.num_mini_batch, 99 | args.value_loss_coef, 100 | args.entropy_coef, 101 | lr=args.lr, 102 | eps=args.eps, 103 | max_grad_norm=args.max_grad_norm, 104 | expert=expert, 105 | il_coef=args.il_coef) 106 | elif args.algo == 'acktr': 107 | agent = algorithms.A2C_ACKTR( 108 | actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) 109 | elif args.algo == 'mib_a2c': 110 | agent = algorithms.MIB_A2C( 111 | actor_critic, 112 | args.entropy_coef, 113 | lr=args.lr, 114 | adapt_lr=args.adapt_lr, 115 | num_inner_steps=args.num_inner_steps, 116 | max_grad_norm=args.max_grad_norm, 117 | expert=expert, 118 | il_coef=args.il_coef 119 | ) 120 | elif args.algo == 'mib_ppo': 121 | agent = algorithms.MIB_PPO( 122 | actor_critic=actor_critic, 123 | clip_param=args.clip_param, 124 | ppo_epoch=args.ppo_epoch, 125 | num_mini_batch=args.num_mini_batch, 126 | entropy_coef=args.entropy_coef, 127 | lr=args.lr, 128 | adapt_lr=args.adapt_lr, 129 | num_inner_steps=args.num_inner_steps, 130 | max_grad_norm=args.max_grad_norm, 131 | expert=expert, 132 | il_coef=args.il_coef 133 | ) 134 | elif args.algo == 'lacie_a2c': 135 | agent = algorithms.LACIE_A2C( 136 | actor_critic=actor_critic, 137 | value_coef=args.value_loss_coef, 138 | entropy_coef=args.entropy_coef, 139 | regularize_coef=args.regularize_coef, 140 | lr=args.lr, 141 | eps=args.eps, 142 | alpha=args.alpha, 143 | max_grad_norm=args.max_grad_norm, 144 | expert=expert, 145 | il_coef=args.il_coef, 146 | num_cpc_steps=args.lacie_num_iter, 147 | cpc_lr=args.cpc_lr 148 | ) 149 | elif args.algo == 'lacie_a2c_memory': 150 | lacie_buffer = LacieStorage(args.num_steps, 151 | envs.observation_space.shape, 152 | envs.action_space, 153 | max_size=args.lacie_buffer_size, 154 | batch_size=args.lacie_batch_size, 155 | n_processes=args.num_processes) 156 | lacie_buffer.to(device) 157 | agent = algorithms.LACIE_A2C_Memory( 158 | actor_critic=actor_critic, 159 | value_coef=args.value_loss_coef, 160 | entropy_coef=args.entropy_coef, 161 | regularize_coef=args.regularize_coef, 162 | lr=args.lr, 163 | eps=args.eps, 164 | alpha=args.alpha, 165 | max_grad_norm=args.max_grad_norm, 166 | expert=expert, 167 | il_coef=args.il_coef, 168 | num_cpc_steps=args.lacie_num_iter, 169 | lacie_batch_size=args.lacie_batch_size, 170 | lacie_buffer=lacie_buffer, 171 | use_memory_to_pred_weights=args.use_memory_to_pred_weights, 172 | cpc_lr=args.cpc_lr 173 | ) 174 | elif args.algo == 'lacie_ppo': 175 | agent = 
algorithms.LACIE_PPO( 176 | actor_critic, 177 | args.clip_param, 178 | args.ppo_epoch, 179 | args.num_mini_batch, 180 | args.value_loss_coef, 181 | args.entropy_coef, 182 | regularize_coef=args.regularize_coef, 183 | lr=args.lr, 184 | eps=args.eps, 185 | max_grad_norm=args.max_grad_norm, 186 | expert=expert, 187 | il_coef=args.il_coef, 188 | cpc_lr=args.cpc_lr) 189 | elif args.algo == 'lacie_ppo_memory': 190 | lacie_buffer = LacieStorage(args.num_steps, 191 | envs.observation_space.shape, 192 | envs.action_space, 193 | max_size=args.lacie_buffer_size, 194 | batch_size=args.lacie_batch_size, 195 | n_processes=args.num_processes) 196 | lacie_buffer.to(device) 197 | agent = algorithms.LACIE_PPO_Memory( 198 | actor_critic, 199 | args.clip_param, 200 | args.ppo_epoch, 201 | args.num_mini_batch, 202 | args.value_loss_coef, 203 | args.entropy_coef, 204 | regularize_coef=args.regularize_coef, 205 | lr=args.lr, 206 | eps=args.eps, 207 | max_grad_norm=args.max_grad_norm, 208 | expert=expert, 209 | il_coef=args.il_coef, 210 | num_cpc_steps=args.lacie_num_iter, 211 | lacie_batch_size=args.lacie_batch_size, 212 | lacie_buffer=lacie_buffer, 213 | use_memory_to_pred_weights=args.use_memory_to_pred_weights, 214 | cpc_lr=args.cpc_lr 215 | ) 216 | else: 217 | raise ValueError("Not Implemented algorithm...") 218 | 219 | rollouts = RolloutStorage(args.num_steps, args.num_processes, 220 | envs.observation_space.shape, envs.action_space, 221 | actor_critic.recurrent_hidden_state_size) 222 | 223 | obs = envs.reset() 224 | rollouts.obs[0].copy_(obs) 225 | rollouts.to(device) 226 | 227 | episode_rewards = deque(maxlen=10) 228 | 229 | start = time.time() 230 | 231 | num_updates = int( 232 | args.num_env_steps) // args.num_steps // args.num_processes 233 | 234 | # the gradient update interval to increase number of stream jobs 235 | curriculum_interval = int(num_updates / args.num_curriculum_time) 236 | 237 | for j in range(num_updates): 238 | random_seed = args.seed if args.fix_job_sequence else args.seed + j 239 | # if using load_balance environment: \ 240 | # we have to gradually increase number of stream jos 241 | # if (args.env_name == 'load_balance') and ((j + 1) % curriculum_interval) == 0: 242 | # args.num_stream_jobs = int( 243 | # args.num_stream_jobs * args.num_stream_jobs_factor) 244 | 245 | # # reconstruct environments to increase the number of stream jobs 246 | # # also alter the random seed 247 | # if not args.use_proper_time_limits: 248 | # envs = make_vec_envs(env_name=args.env_name, 249 | # seed=random_seed, 250 | # num_processes=args.num_processes, 251 | # log_dir=log_dir, 252 | # device=device, 253 | # allow_early_resets=False, 254 | # args=args) 255 | # else: 256 | # envs = make_vec_envs(env_name=args.env_name, 257 | # seed=random_seed, 258 | # num_processes=args.num_processes, 259 | # log_dir=log_dir, 260 | # device=device, 261 | # allow_early_resets=True, 262 | # max_episode_steps=args.max_episode_steps, 263 | # args=args) 264 | 265 | # print("Increase the number of stream jobs to " + 266 | # str(args.num_stream_jobs)) 267 | # obs = envs.reset() 268 | # rollouts.obs[0].copy_(obs) 269 | # rollouts.to(device) 270 | 271 | # decrease learning rate linearly 272 | if args.use_linear_lr_decay: 273 | cur_lr = utils.update_linear_schedule( 274 | agent.optimizer, j, num_updates, 275 | agent.optimizer.lr if args.algo == "acktr" else args.lr) 276 | if args.algo.startswith('lacie'): 277 | cur_lr = utils.update_linear_schedule( 278 | agent.cpc_optimizer, j, num_updates, args.cpc_lr 279 | ) 280 | else: 281 | 
cur_lr = agent.optimizer.param_groups[0]["lr"] 282 | 283 | # Rolling out, collecting and storing SARS (State, action, reward, new state) 284 | for step in range(args.num_steps): 285 | # Sample actions 286 | with torch.no_grad(): 287 | value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( 288 | rollouts.obs[step], rollouts.recurrent_hidden_states[step], 289 | rollouts.masks[step]) 290 | 291 | # Obser reward and next obs 292 | # TODO: park env does not support cuda tensor??? 293 | obs, reward, done, infos = envs.step(action.cpu()) 294 | for info in infos: 295 | if 'episode' in info.keys(): 296 | episode_rewards.append(info['episode']['r']) 297 | 298 | # If done then clean the history of observations. 299 | masks = torch.FloatTensor( 300 | [[0.0] if done_ else [1.0] for done_ in done]) 301 | bad_masks = torch.FloatTensor( 302 | [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) 303 | rollouts.insert(obs, recurrent_hidden_states, action, 304 | action_log_prob, value, reward, masks, bad_masks) 305 | 306 | with torch.no_grad(): 307 | next_value = actor_critic.get_value( 308 | rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], 309 | rollouts.masks[-1]).detach() 310 | 311 | rollouts.compute_returns(next_value, args.use_gae, args.gamma, 312 | args.gae_lambda, args.use_proper_time_limits) 313 | 314 | results = agent.update(rollouts) 315 | 316 | rollouts.after_update() 317 | 318 | # SAVE trained model 319 | if (j % args.save_interval == 0 320 | or j == num_updates - 1) and args.save_dir != "": 321 | save_path = os.path.join(args.save_dir, args.algo) 322 | try: 323 | os.makedirs(save_path) 324 | except OSError: 325 | pass 326 | 327 | torch.save([ 328 | actor_critic, 329 | getattr(utils.get_vec_normalize(envs), 'ob_rms', None) 330 | ], os.path.join(save_path, args.env_name + ".pt")) 331 | 332 | # LOG TRAINING results 333 | if j % args.log_interval == 0 and len(episode_rewards) > 1: 334 | total_num_steps = (j + 1) * args.num_processes * args.num_steps 335 | end = time.time() 336 | print("="*90) 337 | print("Updates {}, num timesteps {}, FPS {}, LR: {}" 338 | "\n=> Last {} training episodes: mean/median reward " 339 | "{:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}".format( 340 | j, total_num_steps, 341 | int(total_num_steps / (end - start)), 342 | cur_lr, 343 | len(episode_rewards), np.mean(episode_rewards), 344 | np.median(episode_rewards), np.min(episode_rewards), 345 | np.max(episode_rewards))) 346 | result_str = "=> " 347 | for k, v in results.items(): 348 | result_str = result_str + "{}: {:.2f} ".format(k, v) 349 | print(result_str) 350 | 351 | writer.add_scalar("train/reward", np.mean(episode_rewards), j) 352 | for k, v in results.items(): 353 | writer.add_scalar("train/"+k.replace(' ', '_'), v, j) 354 | 355 | plot(log_dir, 'load-balance', args.algo, 356 | args.num_env_steps) 357 | 358 | # EVALUATE performance of learned policy along with heuristic 359 | if (args.eval_interval is not None and len(episode_rewards) > 1 360 | and j % args.eval_interval == 0): 361 | # alter the random seed 362 | eval_results = evaluate(actor_critic, args.env_name, seed=args.seed, 363 | num_processes=args.num_processes, eval_log_dir=eval_log_dir, 364 | device=device, env_args=args) 365 | writer.add_scalars( 366 | 'eval/reward', 367 | {k: np.mean(v) for k, v in eval_results.items()}, 368 | j) 369 | # plot(eval_log_dir, 'load-balance', args.algo, 370 | # args.num_env_steps) 371 | 372 | writer.close() 373 | 374 | 375 | if __name__ == "__main__": 376 | main() 377 | 
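# ---------------------------------------------------------------------------
# A minimal sketch of the linear learning-rate decay invoked in the training
# loop above, assuming `utils.update_linear_schedule(optimizer, j, num_updates,
# initial_lr)` simply anneals the learning rate to zero over `num_updates`
# gradient updates and returns the current value. The authoritative helper
# lives in core/utils.py and may differ in detail.

def update_linear_schedule_sketch(optimizer, update_idx, total_num_updates, initial_lr):
    """Decay the learning rate linearly from `initial_lr` towards 0."""
    lr = initial_lr - (initial_lr * (update_idx / float(total_num_updates)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr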
--------------------------------------------------------------------------------
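Both `LACIE_PPO.update` and `LACIE_PPO_Memory.update` optimise the standard PPO clipped surrogate on contrastively re-weighted advantages, together with a clipped value-regression term. The snippet below is an illustrative, self-contained condensation of that per-minibatch computation; the tensor arguments and the default clip value are assumptions for the example, and the authoritative version is the code in core/algorithms/lacie/lacie_ppo.py.

import torch


def ppo_clipped_losses(action_log_probs, old_action_log_probs, adv_targ,
                       values, value_preds, returns, clip_param=0.2):
    """Clipped policy and value losses, mirroring the inner minibatch loop."""
    # Importance ratio between the current policy and the policy that collected the rollout.
    ratio = torch.exp(action_log_probs - old_action_log_probs)
    # Clipped surrogate objective; the leading minus sign turns maximisation into a loss.
    surr1 = ratio * adv_targ
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv_targ
    action_loss = -torch.min(surr1, surr2).mean()
    # Clipped value loss, as used when use_clipped_value_loss is True.
    value_pred_clipped = value_preds + (values - value_preds).clamp(-clip_param, clip_param)
    value_loss = 0.5 * torch.max((values - returns).pow(2),
                                 (value_pred_clipped - returns).pow(2)).mean()
    return action_loss, value_loss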