├── rl_algorithms ├── version ├── acer │ ├── __init__.py │ └── buffer.py ├── common │ ├── apex │ │ ├── __init__.py │ │ └── learner.py │ ├── abstract │ │ ├── __init__.py │ │ ├── architecture.py │ │ ├── reward_fn.py │ │ ├── buffer.py │ │ ├── her.py │ │ ├── learner.py │ │ └── distributed_worker.py │ ├── __init__.py │ ├── env │ │ ├── __init__.py │ │ ├── normalizers.py │ │ └── utils.py │ ├── buffer │ │ ├── __init__.py │ │ ├── gail_buffer.py │ │ ├── distillation_buffer.py │ │ └── segment_tree.py │ ├── networks │ │ ├── __init__.py │ │ └── backbones │ │ │ ├── __init__.py │ │ │ └── cnn.py │ ├── noise.py │ ├── saliency_map.py │ └── grad_cam.py ├── a2c │ ├── __init__.py │ └── learner.py ├── bc │ ├── __init__.py │ ├── ddpg_learner.py │ ├── sac_learner.py │ └── her.py ├── ddpg │ └── __init__.py ├── dqn │ ├── __init__.py │ ├── distributed_logger.py │ ├── linear.py │ └── distributed_worker.py ├── fd │ ├── __init__.py │ ├── dqn_learner.py │ ├── ddpg_learner.py │ └── sac_learner.py ├── gail │ ├── __init__.py │ ├── utils.py │ └── networks.py ├── ppo │ ├── __init__.py │ └── utils.py ├── sac │ └── __init__.py ├── td3 │ └── __init__.py ├── recurrent │ └── __init__.py ├── distillation │ ├── __init__.py │ └── README.md ├── utils │ ├── __init__.py │ ├── config.py │ └── registry.py ├── registry.py └── __init__.py ├── configs ├── reacher_v2 │ ├── __init__.py │ ├── td3.yaml │ ├── ddpg.yaml │ ├── bc_ddpg.yaml │ ├── sac.yaml │ └── bc_sac.yaml ├── lunarlander_v2 │ ├── __init__.py │ ├── acer.yaml │ ├── ppo.yaml │ ├── dqn.yaml │ ├── r2d1.yaml │ ├── distillation_dqn.yaml │ └── dqfd.yaml ├── pong_no_frameskip_v4 │ ├── __init__.py │ ├── ppo.yaml │ ├── r2d1.yaml │ ├── dqn.yaml │ ├── dqn_resnet.yaml │ ├── apex_dqn.yaml │ └── distillation_dqn.yaml └── lunarlander_continuous_v2 │ ├── __init__.py │ ├── a2c.yaml │ ├── td3.yaml │ ├── ddpg.yaml │ ├── ppo.yaml │ ├── bc_ddpg.yaml │ ├── ddpgfd.yaml │ ├── sac.yaml │ ├── bc_sac.yaml │ ├── gail_ppo.yaml │ └── sacfd.yaml ├── MANIFEST.in ├── tools ├── run_test.sh ├── check_version.sh ├── run_reacher_v2.sh ├── run_descrete_env.sh └── run_lunarlander_continuous_v2.sh ├── data ├── reacher_demo.pkl ├── lunarlander_discrete_demo.pkl └── lunarlander_continuous_demo.pkl ├── mypy.ini ├── .isort.cfg ├── .gitignore ├── .flake8 ├── .pre-commit-config.yaml ├── .github ├── CODEOWNERS └── workflows │ └── python-publish.yaml ├── requirements.txt ├── Jenkinsfile ├── requirements-dev.txt ├── Makefile ├── Dockerfile ├── LICENSE.md ├── setup.py ├── tests ├── buffer │ ├── test_uniform_buffer.py │ ├── test_distillation_buffer.py │ └── test_prioritized_buffer.py ├── test_helper_funcion.py ├── integration │ ├── test_run_apex.py │ ├── test_run_agent.py │ └── test_run_distillation_agent.py ├── test_config_registry.py └── test_cnn_cfg.py ├── .all-contributorsrc ├── run_lunarlander_v2.py ├── run_lunarlander_continuous_v2.py ├── run_reacher_v2.py └── run_pong_no_frameskip_v4.py /rl_algorithms/version: -------------------------------------------------------------------------------- 1 | 1.2.0 -------------------------------------------------------------------------------- /rl_algorithms/acer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_algorithms/common/apex/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_algorithms/common/abstract/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /configs/reacher_v2/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/bc/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/ddpg/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/dqn/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/fd/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/gail/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/sac/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/td3/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /configs/lunarlander_v2/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/common/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/common/env/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/recurrent/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include */version -------------------------------------------------------------------------------- /configs/pong_no_frameskip_v4/__init__.py: -------------------------------------------------------------------------------- 1 | 
"""Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/common/buffer/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/common/networks/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/distillation/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /tools/run_test.sh: -------------------------------------------------------------------------------- 1 | sh ./tools/run_lunarlander_continuous_v2.sh 2 | sh ./tools/run_descrete_env.sh -------------------------------------------------------------------------------- /data/reacher_demo.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/medipixel/rl_algorithms/HEAD/data/reacher_demo.pkl -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | # Global options: 2 | 3 | [mypy] 4 | python_version = 3.6 5 | ignore_missing_imports = True 6 | -------------------------------------------------------------------------------- /data/lunarlander_discrete_demo.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/medipixel/rl_algorithms/HEAD/data/lunarlander_discrete_demo.pkl -------------------------------------------------------------------------------- /data/lunarlander_continuous_demo.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/medipixel/rl_algorithms/HEAD/data/lunarlander_continuous_demo.pkl -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | multi_line_output=3 3 | include_trailing_comma=True 4 | force_grid_wrap=0 5 | combine_as_imports=True 6 | line_length=88 7 | force_sort_within_sections=True 8 | known_third_party=wandb, ray 9 | -------------------------------------------------------------------------------- /rl_algorithms/common/networks/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from rl_algorithms.common.networks.backbones.cnn import CNN 2 | from rl_algorithms.common.networks.backbones.resnet import ResNet 3 | 4 | __all__ = [ 5 | "CNN", 6 | "ResNet", 7 | ] 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | wandb* 3 | checkpoint 4 | .idea 5 | .mypy_cache 6 | .vscode 7 | MUJOCO_LOG.TXT 8 | .coverage 9 | 10 | # build 11 | build 12 | dist 13 | *.egg-info 14 | 15 | # data 16 | data/distillation_buffer 17 | data/saliency_map/ 18 | 
-------------------------------------------------------------------------------- /rl_algorithms/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import YamlConfig 2 | from .registry import Registry, build_from_cfg, build_ray_obj_from_cfg 3 | 4 | __all__ = [ 5 | "Registry", 6 | "build_from_cfg", 7 | "build_ray_obj_from_cfg", 8 | "YamlConfig", 9 | ] 10 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | # This is an example .flake8 config, used when developing *Black* itself. 2 | # Keep in sync with setup.cfg which is used for source packages. 3 | 4 | [flake8] 5 | ignore = E203, E266, E501, W503 6 | max-line-length = 88 7 | max-complexity = 18 8 | select = C,E,F,W,B,B950 9 | -------------------------------------------------------------------------------- /rl_algorithms/gail/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def compute_gail_reward(discriminator_score: torch.Tensor): 5 | """Compute gail(imitation) reward of data generated by policy.""" 6 | return ( 7 | -torch.log(torch.sigmoid(discriminator_score) + 1e-8).detach().cpu().numpy()[0] 8 | ) 9 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: format 5 | name: format 6 | language: system 7 | entry: make format 8 | types: [python] 9 | - id: test 10 | name: test 11 | language: system 12 | entry: make test 13 | types: [python] 14 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Lines starting with '#' are comments. 2 | # Reference: https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners 3 | # Each line is a file pattern followed by one or more owners. 4 | 5 | # These owners will be the default owners for everything in the repo. 6 | * @isk03276 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.18.0 2 | torch==1.6.0 3 | gym==0.17.3 4 | atari-py==0.2.6 5 | box2d-py==2.3.8 6 | scipy==1.5.4 7 | opencv-python==4.4.0.46 8 | addict==2.4.0 9 | tqdm 10 | 11 | # for distributed learning 12 | redis==3.5.3 # for ray 13 | ray==1.3.0 14 | pyzmq==20.0.0 15 | pyarrow==3.0.0 16 | 17 | # for log 18 | six>=1.13.0 19 | wandb==0.10.11 20 | matplotlib==3.3.3 21 | plotly==4.13.0 22 | -------------------------------------------------------------------------------- /Jenkinsfile: -------------------------------------------------------------------------------- 1 | pipeline { 2 | agent { 3 | dockerfile{ 4 | filename "Dockerfile" 5 | args "-v /home/mpadmin/.ssh/:/root/.ssh/" 6 | } 7 | } 8 | stages { 9 | stage('Test') { 10 | steps { 11 | echo 'Testing...' 
12 | sh 'make jenkins-dev' 13 | sh 'make test' 14 | sh 'make integration-test' 15 | } 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pre-commit 2 | 3 | # formatting 4 | black == 21.6b0 5 | isort == 5.6.4 6 | 7 | # testing 8 | pylint == 2.6.0 9 | mypy == 0.761 10 | flake8 == 3.8.4 11 | flake8-bugbear == 20.11.1 12 | flake8-docstrings == 1.5.0 13 | pluggy == 0.13.1 14 | pytest == 6.1.2 15 | pytest-pylint == 0.18.0 16 | pytest-flake8 == 1.0.6 17 | pytest-mypy == 0.4.2 18 | pytest-cov == 2.10.1 19 | 20 | # for jenkins 21 | setuptools==40.3.0 -------------------------------------------------------------------------------- /tools/check_version.sh: -------------------------------------------------------------------------------- 1 | TAG=$1 2 | TAG=`echo $TAG | sed -e "s/v//g"` 3 | 4 | PACKAGE_NAME="rl_algorithms" 5 | 6 | echo "A tag triggered build. $TAG" 7 | 8 | VERSION=`cat "$PACKAGE_NAME"/version` 9 | echo "The version is $VERSION" 10 | 11 | if [ "$VERSION" = "$TAG" ]; then 12 | echo "Version is correct! Deploy to local server." 13 | 14 | else 15 | echo "Tag and version are not same. Check again." 16 | exit 1 17 | fi; 18 | -------------------------------------------------------------------------------- /tools/run_reacher_v2.sh: -------------------------------------------------------------------------------- 1 | python run_reacher_v2.py --cfg-path ./configs/reacher_v2/ddpg.yaml --off-render --log 2 | python run_reacher_v2.py --cfg-path ./configs/reacher_v2/sac.yaml --off-render --log 3 | python run_reacher_v2.py --cfg-path ./configs/reacher_v2/td3.yaml --off-render --log 4 | python run_reacher_v2.py --cfg-path ./configs/reacher_v2/bc_ddpg.yaml --off-render --log 5 | python run_reacher_v2.py --cfg-path ./configs/reacher_v2/bc_sac.yaml --off-render --log -------------------------------------------------------------------------------- /rl_algorithms/common/abstract/architecture.py: -------------------------------------------------------------------------------- 1 | """Abstract class for distributed architectures. 2 | 3 | - Author: Chris Yoon 4 | - Contact: chris.yoon@medipixel.io 5 | """ 6 | 7 | from abc import ABC, abstractmethod 8 | 9 | 10 | class Architecture(ABC): 11 | """Abstract class for distributed architectures""" 12 | 13 | @abstractmethod 14 | def _spawn(self): 15 | pass 16 | 17 | @abstractmethod 18 | def train(self): 19 | pass 20 | 21 | @abstractmethod 22 | def test(self): 23 | pass 24 | -------------------------------------------------------------------------------- /rl_algorithms/common/abstract/reward_fn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Abstract class for computing reward. 3 | 4 | - Author: Kyunghwan Kim 5 | - Contact: kh.kim@medipixel.io 6 | """ 7 | from abc import ABC, abstractmethod 8 | 9 | import numpy as np 10 | 11 | 12 | class RewardFn(ABC): 13 | """Abstract class for computing reward. 
14 | New compute_reward class should redefine __call__() 15 | 16 | """ 17 | 18 | @abstractmethod 19 | def __call__(self, transition: tuple, goal_state: np.ndarray) -> np.float64: 20 | pass 21 | -------------------------------------------------------------------------------- /configs/lunarlander_v2/acer.yaml: -------------------------------------------------------------------------------- 1 | type: ACERAgent 2 | hyper_params: 3 | gamma: 0.98 4 | c: 1 5 | buffer_size: 5000 6 | n_rollout: 10 7 | replay_ratio: 16 8 | start_from: 100 9 | gradient_clip: 10 10 | tau: 0.005 11 | 12 | learner_cfg: 13 | type: ACERLearner 14 | backbone: 15 | actor: 16 | critic: 17 | shared_actor_critic: 18 | head: 19 | actor: 20 | type: MLP 21 | configs: 22 | hidden_sizes: [256, 256] 23 | output_activation: identity 24 | critic: 25 | type: MLP 26 | configs: 27 | hidden_sizes: [256, 256] 28 | output_activation: identity 29 | optim_cfg: 30 | lr: 0.0002 31 | weight_decay: 0.0 32 | adam_eps: 0.00000001 33 | trust_region: 34 | use_trust_region: true 35 | delta: 1 36 | -------------------------------------------------------------------------------- /tools/run_descrete_env.sh: -------------------------------------------------------------------------------- 1 | python run_lunarlander_v2.py --cfg-path ./configs/lunarlander_v2/dqn.yaml --off-render --log 2 | python run_lunarlander_v2.py --cfg-path ./configs/lunarlander_v2/dqfd.yaml --off-render --log 3 | python run_lunarlander_v2.py --cfg-path ./configs/lunarlander_v2/r2d1.yaml --off-render --log 4 | 5 | python run_pong_no_frameskip_v4.py --cfg-path ./configs/pong_no_frameskip_v4/dqn.yaml --off-render --log 6 | python run_pong_no_frameskip_v4.py --cfg-path ./configs/pong_no_frameskip_v4/r2d1.yaml --off-render --log 7 | python run_pong_no_frameskip_v4.py --cfg-path configs/pong_no_frameskip_v4/apex_dqn.yaml --off-render --log 8 | 9 | python run_pong_no_frameskip_v4.py --cfg-path ./configs/pong_no_frameskip_v4/dqn_resnet.yaml --off-render --log -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/a2c.yaml: -------------------------------------------------------------------------------- 1 | type: "A2CAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | w_entropy: 0.001 5 | gradient_clip_ac: 0.1 6 | gradient_clip_cr: 0.5 7 | 8 | learner_cfg: 9 | type: "A2CLearner" 10 | backbone: 11 | actor: 12 | critic: 13 | shared_actor_critic: 14 | head: 15 | actor: 16 | type: "GaussianDist" 17 | configs: 18 | hidden_sizes: [256, 256] 19 | output_activation: "identity" 20 | fixed_logstd: True 21 | critic: 22 | type: "MLP" 23 | configs: 24 | hidden_sizes: [256, 256] 25 | output_size: 1 26 | output_activation: "identity" 27 | optim_cfg: 28 | lr_actor: 0.00004 29 | lr_critic: 0.0003 30 | weight_decay: 0.0 31 | -------------------------------------------------------------------------------- /configs/reacher_v2/td3.yaml: -------------------------------------------------------------------------------- 1 | type: "TD3Agent" 2 | hyper_params: 3 | gamma: 0.95 4 | tau: 0.005 5 | buffer_size: 100000 6 | batch_size: 100 7 | initial_random_action: 10000 8 | policy_update_freq: 2 9 | 10 | learner_cfg: 11 | type: "TD3Learner" 12 | backbone: 13 | actor: 14 | critic: 15 | head: 16 | actor: 17 | type: "MLP" 18 | configs: 19 | hidden_sizes: [400, 300] 20 | output_activation: "tanh" 21 | critic: 22 | type: "MLP" 23 | configs: 24 | hidden_sizes: [400, 300] 25 | output_size: 1 26 | output_activation: "identity" 27 | optim_cfg: 28 | lr_actor: 0.001 29 | lr_critic: 
0.001 30 | weight_decay: 0.0 31 | 32 | noise_cfg: 33 | exploration_noise: 0.1 34 | target_policy_noise: 0.2 35 | target_policy_noise_clip: 0.5 36 | -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/td3.yaml: -------------------------------------------------------------------------------- 1 | type: "TD3Agent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 100000 6 | batch_size: 100 7 | initial_random_action: 10000 8 | policy_update_freq: 2 9 | 10 | learner_cfg: 11 | type: "TD3Learner" 12 | backbone: 13 | actor: 14 | critic: 15 | shared_actor_critic: 16 | head: 17 | actor: 18 | type: "MLP" 19 | configs: 20 | hidden_sizes: [400, 300] 21 | output_activation: "tanh" 22 | critic: 23 | type: "MLP" 24 | configs: 25 | hidden_sizes: [400, 300] 26 | output_size: 1 27 | output_activation: "identity" 28 | optim_cfg: 29 | lr_actor: 0.001 30 | lr_critic: 0.001 31 | weight_decay: 0.0 32 | 33 | noise_cfg: 34 | exploration_noise: 0.1 35 | target_policy_noise: 0.2 36 | target_policy_noise_clip: 0.5 37 | -------------------------------------------------------------------------------- /configs/reacher_v2/ddpg.yaml: -------------------------------------------------------------------------------- 1 | type: "DDPGAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.001 5 | buffer_size: 10000 6 | batch_size: 128 7 | initial_random_action: 10000 8 | multiple_update: 1 # multiple learning updates 9 | gradient_clip_ac: 0.5 10 | gradient_clip_cr: 1.0 11 | 12 | learner_cfg: 13 | type: "DDPGLearner" 14 | backbone: 15 | actor: 16 | critic: 17 | head: 18 | actor: 19 | type: "MLP" 20 | configs: 21 | hidden_sizes: [256, 256] 22 | output_activation: "tanh" 23 | critic: 24 | type: "MLP" 25 | configs: 26 | hidden_sizes: [256, 256] 27 | output_size: 1 28 | output_activation: "identity" 29 | optim_cfg: 30 | lr_actor: 0.001 31 | lr_critic: 0.001 32 | weight_decay: 0.000001 33 | 34 | noise_cfg: 35 | ou_noise_theta: 0.0 36 | ou_noise_sigma: 0.0 37 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | format: 2 | black . --exclude checkpoint wandb 3 | isort . --skip checkpoint --skip wandb --skip data 4 | 5 | test: 6 | black . --check 7 | isort . --check --diff --skip checkpoint --skip wandb --skip data 8 | env PYTHONPATH=. pytest --pylint --flake8 --cov=tests --ignore=checkpoint --ignore=data --ignore=wandb --ignore tests/integration 9 | 10 | integration-test: 11 | env PYTHONPATH=. pytest tests/integration --cov=tests 12 | 13 | docker-push: 14 | docker build -t medipixel/rl_algorithms . 
15 | docker push medipixel/rl_algorithms 16 | 17 | dev: 18 | pip install -U -r requirements.txt 19 | pip install -U -r requirements-dev.txt 20 | pre-commit install 21 | python setup.py develop 22 | 23 | dep: 24 | pip install -U -r requirements.txt 25 | python setup.py install 26 | 27 | jenkins-dev: 28 | pip install -U -r requirements-dev.txt 29 | python setup.py develop -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/ddpg.yaml: -------------------------------------------------------------------------------- 1 | type: "DDPGAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 10000 6 | batch_size: 64 7 | initial_random_action: 10000 8 | multiple_update: 1 # multiple learning updates 9 | gradient_clip_ac: 0.5 10 | gradient_clip_cr: 1.0 11 | 12 | learner_cfg: 13 | type: "DDPGLearner" 14 | backbone: 15 | actor: 16 | critic: 17 | shared_actor_critic: 18 | head: 19 | actor: 20 | type: "MLP" 21 | configs: 22 | hidden_sizes: [256, 256] 23 | output_activation: "tanh" 24 | critic: 25 | type: "MLP" 26 | configs: 27 | hidden_sizes: [256, 256] 28 | output_size: 1 29 | output_activation: "identity" 30 | optim_cfg: 31 | lr_actor: 0.0003 32 | lr_critic: 0.0003 33 | weight_decay: 0.000001 34 | 35 | noise_cfg: 36 | ou_noise_theta: 0.0 37 | ou_noise_sigma: 0.0 38 | -------------------------------------------------------------------------------- /configs/lunarlander_v2/ppo.yaml: -------------------------------------------------------------------------------- 1 | type: "PPOAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.95 5 | batch_size: 32 6 | max_epsilon: 0.2 7 | min_epsilon: 0.2 8 | epsilon_decay_period: 1500 9 | w_value: 1.0 10 | w_entropy: 0.001 11 | gradient_clip_ac: 0.5 12 | gradient_clip_cr: 1.0 13 | epoch: 16 14 | rollout_len: 256 15 | n_workers: 12 16 | use_clipped_value_loss: False 17 | standardize_advantage: True 18 | 19 | learner_cfg: 20 | type: "PPOLearner" 21 | backbone: 22 | actor: 23 | critic: 24 | shared_actor_critic: 25 | head: 26 | actor: 27 | type: "CategoricalDist" 28 | configs: 29 | hidden_sizes: [256, 256] 30 | output_activation: "identity" 31 | critic: 32 | type: "MLP" 33 | configs: 34 | hidden_sizes: [256, 256] 35 | output_size: 1 36 | output_activation: "identity" 37 | optim_cfg: 38 | lr_actor: 0.0003 39 | lr_critic: 0.001 40 | weight_decay: 0.0 41 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | 3 | RUN rm /etc/apt/sources.list.d/cuda.list 4 | RUN rm /etc/apt/sources.list.d/nvidia-ml.list 5 | 6 | RUN apt-get update 7 | RUN apt-get install -y software-properties-common vim 8 | RUN apt-get install -y libsm6 libxext6 libxrender-dev libusb-1.0-0-dev && apt-get update 9 | RUN apt-get install -y git 10 | RUN apt-get install -y python3-pip python3-dev \ 11 | && cd /usr/local/bin \ 12 | && ln -s /usr/bin/python3 python \ 13 | && pip3 install --upgrade pip 14 | 15 | # set workspace 16 | RUN mkdir /workspace/ 17 | WORKDIR /workspace 18 | 19 | COPY requirements.txt /workspace/requirements.txt 20 | RUN pip install -U Cython numpy 21 | RUN pip install -U -r requirements.txt 22 | 23 | # set cuda path 24 | ENV CUDA_HOME /usr/local/cuda 25 | ENV PATH "/usr/local/cuda/bin:$PATH" 26 | ENV LD_LIBRARY_PATH "$LD_LIBRARY_PATH:/usr/local/cuda/lib64" 27 | ENV LIBRARY_PATH "$LIBRARY_PATH:/usr/local/cuda/lib64" 28 | 29 | RUN apt-get 
update && apt-get install -y libgl1-mesa-glx -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/ppo.yaml: -------------------------------------------------------------------------------- 1 | type: "PPOAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.95 5 | batch_size: 32 6 | max_epsilon: 0.2 7 | min_epsilon: 0.2 8 | epsilon_decay_period: 1500 9 | w_value: 1.0 10 | w_entropy: 0.001 11 | gradient_clip_ac: 0.5 12 | gradient_clip_cr: 1.0 13 | epoch: 16 14 | rollout_len: 256 15 | n_workers: 12 16 | use_clipped_value_loss: False 17 | standardize_advantage: True 18 | 19 | learner_cfg: 20 | type: "PPOLearner" 21 | backbone: 22 | actor: 23 | critic: 24 | shared_actor_critic: 25 | head: 26 | actor: 27 | type: "GaussianDist" 28 | configs: 29 | hidden_sizes: [256, 256] 30 | hidden_activation: "tanh" 31 | output_activation: "identity" 32 | fixed_logstd: True 33 | critic: 34 | type: "MLP" 35 | configs: 36 | hidden_sizes: [256, 256] 37 | output_size: 1 38 | output_activation: "identity" 39 | optim_cfg: 40 | lr_actor: 0.0003 41 | lr_critic: 0.001 42 | weight_decay: 0.0 43 | -------------------------------------------------------------------------------- /configs/reacher_v2/bc_ddpg.yaml: -------------------------------------------------------------------------------- 1 | type: "BCDDPGAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.001 5 | buffer_size: 100000 6 | batch_size: 512 7 | initial_random_action: 10000 8 | multiple_update: 1 # multiple learning updates 9 | gradient_clip_ac: 0.5 10 | gradient_clip_cr: 1.0 11 | # BC 12 | demo_batch_size: 64 13 | lambda1: 0.001 14 | demo_path: "data/reacher_demo.pkl" 15 | # HER 16 | use_her: False 17 | her: 18 | type: ReacherHER 19 | success_score: -5.0 20 | desired_states_from_demo: False 21 | 22 | learner_cfg: 23 | type: "BCDDPGLearner" 24 | backbone: 25 | actor: 26 | critic: 27 | head: 28 | actor: 29 | type: "MLP" 30 | configs: 31 | hidden_sizes: [256, 256] 32 | output_activation: "tanh" 33 | critic: 34 | type: "MLP" 35 | configs: 36 | hidden_sizes: [256, 256] 37 | output_size: 1 38 | output_activation: "identity" 39 | optim_cfg: 40 | lr_actor: 0.0001 41 | lr_critic: 0.001 42 | weight_decay: 0.000001 43 | 44 | noise_cfg: 45 | ou_noise_theta: 0.0 46 | ou_noise_sigma: 0.0 47 | -------------------------------------------------------------------------------- /configs/lunarlander_v2/dqn.yaml: -------------------------------------------------------------------------------- 1 | type: "DQNAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 100000 # openai baselines: 10000 6 | batch_size: 64 # openai baselines: 32 7 | update_starts_from: 10000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 1 # in openai baselines, train_freq = 4 10 | gradient_clip: 10.0 # dueling: 10.0 11 | n_step: 3 12 | w_n_step: 1.0 13 | w_q_reg: 0.0000001 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.000001 17 | max_epsilon: 1.0 18 | min_epsilon: 0.01 # openai baselines: 0.01 19 | epsilon_decay: 0.00001 # openai baselines: 1e-7 / 1e-1 20 | 21 | learner_cfg: 22 | type: "DQNLearner" 23 | loss_type: 24 | type: "C51Loss" 25 | backbone: 26 | head: 27 | type: "C51DuelingMLP" 28 | configs: 29 | hidden_sizes: [128, 64] 30 | v_min: -300 31 | v_max: 300 32 | atom_size: 1530 33 | output_activation: "identity" 34 | use_noisy_net: False 35 | optim_cfg: 36 | lr_dqn: 0.0001 37 | weight_decay: 0.0000001 38 | adam_eps: 0.00000001 39 | 
-------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 Medipixel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/bc_ddpg.yaml: -------------------------------------------------------------------------------- 1 | type: "BCDDPGAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.001 5 | buffer_size: 100000 6 | batch_size: 512 7 | initial_random_action: 10000 8 | multiple_update: 1 # multiple learning updates 9 | gradient_clip_ac: 0.5 10 | gradient_clip_cr: 0.5 11 | # BC 12 | demo_batch_size: 64 13 | lambda1: 0.001 14 | demo_path: "data/lunarlander_continuous_demo.pkl" 15 | # HER 16 | use_her: False 17 | her: 18 | type: LunarLanderContinuousHER 19 | success_score: 250.0 20 | desired_states_from_demo: True 21 | 22 | learner_cfg: 23 | type: "BCDDPGLearner" 24 | backbone: 25 | actor: 26 | critic: 27 | shared_actor_critic: 28 | head: 29 | actor: 30 | type: "MLP" 31 | configs: 32 | hidden_sizes: [256, 256] 33 | output_activation: "tanh" 34 | critic: 35 | type: "MLP" 36 | configs: 37 | hidden_sizes: [256, 256] 38 | output_size: 1 39 | output_activation: "identity" 40 | optim_cfg: 41 | lr_actor: 0.0001 42 | lr_critic: 0.001 43 | weight_decay: 0.0001 44 | 45 | noise_cfg: 46 | ou_noise_theta: 0.0 47 | ou_noise_sigma: 0.0 48 | -------------------------------------------------------------------------------- /configs/reacher_v2/sac.yaml: -------------------------------------------------------------------------------- 1 | type: "SACAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 100000 6 | batch_size: 512 7 | initial_random_action: 20000 8 | multiple_update: 1 # multiple learning updates 9 | policy_update_freq: 2 10 | w_entropy: 0.001 11 | w_mean_reg: 0.001 12 | w_std_reg: 0.001 13 | w_pre_activation_reg: 0.0 14 | auto_entropy_tuning: True 15 | 16 | learner_cfg: 17 | type: "SACLearner" 18 | backbone: 19 | actor: 20 | critic_vf: 21 | critic_qf: 22 | head: 23 | actor: 24 | type: "TanhGaussianDistParams" 25 | configs: 26 | hidden_sizes: [256, 256] 27 | output_activation: "identity" 28 | critic_vf: 29 | type: "MLP" 30 | configs: 31 | hidden_sizes: [256, 256] 32 | output_size: 1 33 | output_activation: "identity" 34 | critic_qf: 35 | type: "MLP" 36 | configs: 37 
| hidden_sizes: [256, 256] 38 | output_size: 1 39 | output_activation: "identity" 40 | optim_cfg: 41 | lr_actor: 0.0003 42 | lr_vf: 0.0003 43 | lr_qf1: 0.0003 44 | lr_qf2: 0.0003 45 | lr_entropy: 0.0003 46 | weight_decay: 0.0 47 | -------------------------------------------------------------------------------- /configs/pong_no_frameskip_v4/ppo.yaml: -------------------------------------------------------------------------------- 1 | type: "PPOAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.95 5 | batch_size: 32 6 | max_epsilon: 0.2 7 | min_epsilon: 0.2 8 | epsilon_decay_period: 1500 9 | w_value: 1.0 10 | w_entropy: 0.001 11 | gradient_clip_ac: 0.5 12 | gradient_clip_cr: 1.0 13 | epoch: 16 14 | rollout_len: 256 15 | n_workers: 4 16 | use_clipped_value_loss: False 17 | standardize_advantage: True 18 | 19 | learner_cfg: 20 | type: "PPOLearner" 21 | backbone: 22 | actor: 23 | critic: 24 | shared_actor_critic: 25 | type: "CNN" 26 | configs: 27 | input_sizes: [4, 32, 64] 28 | output_sizes: [32, 64, 64] 29 | kernel_sizes: [8, 4, 3] 30 | strides: [4, 2, 1] 31 | paddings: [1, 0, 0] 32 | head: 33 | actor: 34 | type: "CategoricalDist" 35 | configs: 36 | hidden_sizes: [512] 37 | output_activation: "identity" 38 | critic: 39 | type: "MLP" 40 | configs: 41 | hidden_sizes: [512] 42 | output_size: 1 43 | output_activation: "identity" 44 | optim_cfg: 45 | lr_actor: 0.0003 46 | lr_critic: 0.001 47 | weight_decay: 0.0 48 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yaml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload package to local pypi server 5 | 6 | on: 7 | push: 8 | tags: 9 | - "v*" 10 | 11 | jobs: 12 | deploy: 13 | 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up Python 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: '3.x' 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install setuptools wheel twine 26 | - name: Check version 27 | run: | 28 | sh tools/check_version.sh ${GITHUB_REF#refs/*/} 29 | - name: Build and publish 30 | env: 31 | TWINE_REPOSITORY_URL: ${{ secrets.PYPI_REPOSITORY_URL }} 32 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 33 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 34 | run: | 35 | python setup.py sdist 36 | twine upload dist/* 37 | -------------------------------------------------------------------------------- /tools/run_lunarlander_continuous_v2.sh: -------------------------------------------------------------------------------- 1 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/a2c.yaml --off-render --log 2 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/ddpg.yaml --off-render --log 3 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/td3.yaml --off-render --log 4 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/sac.yaml --off-render --log 5 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/ppo.yaml --off-render --log 6 | python run_lunarlander_continuous_v2.py --cfg-path 
./configs/lunarlander_continuous_v2/bc_ddpg.yaml --off-render --log 7 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/bc_sac.yaml --off-render --log 8 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/per_ddpg.yaml --off-render --log 9 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/ddpgfd.yaml --off-render --log 10 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/sacfd.yaml --off-render --log -------------------------------------------------------------------------------- /rl_algorithms/common/env/normalizers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Collection of normalizers. 3 | 4 | - Author: Curt Park 5 | - Contact: curt.park@medipixel.io 6 | """ 7 | 8 | import gym 9 | import numpy as np 10 | 11 | 12 | class ActionNormalizer(gym.ActionWrapper): 13 | """Rescale and relocate the actions.""" 14 | 15 | def action(self, action: np.ndarray) -> np.ndarray: 16 | """Change the range (-1, 1) to (low, high).""" 17 | low = self.action_space.low 18 | high = self.action_space.high 19 | 20 | scale_factor = (high - low) / 2 21 | reloc_factor = high - scale_factor 22 | 23 | action = action * scale_factor + reloc_factor 24 | action = np.clip(action, low, high) 25 | 26 | return action 27 | 28 | def reverse_action(self, action: np.ndarray) -> np.ndarray: 29 | """Change the range (low, high) to (-1, 1).""" 30 | low = self.action_space.low 31 | high = self.action_space.high 32 | 33 | scale_factor = (high - low) / 2 34 | reloc_factor = high - scale_factor 35 | 36 | action = (action - reloc_factor) / scale_factor 37 | action = np.clip(action, -1.0, 1.0) 38 | 39 | return action 40 | -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/ddpgfd.yaml: -------------------------------------------------------------------------------- 1 | type: "DDPGfDAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 10000 6 | batch_size: 64 7 | initial_random_action: 10000 8 | multiple_update: 1 # multiple learning updates 9 | gradient_clip_ac: 0.5 10 | gradient_clip_cr: 1.0 11 | # fD 12 | per_alpha: 0.3 13 | per_beta: 1.0 14 | per_eps: 0.000006 15 | per_eps_demo: 1.0 16 | n_step: 1 17 | pretrain_step: 5000 18 | lambda1: 1.0 # N-step return weight 19 | # lambda2 = weight_decay 20 | lambda3: 1.0 # actor loss contribution of prior weight 21 | demo_path: "data/lunarlander_continuous_demo.pkl" 22 | 23 | learner_cfg: 24 | type: "DDPGfDLearner" 25 | backbone: 26 | actor: 27 | critic: 28 | shared_actor_critic: 29 | head: 30 | actor: 31 | type: "MLP" 32 | configs: 33 | hidden_sizes: [256, 256] 34 | output_activation: "tanh" 35 | critic: 36 | type: "MLP" 37 | configs: 38 | hidden_sizes: [256, 256] 39 | output_size: 1 40 | output_activation: "identity" 41 | optim_cfg: 42 | lr_actor: 0.0003 43 | lr_critic: 0.0003 44 | weight_decay: 0.0001 45 | 46 | noise_cfg: 47 | ou_noise_theta: 0.0 48 | ou_noise_sigma: 0.0 49 | -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/sac.yaml: -------------------------------------------------------------------------------- 1 | type: "SACAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 100000 6 | batch_size: 128 7 | initial_random_action: 5000 8 | multiple_update: 1 # multiple learning updates 9 | 
policy_update_freq: 2 10 | w_entropy: 0.001 11 | w_mean_reg: 0.0 12 | w_std_reg: 0.0 13 | w_pre_activation_reg: 0.0 14 | auto_entropy_tuning: True 15 | 16 | learner_cfg: 17 | type: "SACLearner" 18 | backbone: 19 | actor: 20 | critic_vf: 21 | critic_qf: 22 | shared_actor_critic: 23 | head: 24 | actor: 25 | type: "TanhGaussianDistParams" 26 | configs: 27 | hidden_sizes: [256, 256] 28 | output_activation: "identity" 29 | fixed_logstd: False 30 | critic_vf: 31 | type: "MLP" 32 | configs: 33 | hidden_sizes: [256, 256] 34 | output_size: 1 35 | output_activation: "identity" 36 | critic_qf: 37 | type: "MLP" 38 | configs: 39 | hidden_sizes: [256, 256] 40 | output_size: 1 41 | output_activation: "identity" 42 | optim_cfg: 43 | lr_actor: 0.0003 44 | lr_vf: 0.0003 45 | lr_qf1: 0.0003 46 | lr_qf2: 0.0003 47 | lr_entropy: 0.0003 48 | weight_decay: 0.0 49 | -------------------------------------------------------------------------------- /configs/lunarlander_v2/r2d1.yaml: -------------------------------------------------------------------------------- 1 | type: "R2D1Agent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 10000 # openai baselines: 10000 6 | batch_size: 64 # openai baselines: 32 7 | update_starts_from: 1000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 1 # in openai baselines, train_freq = 4 10 | gradient_clip: 10.0 # dueling: 10.0 11 | n_step: 3 12 | w_n_step: 1.0 13 | w_q_reg: 0.0 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.000001 17 | max_epsilon: 1.0 18 | min_epsilon: 0.01 # openai baselines: 0.01 19 | epsilon_decay: 0.00002 # openai baselines: 1e-7 / 1e-1 20 | # R2D1 21 | sequence_size: 32 22 | overlap_size: 16 23 | 24 | learner_cfg: 25 | type: "R2D1Learner" 26 | loss_type: 27 | type: "R2D1C51Loss" 28 | backbone: 29 | gru: 30 | rnn_hidden_size: 64 31 | burn_in_step: 16 32 | head: 33 | type: "C51DuelingMLP" 34 | configs: 35 | hidden_sizes: [128, 64] 36 | v_min: -300 37 | v_max: 300 38 | atom_size: 51 39 | output_activation: "identity" 40 | use_noisy_net: False 41 | optim_cfg: 42 | lr_dqn: 0.0001 43 | weight_decay: 0.0000001 44 | adam_eps: 0.00000001 45 | -------------------------------------------------------------------------------- /configs/lunarlander_v2/distillation_dqn.yaml: -------------------------------------------------------------------------------- 1 | type: "DistillationDQNAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 100000 # openai baselines: 10000 6 | batch_size: 64 # openai baselines: 32 7 | update_starts_from: 10000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 1 # in openai baselines, train_freq = 4 10 | gradient_clip: 10.0 # dueling: 10.0 11 | n_step: 3 12 | w_n_step: 1.0 13 | w_q_reg: 0.01 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.000001 17 | max_epsilon: 1.0 18 | min_epsilon: 0.01 # openai baselines: 0.01 19 | epsilon_decay: 0.00001 # openai baselines: 1e-7 / 1e-1 20 | # Distillation 21 | dataset_path: [] 22 | save_dir: "data/" 23 | epochs: 20 24 | n_frame_from_last: 50000 25 | is_student: False 26 | 27 | learner_cfg: 28 | type: "DQNLearner" 29 | loss_type: 30 | type: "C51Loss" 31 | backbone: 32 | head: 33 | type: "C51DuelingMLP" 34 | configs: 35 | hidden_sizes: [128, 64] 36 | v_min: -300 37 | v_max: 300 38 | atom_size: 1530 39 | output_activation: "identity" 40 | use_noisy_net: False 41 | optim_cfg: 42 | lr_dqn: 0.0001 43 | weight_decay: 0.0000001 44 | 
adam_eps: 0.00000001 45 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | with open("./requirements.txt", "r") as f: 7 | required = f.read().splitlines() 8 | 9 | version_file = "rl_algorithms/version" 10 | 11 | 12 | def get_version(): 13 | version = open(version_file, "r", encoding="utf-8").read().strip() 14 | return version 15 | 16 | 17 | setup( 18 | name="rl_algorithms", 19 | version=get_version(), 20 | author="medipixel", 21 | author_email="kh.kim@medipixel.io", 22 | description="Reinforcement Learning algorithms which are being used for research \ 23 | activities at Medipixel.", 24 | long_description=long_description, 25 | long_description_content_type="text/markdown", 26 | url="https://github.com/medipixel/rl_algorithms.git", 27 | keywords="reinforcement-learning python machine learning", 28 | packages=find_packages(), 29 | classifiers=[ 30 | "Programming Language :: Python :: 3", 31 | "Programming Language :: Python :: 3.6", 32 | "License :: OSI Approved :: MIT License", 33 | "Operating System :: OS Independent", 34 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 35 | ], 36 | python_requires=">=3.6", 37 | install_requires=required, 38 | include_package_data=True, 39 | zip_safe=False, 40 | ) 41 | -------------------------------------------------------------------------------- /configs/lunarlander_v2/dqfd.yaml: -------------------------------------------------------------------------------- 1 | type: "DQfDAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 100000 # openai baselines: 10000 6 | batch_size: 64 # openai baselines: 32 7 | update_starts_from: 10000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 8 # in openai baselines, train_freq = 4 10 | gradient_clip: 0.5 # dueling: 10.0 11 | n_step: 3 12 | w_n_step: 1.0 13 | w_q_reg: 0.0000001 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.001 17 | # fD 18 | per_eps_demo: 1.0 19 | lambda1: 1.0 # N-step return weight 20 | lambda2: 1.0 # Supervised loss weight 21 | # lambda3 = weight_decay (l2 regularization weight) 22 | margin: 0.8 23 | pretrain_step: 100 24 | max_epsilon: 1.0 25 | min_epsilon: 0.0 # openai baselines: 0.01 26 | epsilon_decay: 0.00002 # openai baselines: 1e-7 / 1e-1 27 | demo_path: "data/lunarlander_discrete_demo.pkl" 28 | 29 | learner_cfg: 30 | type: "DQfDLearner" 31 | loss_type: 32 | type: "C51Loss" 33 | backbone: 34 | head: 35 | type: "C51DuelingMLP" 36 | configs: 37 | hidden_sizes: [128, 64] 38 | v_min: -300 39 | v_max: 300 40 | atom_size: 1530 41 | output_activation: "identity" 42 | use_noisy_net: False 43 | optim_cfg: 44 | lr_dqn: 0.0001 45 | weight_decay: 0.00001 46 | adam_eps: 0.00000001 47 | -------------------------------------------------------------------------------- /configs/reacher_v2/bc_sac.yaml: -------------------------------------------------------------------------------- 1 | type: "BCSACAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 1000000 6 | batch_size: 512 7 | initial_random_action: 10000 8 | multiple_update: 1 # multiple learning updates 9 | policy_update_freq: 2 10 | w_entropy: 0.001 11 | w_mean_reg: 0.001 12 | w_std_reg: 0.001 13 | w_pre_activation_reg: 0.0 14 | auto_entropy_tuning: True 
15 | # BC 16 | demo_batch_size: 64 17 | lambda1: 0.001 18 | demo_path: "data/reacher_demo.pkl" 19 | # HER 20 | use_her: True 21 | her: 22 | type: ReacherHER 23 | success_score: -5.0 24 | desired_states_from_demo: False 25 | 26 | learner_cfg: 27 | type: "BCSACLearner" 28 | backbone: 29 | actor: 30 | critic_vf: 31 | critic_qf: 32 | head: 33 | actor: 34 | type: "TanhGaussianDistParams" 35 | configs: 36 | hidden_sizes: [256, 256] 37 | output_activation: "identity" 38 | critic_vf: 39 | type: "MLP" 40 | configs: 41 | hidden_sizes: [256, 256] 42 | output_size: 1 43 | output_activation: "identity" 44 | critic_qf: 45 | type: "MLP" 46 | configs: 47 | hidden_sizes: [256, 256] 48 | output_size: 1 49 | output_activation: "identity" 50 | optim_cfg: 51 | lr_actor: 0.0003 52 | lr_vf: 0.0003 53 | lr_qf1: 0.0003 54 | lr_qf2: 0.0003 55 | lr_entropy: 0.0003 56 | weight_decay: 0.0 57 | -------------------------------------------------------------------------------- /rl_algorithms/common/abstract/buffer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Abstract Buffer & BufferWrapper class. 3 | 4 | - Author: Euijin Jeong 5 | - Contact: euijin.jeong@medipixel.io 6 | """ 7 | 8 | from abc import ABC, abstractmethod 9 | from typing import Any, Tuple 10 | 11 | import numpy as np 12 | 13 | 14 | class BaseBuffer(ABC): 15 | """Abstract Buffer used for replay buffer.""" 16 | 17 | @abstractmethod 18 | def add(self, transition: Tuple[Any, ...]) -> Tuple[Any, ...]: 19 | pass 20 | 21 | @abstractmethod 22 | def sample(self) -> Tuple[np.ndarray, ...]: 23 | pass 24 | 25 | @abstractmethod 26 | def __len__(self) -> int: 27 | pass 28 | 29 | 30 | class BufferWrapper(BaseBuffer): 31 | """Abstract BufferWrapper used for buffer wrapper. 32 | 33 | Attributes: 34 | buffer (Buffer): Hold replay buffer as am attribute 35 | """ 36 | 37 | def __init__(self, base_buffer: BaseBuffer): 38 | """Initialize a ReplayBuffer object. 
39 | 40 | Args: 41 | base_buffer (int): ReplayBuffer which should be hold 42 | """ 43 | self.buffer = base_buffer 44 | 45 | def add(self, transition: Tuple[Any, ...]) -> Tuple[Any, ...]: 46 | return self.buffer.add(transition) 47 | 48 | def sample(self) -> Tuple[np.ndarray, ...]: 49 | return self.buffer.sample() 50 | 51 | def __len__(self) -> int: 52 | """Return the current size of internal memory.""" 53 | return len(self.buffer) 54 | -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/bc_sac.yaml: -------------------------------------------------------------------------------- 1 | type: "BCSACAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 1000000 6 | batch_size: 512 7 | initial_random_action: 10000 8 | multiple_update: 1 # multiple learning updates 9 | policy_update_freq: 2 10 | w_entropy: 0.001 11 | w_mean_reg: 0.001 12 | w_std_reg: 0.001 13 | w_pre_activation_reg: 0.0 14 | auto_entropy_tuning: True 15 | # BC 16 | demo_batch_size: 64 17 | lambda1: 0.001 18 | demo_path: "data/lunarlander_continuous_demo.pkl" 19 | # HER 20 | use_her: False 21 | her: 22 | type: LunarLanderContinuousHER 23 | success_score: 250.0 24 | desired_states_from_demo: True 25 | 26 | learner_cfg: 27 | type: "BCSACLearner" 28 | backbone: 29 | actor: 30 | critic_vf: 31 | critic_qf: 32 | shared_actor_critic: 33 | head: 34 | actor: 35 | type: "TanhGaussianDistParams" 36 | configs: 37 | hidden_sizes: [256, 256] 38 | output_activation: "identity" 39 | fixed_logstd: False 40 | critic_vf: 41 | type: "MLP" 42 | configs: 43 | hidden_sizes: [256, 256] 44 | output_size: 1 45 | output_activation: "identity" 46 | critic_qf: 47 | type: "MLP" 48 | configs: 49 | hidden_sizes: [256, 256] 50 | output_size: 1 51 | output_activation: "identity" 52 | optim_cfg: 53 | lr_actor: 0.0003 54 | lr_vf: 0.0003 55 | lr_qf1: 0.0003 56 | lr_qf2: 0.0003 57 | lr_entropy: 0.0003 58 | weight_decay: 0.0 59 | -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/gail_ppo.yaml: -------------------------------------------------------------------------------- 1 | type: "GAILPPOAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.95 5 | batch_size: 128 6 | max_epsilon: 0.2 7 | min_epsilon: 0.2 8 | epsilon_decay_period: 1500 9 | w_value: 1.0 10 | w_entropy: 0.001 11 | gradient_clip_ac: 0.5 12 | gradient_clip_cr: 1.0 13 | epoch: 10 14 | rollout_len: 1024 15 | n_workers: 4 16 | use_clipped_value_loss: False 17 | standardize_advantage: True 18 | gail_reward_weight: 1.0 19 | demo_path: "data/lunarlander_continuous_demo.pkl" 20 | 21 | learner_cfg: 22 | type: "GAILPPOLearner" 23 | backbone: 24 | actor: 25 | critic: 26 | discriminator: 27 | shared_actor_critic: 28 | head: 29 | actor: 30 | type: "GaussianDist" 31 | configs: 32 | hidden_sizes: [256, 256] 33 | output_activation: "identity" 34 | fixed_logstd: True 35 | critic: 36 | type: "MLP" 37 | configs: 38 | hidden_sizes: [256, 256] 39 | output_size: 1 40 | output_activation: "identity" 41 | discriminator: 42 | type: "MLP" 43 | configs: 44 | hidden_sizes: [256, 256] 45 | output_size: 1 46 | output_activation: "identity" 47 | aciton_embedder: 48 | type: "MLP" 49 | configs: 50 | hidden_sizes: [] 51 | output_size: 16 52 | output_activation: "identity" 53 | 54 | optim_cfg: 55 | lr_actor: 0.0003 56 | lr_critic: 0.001 57 | lr_discriminator: 0.0003 58 | weight_decay: 0.0 59 | discriminator_acc_threshold : 0.8 
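
Editor's note: the GAIL PPO config above relies on the imitation reward defined in rl_algorithms/gail/utils.py earlier in this listing, reward = -log(sigmoid(score) + 1e-8). The quick numeric check below copies that function verbatim; the example scores are invented for illustration, and which sign the trained discriminator assigns to expert-like data depends on how it is trained, which is not shown here. The config's gail_reward_weight presumably scales this term against the environment reward.

# Numeric check of the GAIL reward from rl_algorithms/gail/utils.py (shown above).
import torch

def compute_gail_reward(discriminator_score: torch.Tensor):
    return -torch.log(torch.sigmoid(discriminator_score) + 1e-8).detach().cpu().numpy()[0]

for score in (-5.0, 0.0, 5.0):
    r = compute_gail_reward(torch.tensor([score]))
    print(f"score={score:+.1f} -> imitation reward={r:.3f}")
# score=-5.0 -> ~5.007, score=0.0 -> ~0.693, score=+5.0 -> ~0.007
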
-------------------------------------------------------------------------------- /configs/pong_no_frameskip_v4/r2d1.yaml: -------------------------------------------------------------------------------- 1 | type: "R2D1Agent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 4000 # openai baselines: 10000 6 | batch_size: 32 # openai baselines: 32 7 | update_starts_from: 4000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 4 # in openai baselines, train_freq = 4 10 | gradient_clip: 10.0 # dueling: 10.0 11 | n_step: 5 12 | w_n_step: 1.0 13 | w_q_reg: 0.0 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.000001 17 | max_epsilon: 1.0 18 | min_epsilon: 0.01 # openai baselines: 0.01 19 | epsilon_decay: 0.000003 # openai baselines: 1e-7 / 1e-1 20 | # Grad_cam 21 | grad_cam_layer_list: 22 | - "backbone.cnn.cnn_0.cnn" 23 | - "backbone.cnn.cnn_1.cnn" 24 | - "backbone.cnn.cnn_2.cnn" 25 | # R2D1 26 | sequence_size: 20 27 | overlap_size: 10 28 | 29 | learner_cfg: 30 | type: "R2D1Learner" 31 | loss_type: 32 | type: "R2D1DQNLoss" 33 | backbone: 34 | type: "CNN" 35 | configs: 36 | input_sizes: [4, 32, 64] 37 | output_sizes: [32, 64, 64] 38 | kernel_sizes: [8, 4, 3] 39 | strides: [4, 2, 1] 40 | paddings: [1, 0, 0] 41 | gru: 42 | rnn_hidden_size: 512 43 | burn_in_step: 10 44 | head: 45 | type: "DuelingMLP" 46 | configs: 47 | hidden_sizes: [512] 48 | output_activation: "identity" 49 | # NoisyNet 50 | use_noisy_net: False 51 | optim_cfg: 52 | lr_dqn: 0.0001 53 | weight_decay: 0.0 54 | adam_eps: 0.00000001 55 | -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/sacfd.yaml: -------------------------------------------------------------------------------- 1 | type: "SACfDAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.001 5 | buffer_size: 100000 6 | batch_size: 64 7 | initial_random_action: 5000 8 | multiple_update: 2 # multiple learning updates 9 | policy_update_freq: 2 10 | w_entropy: 0.001 11 | w_mean_reg: 0.001 12 | w_std_reg: 0.001 13 | w_pre_activation_reg: 0.0 14 | auto_entropy_tuning: True 15 | # fD 16 | per_alpha: 0.6 17 | per_beta: 0.4 18 | per_eps: 0.000001 19 | per_eps_demo: 1.0 20 | n_step: 3 21 | pretrain_step: 100 22 | lambda1: 1.0 # N-step return weight 23 | # lambda2 = weight_decay 24 | lambda3: 1.0 # actor loss contribution of prior weight 25 | demo_path: "data/lunarlander_continuous_demo.pkl" 26 | 27 | learner_cfg: 28 | type: "SACfDLearner" 29 | backbone: 30 | actor: 31 | critic_vf: 32 | critic_qf: 33 | shared_actor_critic: 34 | head: 35 | actor: 36 | type: "TanhGaussianDistParams" 37 | configs: 38 | hidden_sizes: [256, 256] 39 | output_activation: "identity" 40 | fixed_logstd: False 41 | critic_vf: 42 | type: "MLP" 43 | configs: 44 | hidden_sizes: [256, 256] 45 | output_size: 1 46 | output_activation: "identity" 47 | critic_qf: 48 | type: "MLP" 49 | configs: 50 | hidden_sizes: [256, 256] 51 | output_size: 1 52 | output_activation: "identity" 53 | optim_cfg: 54 | lr_actor: 0.0003 55 | lr_vf: 0.0003 56 | lr_qf1: 0.0003 57 | lr_qf2: 0.0003 58 | lr_entropy: 0.0003 59 | weight_decay: 0.00001 60 | -------------------------------------------------------------------------------- /configs/pong_no_frameskip_v4/dqn.yaml: -------------------------------------------------------------------------------- 1 | type: "DQNAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 10000 # openai baselines: 10000 6 | batch_size: 32 # openai 
baselines: 32 7 | update_starts_from: 10000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 4 # in openai baselines, train_freq = 4 10 | gradient_clip: 10.0 # dueling: 10.0 11 | n_step: 3 12 | w_n_step: 1.0 13 | w_q_reg: 0.0 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.000001 17 | max_epsilon: 0.0 18 | min_epsilon: 0.0 # openai baselines: 0.01 19 | epsilon_decay: 0.000001 # openai baselines: 1e-7 / 1e-1 20 | # Grad_cam 21 | grad_cam_layer_list: 22 | - "backbone.cnn.cnn_0.cnn" 23 | - "backbone.cnn.cnn_1.cnn" 24 | - "backbone.cnn.cnn_2.cnn" 25 | 26 | learner_cfg: 27 | type: "DQNLearner" 28 | loss_type: 29 | type: "IQNLoss" 30 | backbone: 31 | type: "CNN" 32 | configs: 33 | input_sizes: [4, 32, 64] 34 | output_sizes: [32, 64, 64] 35 | kernel_sizes: [8, 4, 3] 36 | strides: [4, 2, 1] 37 | paddings: [1, 0, 0] 38 | head: 39 | type: "IQNMLP" 40 | configs: 41 | hidden_sizes: [512] 42 | n_tau_samples: 64 43 | n_tau_prime_samples: 64 44 | n_quantile_samples: 32 45 | quantile_embedding_dim: 64 46 | kappa: 1.0 47 | output_activation: "identity" 48 | # NoisyNet 49 | use_noisy_net: True 50 | std_init: 0.5 51 | optim_cfg: 52 | lr_dqn: 0.0001 53 | weight_decay: 0.0 54 | adam_eps: 0.00000001 55 | -------------------------------------------------------------------------------- /tests/buffer/test_uniform_buffer.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | from scipy.stats import chisquare 5 | 6 | from rl_algorithms.common.buffer.replay_buffer import ReplayBuffer 7 | 8 | 9 | def generate_transition(idx: int) -> Tuple[np.ndarray, ...]: 10 | """Make dummy transition for testing buffer.""" 11 | obs = np.array([0]) 12 | act = np.array([0]) 13 | reward = idx 14 | next_obs = np.array([0]) 15 | done = False 16 | return (obs, act, reward, next_obs, done) 17 | 18 | 19 | def generate_sample_idx(buffer: ReplayBuffer) -> int: 20 | """Generate indices to test whether sampled uniformly or not.""" 21 | for i in range(buffer.max_len): 22 | buffer.add(generate_transition(i)) 23 | _, _, idx, _, _ = buffer.sample() 24 | return idx 25 | 26 | 27 | def check_uniform(lst: List) -> bool: 28 | """Check the distribution is Uniform Distribution.""" 29 | res = chisquare(lst) 30 | return res[1] >= 0.05 31 | 32 | 33 | def test_uniform_sample(buffer_length=32, batch_size=8): 34 | """Test whether transitions are uniformly sampled from replay buffer.""" 35 | 36 | n_repeat = 10000 37 | 38 | buffer = ReplayBuffer(max_len=buffer_length, batch_size=batch_size) 39 | 40 | sampled_lst = [0] * buffer.max_len 41 | # sampling index for the n_repeat times 42 | for _ in range(n_repeat): 43 | indices = generate_sample_idx(buffer) 44 | for idx in indices: 45 | sampled_lst[int(idx)] += 1 / n_repeat 46 | 47 | assert check_uniform(sampled_lst), "This distribution is not uniform." 48 | 49 | 50 | if __name__ == "__main__": 51 | test_uniform_sample() 52 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Utility functions for PPO. 3 | 4 | This module has PPO util functions. 
5 | 6 | - Author: Curt Park 7 | - Contact: curt.park@medipixel.io 8 | - Paper: https://arxiv.org/abs/1707.06347 9 | """ 10 | 11 | from collections import deque 12 | from typing import List 13 | 14 | import numpy as np 15 | import torch 16 | 17 | 18 | def compute_gae( 19 | next_value: list, 20 | rewards: list, 21 | masks: list, 22 | values: list, 23 | gamma: float = 0.99, 24 | tau: float = 0.95, 25 | ) -> List: 26 | """Compute gae.""" 27 | values = values + [next_value] 28 | gae = 0 29 | returns: deque = deque() 30 | 31 | for step in reversed(range(len(rewards))): 32 | delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step] 33 | gae = delta + gamma * tau * masks[step] * gae 34 | returns.appendleft(gae + values[step]) 35 | 36 | return list(returns) 37 | 38 | 39 | def ppo_iter( 40 | epoch: int, 41 | mini_batch_size: int, 42 | states: torch.Tensor, 43 | actions: torch.Tensor, 44 | values: torch.Tensor, 45 | log_probs: torch.Tensor, 46 | returns: torch.Tensor, 47 | advantages: torch.Tensor, 48 | ): 49 | """Yield mini-batches.""" 50 | batch_size = states.size(0) 51 | for ep in range(epoch): 52 | for _ in range(batch_size // mini_batch_size): 53 | rand_ids = np.random.choice(batch_size, mini_batch_size) 54 | yield states[rand_ids, :], actions[rand_ids, :], values[ 55 | rand_ids, : 56 | ], log_probs[rand_ids, :], returns[rand_ids, :], advantages[rand_ids, :], ep 57 | -------------------------------------------------------------------------------- /rl_algorithms/common/env/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Util functions for env. 3 | 4 | - Author: Curt Park 5 | - Contact: curt.park@medipixel.io 6 | """ 7 | 8 | from typing import Callable, List, Tuple 9 | 10 | import gym 11 | from gym.spaces import Discrete 12 | 13 | from rl_algorithms.common.env.multiprocessing_env import SubprocVecEnv 14 | from rl_algorithms.common.env.normalizers import ActionNormalizer 15 | 16 | 17 | def set_env( 18 | env: gym.Env, max_episode_steps: int, env_wrappers: List[gym.Wrapper] = None 19 | ) -> Tuple[gym.Env, int]: 20 | """Set environment according to user's config.""" 21 | if max_episode_steps > 0: 22 | env._max_episode_steps = max_episode_steps 23 | else: 24 | max_episode_steps = env._max_episode_steps 25 | 26 | if not isinstance(env.action_space, Discrete): 27 | env = ActionNormalizer(env) 28 | 29 | if env_wrappers: 30 | for env_wrapper in env_wrappers: 31 | env = env_wrapper(env) 32 | 33 | return env, max_episode_steps 34 | 35 | 36 | def env_generator( 37 | env_name: str, max_episode_steps: int, env_wrappers: List[gym.Wrapper] = None 38 | ) -> Callable: 39 | """Return env creating function (with normalizers).""" 40 | 41 | def _thunk(rank: int): 42 | env = gym.make(env_name) 43 | env.seed(777 + rank + 1) 44 | env, _ = set_env(env, max_episode_steps, env_wrappers) 45 | return env 46 | 47 | return _thunk 48 | 49 | 50 | def make_envs(env_gen: Callable, n_envs: int = 8) -> SubprocVecEnv: 51 | """Make multiple environments running on multiprocssors.""" 52 | envs = [env_gen(i) for i in range(n_envs)] 53 | subproc_env = SubprocVecEnv(envs) 54 | return subproc_env 55 | -------------------------------------------------------------------------------- /configs/pong_no_frameskip_v4/dqn_resnet.yaml: -------------------------------------------------------------------------------- 1 | type: "DQNAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 10000 # openai baselines: 10000 6 | batch_size: 16 # 
openai baselines: 32 7 | update_starts_from: 10000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 4 # in openai baselines, train_freq = 4 10 | gradient_clip: 10.0 # dueling: 10.0 11 | n_step: 3 12 | w_n_step: 1.0 13 | w_q_reg: 0.0 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.000001 17 | max_epsilon: 0.0 18 | min_epsilon: 0.0 # openai baselines: 0.01 19 | epsilon_decay: 0.000001 # openai baselines: 1e-7 / 1e-1 20 | # Grad_cam 21 | grad_cam_layer_list: 22 | - "backbone.layer1.0.conv2" 23 | - "backbone.layer2.0.shortcut.0" 24 | - "backbone.layer3.0.shortcut.0" 25 | - "backbone.layer4.0.shortcut.0" 26 | - "backbone.conv_out" 27 | 28 | learner_cfg: 29 | type: "DQNLearner" 30 | loss_type: 31 | type: "IQNLoss" 32 | backbone: 33 | type: "ResNet" 34 | configs: 35 | use_bottleneck: False 36 | num_blocks: [1, 1, 1, 1] 37 | block_output_sizes: [32, 32, 64, 64] 38 | block_strides: [1, 2, 2, 2] 39 | first_input_size: 4 40 | first_output_size: 32 41 | expansion: 1 42 | channel_compression: 4 # compression ratio 43 | head: 44 | type: "IQNMLP" 45 | configs: 46 | hidden_sizes: [512] 47 | n_tau_samples: 64 48 | n_tau_prime_samples: 64 49 | n_quantile_samples: 32 50 | quantile_embedding_dim: 64 51 | kappa: 1.0 52 | output_activation: "identity" 53 | # NoisyNet 54 | use_noisy_net: True 55 | std_init: 0.5 56 | optim_cfg: 57 | lr_dqn: 0.0001 58 | weight_decay: 0.0 59 | adam_eps: 0.00000001 60 | -------------------------------------------------------------------------------- /tests/test_helper_funcion.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import random 3 | from typing import Deque 4 | 5 | import numpy as np 6 | 7 | from rl_algorithms.common.helper_functions import get_n_step_info 8 | 9 | 10 | def generate_dummy_buffer(maxlen: int, index: int) -> Deque: 11 | """Generate dummy n_step buffer.""" 12 | assert index <= maxlen 13 | n_step_buffer = deque(maxlen=maxlen) 14 | for i in range(maxlen): 15 | done = i == index 16 | transition = (np.array([i]), np.array([0]), i, np.array([i + 1]), done) 17 | n_step_buffer.append(transition) 18 | return n_step_buffer 19 | 20 | 21 | def check_case1(maxlen: int): 22 | """Test when the transition is terminal state.""" 23 | done_index = 0 24 | n_step_buffer = generate_dummy_buffer(maxlen, done_index) 25 | reward, next_state, _ = get_n_step_info(n_step_buffer, gamma=1) 26 | assert reward == done_index 27 | assert next_state == done_index + 1 28 | 29 | 30 | def check_case2(maxlen: int): 31 | """Test when there are no terminal within n_step.""" 32 | done_index = maxlen 33 | n_step_buffer = generate_dummy_buffer(maxlen, done_index) 34 | reward, next_state, _ = get_n_step_info(n_step_buffer, gamma=1) 35 | assert reward * 2 == maxlen * (maxlen - 1) 36 | assert next_state == maxlen 37 | 38 | 39 | def check_case3(maxlen: int): 40 | """Test when the terminal states exist within n_step.""" 41 | done_index = random.randint(1, maxlen - 1) 42 | n_step_buffer = generate_dummy_buffer(maxlen, done_index) 43 | reward, next_state, _ = get_n_step_info(n_step_buffer, gamma=1) 44 | assert reward * 2 == done_index * (done_index + 1) 45 | assert next_state == done_index + 1 46 | 47 | 48 | def test_get_n_step_info(maxlen=10): 49 | check_case1(maxlen) 50 | check_case2(maxlen) 51 | check_case3(maxlen) 52 | 53 | 54 | if __name__ == "__main__": 55 | test_get_n_step_info(maxlen=10) 56 | 
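The three test cases above pin down the behavior of `get_n_step_info`: starting from the oldest transition, rewards are accumulated with discount `gamma` until the first terminal transition, and that transition's `next_state`/`done` are returned. The actual helper lives in `rl_algorithms/common/helper_functions.py`, which is not shown here, so the following is only an illustrative re-implementation consistent with the assertions above, not the library source:

```python
from typing import Deque, Tuple

import numpy as np


def n_step_info_sketch(n_step_buffer: Deque, gamma: float) -> Tuple[float, np.ndarray, bool]:
    """Accumulate discounted rewards until the first terminal transition and
    return that transition's (n_step_reward, next_state, done)."""
    reward, next_state, done = 0.0, None, False
    for i, (_, _, rew, n_state, dn) in enumerate(n_step_buffer):
        reward += (gamma ** i) * rew
        next_state, done = n_state, dn
        if done:
            break
    return reward, next_state, done
```

With `gamma=1` this reproduces the arithmetic-series sums asserted in `check_case1`–`check_case3` above.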
-------------------------------------------------------------------------------- /rl_algorithms/common/buffer/gail_buffer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Demo buffer for GAIL algorithm.""" 3 | 4 | import pickle 5 | from typing import List, Tuple 6 | 7 | import numpy as np 8 | import torch 9 | 10 | from rl_algorithms.common.abstract.buffer import BaseBuffer 11 | 12 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 13 | 14 | 15 | class GAILBuffer(BaseBuffer): 16 | """Buffer to store expert states and actions. 17 | 18 | Attributes: 19 | obs_buf (np.ndarray): observations 20 | acts_buf (np.ndarray): actions 21 | """ 22 | 23 | def __init__(self, dataset_path: str): 24 | """Initialize a Buffer. 25 | 26 | Args: 27 | dataset_path (str): path of the demo dataset 28 | """ 29 | 30 | self.obs_buf: np.ndarray = None 31 | self.acts_buf: np.ndarray = None 32 | 33 | self.load_demo(dataset_path) 34 | 35 | def load_demo(self, dataset_path: str): 36 | """load demo data.""" 37 | with open(dataset_path, "rb") as f: 38 | demo = list(pickle.load(f)) 39 | demo = np.array(demo) 40 | self.obs_buf = np.array(list(map(np.array, demo[:, 0]))) 41 | self.acts_buf = np.array(list(map(np.array, demo[:, 1]))) 42 | 43 | def add(self): 44 | pass 45 | 46 | def sample(self, batch_size, indices: List[int] = None) -> Tuple[np.ndarray, ...]: 47 | """Randomly sample a batch of experiences from memory.""" 48 | assert 0 < batch_size < len(self) 49 | 50 | if indices is None: 51 | indices = np.random.choice(len(self), size=batch_size) 52 | 53 | states = self.obs_buf[indices] 54 | actions = self.acts_buf[indices] 55 | 56 | return torch.Tensor(states).to(device), torch.Tensor(actions).to(device) 57 | 58 | def __len__(self) -> int: 59 | """Return the current size of internal memory.""" 60 | return len(self.obs_buf) 61 | -------------------------------------------------------------------------------- /tests/integration/test_run_apex.py: -------------------------------------------------------------------------------- 1 | """Test only one step of run file for training.""" 2 | 3 | import os 4 | import os.path as osp 5 | import re 6 | import shutil 7 | import subprocess 8 | 9 | 10 | def check_run_apex(config_root: str, run_file: str): 11 | """Test that 1 episode of run file works well.""" 12 | test_dir = osp.dirname(osp.abspath(__file__)) 13 | pkg_root_dir = osp.dirname(osp.dirname(test_dir)) 14 | os.chdir(pkg_root_dir) 15 | 16 | # loop of configs 17 | configs = os.listdir(config_root) 18 | for cfg in configs: 19 | # except such as __init__, __pycache__ 20 | if "__" in cfg or "apex" not in cfg: 21 | continue 22 | 23 | cmd = ( 24 | f"python {run_file} --cfg-path {config_root}{cfg} --integration-test " 25 | + f"--off-render --seed 12345 --interim-test-num 1" 26 | ) 27 | 28 | p = subprocess.Popen( 29 | cmd, 30 | stdout=subprocess.PIPE, 31 | stderr=subprocess.STDOUT, 32 | universal_newlines=True, 33 | shell=True, 34 | ) 35 | output, _ = p.communicate() 36 | print(str(output)) 37 | assert p.returncode == 0 38 | 39 | # Find saved checkpoint path 40 | pattern = r"./checkpoint/.+/" 41 | save_path = re.findall(pattern, str(output))[0] 42 | print(save_path) 43 | 44 | check_save_path(save_path) 45 | 46 | 47 | def check_save_path(save_path: str): 48 | """Check checkpoint that tested run file makes and remove the checkpoint.""" 49 | assert os.path.exists(save_path) 50 | 51 | # Remove checkpoint dir 52 | shutil.rmtree(save_path) 53 | 54 | 55 | 
def test_run_pong_no_frame_skip(): 56 | """Test all agents that train PongNoFrameskip-v4 env.""" 57 | check_run_apex("configs/pong_no_frameskip_v4/", "run_pong_no_frameskip_v4.py") 58 | 59 | 60 | if __name__ == "__main__": 61 | test_run_pong_no_frame_skip() 62 | -------------------------------------------------------------------------------- /configs/pong_no_frameskip_v4/apex_dqn.yaml: -------------------------------------------------------------------------------- 1 | type: "ApeX" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 100000 # openai baselines: 10000 6 | batch_size: 512 # openai baselines: 32 7 | update_starts_from: 30000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 1 # in openai baselines, train_freq = 4 10 | gradient_clip: 10.0 # dueling: 10.0 11 | n_step: 5 12 | w_n_step: 1.0 13 | w_q_reg: 0.0 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.000001 17 | max_epsilon: 1.0 18 | min_epsilon: 0.1 # openai baselines: 0.01 19 | epsilon_decay: 0.0000005 # openai baselines: 1e-7 / 1e-1 20 | # Grad_cam 21 | grad_cam_layer_list: 22 | - "backbone.cnn.cnn_0.cnn" 23 | - "backbone.cnn.cnn_1.cnn" 24 | - "backbone.cnn.cnn_2.cnn" 25 | # ApeX 26 | num_workers: 2 27 | local_buffer_max_size: 1000 28 | worker_update_interval: 50 29 | logger_interval: 1000 30 | max_update_step: 100000 31 | is_worker_log: True 32 | is_worker_render: False 33 | 34 | learner_cfg: 35 | type: "DQNLearner" 36 | loss_type: 37 | type: "DQNLoss" 38 | backbone: 39 | type: "CNN" 40 | configs: 41 | input_sizes: [4, 32, 64] 42 | output_sizes: [32, 64, 64] 43 | kernel_sizes: [8, 4, 3] 44 | strides: [4, 2, 1] 45 | paddings: [1, 0, 0] 46 | head: 47 | type: "DuelingMLP" 48 | configs: 49 | hidden_sizes: [512] 50 | output_activation: "identity" 51 | # NoisyNet 52 | use_noisy_net: False 53 | optim_cfg: 54 | lr_dqn: 0.0003 55 | weight_decay: 0.0 56 | adam_eps: 0.00000001 57 | 58 | worker_cfg: 59 | type: "DQNWorker" 60 | device: "cpu" 61 | 62 | logger_cfg: 63 | type: "DQNLogger" 64 | 65 | comm_cfg: 66 | learner_buffer_port: 6554 67 | learner_worker_port: 6555 68 | worker_buffer_port: 6556 69 | learner_logger_port: 6557 70 | send_batch_port: 6558 71 | priorities_port: 6559 72 | -------------------------------------------------------------------------------- /configs/pong_no_frameskip_v4/distillation_dqn.yaml: -------------------------------------------------------------------------------- 1 | type: "DistillationDQNAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 10000 # openai baselines: 10000 6 | batch_size: 32 # openai baselines: 32 7 | update_starts_from: 10000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 4 # in openai baselines, train_freq = 4 10 | gradient_clip: 10.0 # dueling: 10.0 11 | n_step: 3 12 | w_n_step: 1.0 13 | w_q_reg: 0.0 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.000001 17 | max_epsilon: 0.0 18 | min_epsilon: 0.0 # openai baselines: 0.01 19 | epsilon_decay: 0.000001 # openai baselines: 1e-7 / 1e-1 20 | # Grad_cam 21 | grad_cam_layer_list: 22 | - "backbone.cnn.cnn_0.cnn" 23 | - "backbone.cnn.cnn_1.cnn" 24 | - "backbone.cnn.cnn_2.cnn" 25 | # Distillation 26 | dataset_path: 27 | - "data/distillation_buffer/PongNoFrameskip-v4/20200821134905" 28 | - "data/distillation_buffer/PongNoFrameskip-v4/20200821142921" 29 | - "data/distillation_buffer/PongNoFrameskip-v4/20200821145228" 30 | save_dir: "data/" 31 | epochs: 20 32 | 
n_frame_from_last: 50000 33 | is_student: False 34 | 35 | learner_cfg: 36 | type: "DQNLearner" 37 | loss_type: 38 | type: "IQNLoss" 39 | backbone: 40 | type: "CNN" 41 | configs: 42 | input_sizes: [4, 32, 64] 43 | output_sizes: [32, 64, 64] 44 | kernel_sizes: [8, 4, 3] 45 | strides: [4, 2, 1] 46 | paddings: [1, 0, 0] 47 | head: 48 | type: "IQNMLP" 49 | configs: 50 | hidden_sizes: [512] 51 | n_tau_samples: 64 52 | n_tau_prime_samples: 64 53 | n_quantile_samples: 32 54 | quantile_embedding_dim: 64 55 | kappa: 1.0 56 | output_activation: "identity" 57 | # NoisyNet 58 | use_noisy_net: True 59 | std_init: 0.5 60 | optim_cfg: 61 | lr_dqn: 0.0001 62 | weight_decay: 0.0 63 | adam_eps: 0.00000001 64 | -------------------------------------------------------------------------------- /rl_algorithms/registry.py: -------------------------------------------------------------------------------- 1 | from rl_algorithms.utils import Registry, build_from_cfg, build_ray_obj_from_cfg 2 | from rl_algorithms.utils.config import ConfigDict 3 | 4 | AGENTS = Registry("agents") 5 | LEARNERS = Registry("learners") 6 | BACKBONES = Registry("backbones") 7 | HEADS = Registry("heads") 8 | LOSSES = Registry("losses") 9 | HERS = Registry("hers") 10 | WORKERS = Registry("workers") 11 | LOGGERS = Registry("loggers") 12 | 13 | 14 | def build_agent(cfg: ConfigDict, build_args: dict = None): 15 | """Build agent using config and additional arguments.""" 16 | return build_from_cfg(cfg, AGENTS, build_args) 17 | 18 | 19 | def build_learner(cfg: ConfigDict, build_args: dict = None): 20 | """Build learner using config and additional arguments.""" 21 | return build_from_cfg(cfg, LEARNERS, build_args) 22 | 23 | 24 | def build_backbone(cfg: ConfigDict, build_args: dict = None): 25 | """Build backbone using config and additional arguments.""" 26 | return build_from_cfg(cfg, BACKBONES, build_args) 27 | 28 | 29 | def build_head(cfg: ConfigDict, build_args: dict = None): 30 | """Build head using config and additional arguments.""" 31 | return build_from_cfg(cfg, HEADS, build_args) 32 | 33 | 34 | def build_loss(cfg: ConfigDict, build_args: dict = None): 35 | """Build loss using config and additional arguments.""" 36 | return build_from_cfg(cfg, LOSSES, build_args) 37 | 38 | 39 | def build_her(cfg: ConfigDict, build_args: dict = None): 40 | """Build her using config and additional arguments.""" 41 | return build_from_cfg(cfg, HERS, build_args) 42 | 43 | 44 | def build_worker(cfg: ConfigDict, build_args: dict = None): 45 | """Build ray worker using config and additional arguments.""" 46 | # return build_ray_obj_from_cfg(cfg, WORKERS, build_args) 47 | return build_from_cfg(cfg, WORKERS, build_args) 48 | 49 | 50 | def build_logger(cfg: ConfigDict, build_args: dict = None): 51 | """Build ray worker using config and additional arguments.""" 52 | return build_ray_obj_from_cfg(cfg, LOGGERS, build_args) 53 | -------------------------------------------------------------------------------- /rl_algorithms/dqn/distributed_logger.py: -------------------------------------------------------------------------------- 1 | """DQN Logger for distributed training. 
2 | 3 | - Author: Chris Yoon 4 | - Contact: chris.yoon@medipixel.io 5 | """ 6 | 7 | import numpy as np 8 | import torch 9 | import wandb 10 | 11 | from rl_algorithms.common.abstract.distributed_logger import DistributedLogger 12 | from rl_algorithms.registry import LOGGERS 13 | 14 | 15 | @LOGGERS.register_module 16 | class DQNLogger(DistributedLogger): 17 | """DQN Logger for distributed training.""" 18 | 19 | def load_params(self, path: str): 20 | """Load model and optimizer parameters.""" 21 | # Logger only runs on cpu 22 | DistributedLogger.load_params(self, path) 23 | 24 | params = torch.load(path, map_location="cpu") 25 | self.brain.load_state_dict(params["dqn_state_dict"]) 26 | print("[INFO] loaded the model and optimizer from", path) 27 | 28 | def select_action(self, state: np.ndarray): 29 | """Select action to be executed at given state.""" 30 | with torch.no_grad(): 31 | state = self._preprocess_state(state, self.device) 32 | selected_action = self.brain(state).argmax() 33 | selected_action = selected_action.cpu().numpy() 34 | 35 | return selected_action 36 | 37 | def write_log(self, log_value: dict): 38 | """Write log about loss and score.""" 39 | print( 40 | "[INFO] update_step %d, average score: %f, " 41 | "loss: %f, avg q-value: %f" 42 | % ( 43 | log_value["update_step"], 44 | log_value["avg_score"], 45 | log_value["step_info"][0], 46 | log_value["step_info"][1], 47 | ) 48 | ) 49 | 50 | if self.is_log: 51 | wandb.log( 52 | { 53 | "test score": log_value["avg_score"], 54 | "dqn loss": log_value["step_info"][0], 55 | "avg q values": log_value["step_info"][1], 56 | }, 57 | step=log_value["update_step"], 58 | ) 59 | -------------------------------------------------------------------------------- /rl_algorithms/common/noise.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Noise classes for algorithms.""" 3 | 4 | import copy 5 | import random 6 | 7 | import numpy as np 8 | 9 | 10 | class GaussianNoise: 11 | """Gaussian Noise. 12 | 13 | Taken from https://github.com/vitchyr/rlkit 14 | """ 15 | 16 | def __init__( 17 | self, 18 | action_dim: int, 19 | min_sigma: float = 1.0, 20 | max_sigma: float = 1.0, 21 | decay_period: int = 1000000, 22 | ): 23 | """Initialize.""" 24 | self.action_dim = action_dim 25 | self.max_sigma = max_sigma 26 | self.min_sigma = min_sigma 27 | self.decay_period = decay_period 28 | 29 | def sample(self, t: int = 0) -> float: 30 | """Get an action with gaussian noise.""" 31 | sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min( 32 | 1.0, t / self.decay_period 33 | ) 34 | return np.random.normal(0, sigma, size=self.action_dim) 35 | 36 | 37 | class OUNoise: 38 | """Ornstein-Uhlenbeck process. 
39 | 40 | Taken from Udacity deep-reinforcement-learning github repository: 41 | https://github.com/udacity/deep-reinforcement-learning/blob/master/ 42 | ddpg-pendulum/ddpg_agent.py 43 | """ 44 | 45 | def __init__( 46 | self, size: int, mu: float = 0.0, theta: float = 0.15, sigma: float = 0.2 47 | ): 48 | """Initialize parameters and noise process.""" 49 | self.state = np.float64(0.0) 50 | self.mu = mu * np.ones(size) 51 | self.theta = theta 52 | self.sigma = sigma 53 | self.reset() 54 | 55 | def reset(self): 56 | """Reset the internal state (= noise) to mean (mu).""" 57 | self.state = copy.copy(self.mu) 58 | 59 | def sample(self) -> float: 60 | """Update internal state and return it as a noise sample.""" 61 | x = self.state 62 | dx = self.theta * (self.mu - x) + self.sigma * np.array( 63 | [random.random() for _ in range(len(x))] 64 | ) 65 | self.state = x + dx 66 | return self.state 67 | -------------------------------------------------------------------------------- /rl_algorithms/distillation/README.md: -------------------------------------------------------------------------------- 1 | # Using policy distillation 2 | 3 | 4 | We implemented three features for training policy distillation. 5 | 6 | ## 1. Student training using trained agent's data (expert data) 7 | 8 | You can generate a trained agent's data (expert data) by iterating test episodes. 9 | 10 | ``` 11 | python run_env_name.py --cfg-path --load-from --test 12 | ``` 13 | The collected states will be stored in the directory `data/distillation_buffer/`. 14 | 15 | 16 | Once the expert data is generated, put its path in the `dataset_path` list of the distillation config file and change `is_student` to `True` in the config file. Then execute the training as shown below: 17 | 18 | ``` 19 | python run_env_name.py --cfg-path 20 | ``` 21 | 22 | You can set the epoch count and batch size of student learning through the `epochs` and `batch_size` variables in the distillation config file. 23 | 24 | ## 2. Student training using training-phase states and trained agent 25 | 26 | This method trains the student using states generated while training the agent (which we call the train-phase data). 27 | 28 | Training with the distillation config file will automatically generate the train-phase data. 29 | ``` 30 | python run_env_name.py --cfg-path 31 | ``` 32 | 33 | The generated data will be stored in the directory `data/distillation_buffer/`. 34 | 35 | 36 | Since train-phase data doesn't contain Q-values, you should load a trained agent to generate Q-values for it. After putting the path of the train-phase data in the `dataset_path` list and changing `is_student` to `True` in the distillation config file, you can execute the training as shown below: 37 | ``` 38 | python run_env_name.py --cfg-path --load-from 39 | ``` 40 | 41 | ## 3. Test student agent 42 | If you only want to check the performance of the student agent, use the original agent config file instead of the distillation config file. In the Pong environment, for instance, you can use the `dqn.yaml` config file instead of `distillation_dqn.yaml`. Using the distillation config will also work, but it will generate expert data while you run the test.
43 | ``` 44 | python run_env_name.py --test --load-from --cfg-path 45 | ``` 46 | -------------------------------------------------------------------------------- /rl_algorithms/common/networks/backbones/cnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """CNN modules for RL algorithms. 3 | 4 | - Authors: Kyunghwan Kim, Curt Park 5 | - Contacts: kh.kim@medipixel.io 6 | curt.park@medipixel.io 7 | """ 8 | 9 | from typing import Callable 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | from rl_algorithms.common.helper_functions import identity 16 | from rl_algorithms.registry import BACKBONES 17 | from rl_algorithms.utils.config import ConfigDict 18 | 19 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 20 | 21 | 22 | # TODO: Remove it when upgrade torch>=1.7 23 | # pylint: disable=abstract-method 24 | class CNNLayer(nn.Module): 25 | def __init__( 26 | self, 27 | input_size: int, 28 | output_size: int, 29 | kernel_size: int, 30 | stride: int = 1, 31 | padding: int = 0, 32 | pre_activation_fn: Callable = identity, 33 | activation_fn: Callable = F.relu, 34 | post_activation_fn: Callable = identity, 35 | ): 36 | super(CNNLayer, self).__init__() 37 | 38 | self.cnn = nn.Conv2d( 39 | input_size, 40 | output_size, 41 | kernel_size=kernel_size, 42 | stride=stride, 43 | padding=padding, 44 | ) 45 | 46 | self.pre_activation_fn = pre_activation_fn 47 | self.activation_fn = activation_fn 48 | self.post_activation_fn = post_activation_fn 49 | 50 | def forward(self, x): 51 | x = self.cnn(x) 52 | x = self.pre_activation_fn(x) 53 | x = self.activation_fn(x) 54 | x = self.post_activation_fn(x) 55 | 56 | return x 57 | 58 | 59 | # TODO: Remove it when upgrade torch>=1.7 60 | # pylint: disable=abstract-method 61 | @BACKBONES.register_module 62 | class CNN(nn.Module): 63 | """Baseline of Convolution neural network.""" 64 | 65 | def __init__(self, configs: ConfigDict): 66 | super(CNN, self).__init__() 67 | 68 | cnn_layers = list(map(CNNLayer, *configs.values())) 69 | self.cnn = nn.Sequential() 70 | for i, cnn_layer in enumerate(cnn_layers): 71 | self.cnn.add_module("cnn_{}".format(i), cnn_layer) 72 | 73 | def forward(self, x: torch.Tensor) -> torch.Tensor: 74 | """Forward method implementation.""" 75 | if len(x.size()) == 3: 76 | x = x.unsqueeze(0) 77 | x = self.cnn(x) 78 | x = x.view(x.size(0), -1) 79 | return x 80 | -------------------------------------------------------------------------------- /tests/buffer/test_distillation_buffer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import random 4 | import shutil 5 | 6 | import numpy as np 7 | import pytest 8 | 9 | from rl_algorithms.common.buffer.distillation_buffer import DistillationBuffer 10 | 11 | FOLDER_PATH_LIST = [ 12 | "data/distillation_buffer/test/expert_data/", 13 | "data/distillation_buffer/test/expert_data2/", 14 | "data/distillation_buffer/test/trainphase_data/", 15 | ] 16 | 17 | 18 | def gen_test_data(num_files: int): 19 | """Generate dummy data.""" 20 | for _dir in FOLDER_PATH_LIST: 21 | os.makedirs(_dir, exist_ok=True) 22 | 23 | for i, _dir in enumerate(FOLDER_PATH_LIST): 24 | for j in range(num_files): 25 | state = np.random.randint(0, 255, size=(3, 3, 2), dtype=np.uint8) 26 | action = np.zeros(3) 27 | action[random.randint(0, len(action) - 1)] = 1 28 | action = action.astype(np.int) 29 | if "trainphase" in _dir: 30 | with 
open(f"{FOLDER_PATH_LIST[i]}{j:07}.pkl", "wb") as f: 31 | pickle.dump([state], f) 32 | else: 33 | with open(f"{FOLDER_PATH_LIST[i]}{j:07}.pkl", "wb") as f: 34 | pickle.dump([state, action], f) 35 | 36 | 37 | def check_multiple_data_load(num_files: int): 38 | """Check if DistillationBuffer can load data from multiple path.""" 39 | batch_size = num_files * len(FOLDER_PATH_LIST[:-1]) 40 | memory = DistillationBuffer(batch_size, FOLDER_PATH_LIST[:-1]) 41 | memory.reset_dataloader() 42 | state, _ = memory.sample_for_diltillation() 43 | assert state.shape[0] == batch_size 44 | 45 | 46 | def check_mixture_data_assert(num_files: int): 47 | """Check if DistillationBuffer can check whether trainphase & expert data is mixed.""" 48 | memory = DistillationBuffer(num_files, FOLDER_PATH_LIST) 49 | with pytest.raises(AssertionError, match=r"mixture"): 50 | memory.reset_dataloader() 51 | 52 | 53 | def delete_path(path: str): 54 | """Delete directory.""" 55 | shutil.rmtree(path) 56 | 57 | 58 | def test_distillation_buffer(): 59 | """Test DistillationBuffer.""" 60 | try: 61 | num_file = 2 62 | gen_test_data(num_file) 63 | check_multiple_data_load(num_file) 64 | check_mixture_data_assert(num_file) 65 | 66 | except AssertionError as e: 67 | raise e 68 | 69 | finally: 70 | delete_path("data/distillation_buffer/test") 71 | 72 | 73 | if __name__ == "__main__": 74 | test_distillation_buffer() 75 | -------------------------------------------------------------------------------- /tests/integration/test_run_agent.py: -------------------------------------------------------------------------------- 1 | """Test only one step of run file for training.""" 2 | 3 | import os 4 | import os.path as osp 5 | import re 6 | import shutil 7 | import subprocess 8 | 9 | 10 | def check_run_env(config_root: str, run_file: str): 11 | """Test that 1 episode of run file works well.""" 12 | test_dir = osp.dirname(osp.abspath(__file__)) 13 | pkg_root_dir = osp.dirname(osp.dirname(test_dir)) 14 | os.chdir(pkg_root_dir) 15 | 16 | # loop of configs 17 | configs = os.listdir(config_root) 18 | for cfg in configs: 19 | # except such as __init__, __pycache__ 20 | if "__" in cfg or "apex" in cfg or "distillation" in cfg: 21 | continue 22 | 23 | cmd = ( 24 | f"python {run_file} --cfg-path {config_root}{cfg} --integration-test " 25 | + f"--off-render --episode-num 1 --max-episode-step 1 --seed 12345 " 26 | + f"--interim-test-num 1" 27 | ) 28 | 29 | p = subprocess.Popen( 30 | cmd, 31 | stdout=subprocess.PIPE, 32 | stderr=subprocess.STDOUT, 33 | universal_newlines=True, 34 | shell=True, 35 | ) 36 | output, _ = p.communicate() 37 | print(str(output)) 38 | assert p.returncode == 0, "Subprocess doesn't finished successfully." 
39 | 40 | # Find saved checkpoint path 41 | pattern = r"./checkpoint/.+/" 42 | save_path = re.findall(pattern, str(output))[0] 43 | print(save_path) 44 | 45 | check_save_path(save_path) 46 | 47 | 48 | def check_save_path(save_path: str): 49 | """Check checkpoint that tested run file makes and remove the checkpoint.""" 50 | assert os.path.exists(save_path) 51 | 52 | # Remove checkpoint dir 53 | shutil.rmtree(save_path) 54 | 55 | 56 | def test_run_lunarlander_continuous(): 57 | """Test all agents that train LunarLanderContinuous-v2 env.""" 58 | check_run_env( 59 | "configs/lunarlander_continuous_v2/", "run_lunarlander_continuous_v2.py" 60 | ) 61 | 62 | 63 | def test_run_lunarlander(): 64 | """Test all agents that train LunarLander-v2 env.""" 65 | check_run_env("configs/lunarlander_v2/", "run_lunarlander_v2.py") 66 | 67 | 68 | def test_run_pong_no_frame_skip(): 69 | """Test all agents that train PongNoFrameskip-v4 env.""" 70 | check_run_env("configs/pong_no_frameskip_v4/", "run_pong_no_frameskip_v4.py") 71 | 72 | 73 | if __name__ == "__main__": 74 | test_run_lunarlander_continuous() 75 | test_run_lunarlander() 76 | test_run_pong_no_frame_skip() 77 | -------------------------------------------------------------------------------- /rl_algorithms/utils/config.py: -------------------------------------------------------------------------------- 1 | import collections.abc as collections_abc 2 | import os.path as osp 3 | 4 | from addict import Dict 5 | import yaml 6 | 7 | 8 | class ConfigDict(Dict): 9 | def __missing__(self, name): 10 | raise KeyError(name) 11 | 12 | def __getattr__(self, name): 13 | try: 14 | value = super(ConfigDict, self).__getattr__(name) 15 | except KeyError: 16 | ex = AttributeError( 17 | "'{}' object has no attribute '{}'".format( 18 | self.__class__.__name__, name 19 | ) 20 | ) 21 | except Exception as e: 22 | ex = e 23 | else: 24 | return value 25 | raise ex 26 | 27 | def __setitem__(self, name, value): 28 | if isinstance(value, dict): 29 | value = ConfigDict(value) 30 | 31 | super(ConfigDict, self).__setitem__(name, value) 32 | 33 | 34 | def add_args(parser, cfg, prefix=""): 35 | for k, v in cfg.items(): 36 | if isinstance(v, str): 37 | parser.add_argument("--" + prefix + k) 38 | elif isinstance(v, int): 39 | parser.add_argument("--" + prefix + k, type=int) 40 | elif isinstance(v, float): 41 | parser.add_argument("--" + prefix + k, type=float) 42 | elif isinstance(v, bool): 43 | parser.add_argument("--" + prefix + k, action="store_true") 44 | elif isinstance(v, dict): 45 | add_args(parser, v, k + ".") 46 | elif isinstance(v, collections_abc.Iterable): 47 | parser.add_argument("--" + prefix + k, type=type(v[0]), nargs="+") 48 | else: 49 | print("connot parse key {} of type {}".format(prefix + k, type(v))) 50 | return parser 51 | 52 | 53 | class YamlConfig: 54 | """Manager of ConfigDict from yaml.""" 55 | 56 | def __init__(self, config_paths: dict): 57 | """Make ConfigDict from yaml path.""" 58 | self.cfg = ConfigDict() 59 | for key, path in config_paths.items(): 60 | self.cfg[key] = self._yaml_to_config_dict(path) 61 | 62 | @staticmethod 63 | def _yaml_to_config_dict(path: str) -> ConfigDict: 64 | """Return ConfigDict from yaml.""" 65 | try: 66 | with open(path) as f: 67 | data = yaml.load(f, Loader=yaml.FullLoader) 68 | except FileNotFoundError: 69 | with open(osp.expanduser(path)) as f: 70 | data = yaml.load(f, Loader=yaml.FullLoader) 71 | return ConfigDict(data) 72 | 73 | def get_config_dict(self): 74 | return self.cfg 75 | 
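`ConfigDict` above is an `addict.Dict` subclass that converts nested dicts on assignment and raises on missing keys instead of silently creating empty entries, which is what makes the dotted access used throughout the configs and tests safe. A small self-contained sketch of that behavior (the values are made up for illustration only):

```python
from rl_algorithms.utils.config import ConfigDict

cfg = ConfigDict(dict(hyper_params=dict(gamma=0.99, batch_size=32)))

assert cfg.hyper_params.gamma == 0.99  # nested dicts become ConfigDicts
cfg.hyper_params.batch_size = 64       # attribute-style assignment works

try:
    _ = cfg.hyper_params.lr            # a missing key raises AttributeError
except AttributeError:                 # instead of returning an empty Dict
    pass
```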
-------------------------------------------------------------------------------- /rl_algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | from .a2c.agent import A2CAgent 2 | from .a2c.learner import A2CLearner 3 | from .acer.agent import ACERAgent 4 | from .acer.learner import ACERLearner 5 | from .bc.ddpg_agent import BCDDPGAgent 6 | from .bc.ddpg_learner import BCDDPGLearner 7 | from .bc.her import LunarLanderContinuousHER, ReacherHER 8 | from .bc.sac_agent import BCSACAgent 9 | from .bc.sac_learner import BCSACLearner 10 | from .common.apex.architecture import ApeX 11 | from .common.networks.backbones import CNN, ResNet 12 | from .ddpg.agent import DDPGAgent 13 | from .ddpg.learner import DDPGLearner 14 | from .distillation.dqn_agent import DistillationDQNAgent 15 | from .dqn.agent import DQNAgent 16 | from .dqn.distributed_logger import DQNLogger 17 | from .dqn.distributed_worker import DQNWorker 18 | from .dqn.learner import DQNLearner 19 | from .dqn.losses import C51Loss, DQNLoss, IQNLoss 20 | from .fd.ddpg_agent import DDPGfDAgent 21 | from .fd.ddpg_learner import DDPGfDLearner 22 | from .fd.dqn_agent import DQfDAgent 23 | from .fd.dqn_learner import DQfDLearner 24 | from .fd.sac_agent import SACfDAgent 25 | from .fd.sac_learner import SACfDLearner 26 | from .gail.agent import GAILPPOAgent 27 | from .gail.learner import GAILPPOLearner 28 | from .ppo.agent import PPOAgent 29 | from .ppo.learner import PPOLearner 30 | from .recurrent.dqn_agent import R2D1Agent 31 | from .recurrent.learner import R2D1Learner 32 | from .recurrent.losses import R2D1C51Loss, R2D1DQNLoss, R2D1IQNLoss 33 | from .registry import build_agent, build_her 34 | from .sac.agent import SACAgent 35 | from .sac.learner import SACLearner 36 | from .td3.agent import TD3Agent 37 | from .td3.learner import TD3Learner 38 | 39 | __all__ = [ 40 | "A2CAgent", 41 | "BCDDPGAgent", 42 | "BCSACAgent", 43 | "DDPGAgent", 44 | "DQNAgent", 45 | "DDPGfDAgent", 46 | "DQfDAgent", 47 | "R2D1Agent", 48 | "SACfDAgent", 49 | "PPOAgent", 50 | "SACAgent", 51 | "TD3Agent", 52 | "GAILPPOAgent", 53 | "A2CLearner", 54 | "BCDDPGLearner", 55 | "BCSACLearner", 56 | "DDPGLearner", 57 | "DQNLearner", 58 | "DDPGfDLearner", 59 | "DQfDLearner", 60 | "SACfDLearner", 61 | "PPOLearner", 62 | "SACLearner", 63 | "TD3Learner", 64 | "GAILPPOLearner", 65 | "R2D1Learner", 66 | "LunarLanderContinuousHER", 67 | "ReacherHER", 68 | "build_agent", 69 | "build_her", 70 | "CNN", 71 | "ResNet", 72 | "IQNLoss", 73 | "C51Loss", 74 | "DQNLoss", 75 | "R2D1IQNLoss", 76 | "R2D1C51Loss", 77 | "R2D1DQNLoss", 78 | "ApeX", 79 | "DQNWorker", 80 | "DQNLogger", 81 | "ACERLearner", 82 | "ACERAgent", 83 | "DistillationDQNAgent", 84 | ] 85 | -------------------------------------------------------------------------------- /rl_algorithms/gail/networks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Tuple, Union 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | from rl_algorithms.common.helper_functions import identity 9 | from rl_algorithms.registry import build_backbone, build_head 10 | from rl_algorithms.utils.config import ConfigDict 11 | 12 | 13 | # TODO: Remove it when upgrade torch>=1.7 14 | # pylint: disable=abstract-method 15 | class Discriminator(nn.Module): 16 | """Discriminator to classify experience data and expert data""" 17 | 18 | def __init__( 19 | self, 20 | backbone_cfg: ConfigDict, 21 | head_cfg: ConfigDict, 22 | 
action_embedder_cfg: ConfigDict, 23 | shared_backbone: nn.Module = None, 24 | ): 25 | nn.Module.__init__(self) 26 | if shared_backbone is not None: 27 | self.backbone = shared_backbone 28 | head_cfg.configs.input_size = self.calculate_fc_input_size( 29 | head_cfg.configs.state_size 30 | ) 31 | elif not backbone_cfg: 32 | self.backbone = identity 33 | head_cfg.configs.input_size = head_cfg.configs.state_size[0] 34 | else: 35 | self.backbone = build_backbone(backbone_cfg) 36 | head_cfg.configs.input_size = self.calculate_fc_input_size( 37 | head_cfg.configs.state_size 38 | ) 39 | 40 | self.action_embedder = None 41 | if action_embedder_cfg: 42 | action_embedder_cfg.configs.input_size = head_cfg.configs.action_size 43 | self.action_embedder = build_head(action_embedder_cfg) 44 | head_cfg.configs.input_size += action_embedder_cfg.configs.output_size 45 | else: 46 | head_cfg.configs.input_size += head_cfg.configs.action_size 47 | 48 | self.head = build_head(head_cfg) 49 | 50 | def forward( 51 | self, state_action: Tuple[torch.Tensor, torch.Tensor] 52 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]: 53 | """Forward method implementation. Use in get_action method in agent.""" 54 | state_feature = self.backbone(state_action[0]) 55 | action_feature = state_action[1] 56 | if self.action_embedder: 57 | action_feature = self.forward_action_embedder(action_feature) 58 | return self.head(torch.cat([state_feature, action_feature], dim=-1)) 59 | 60 | def forward_action_embedder( 61 | self, x: torch.Tensor 62 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]: 63 | """Forward method of action embedder.""" 64 | return self.action_embedder(x) 65 | 66 | def calculate_fc_input_size(self, state_dim: tuple): 67 | """Calculate fc input size according to the shape of cnn.""" 68 | x = torch.zeros(state_dim).unsqueeze(0) 69 | output = self.backbone(x).detach().view(-1) 70 | return output.shape[0] 71 | -------------------------------------------------------------------------------- /tests/buffer/test_prioritized_buffer.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | from scipy.stats import ks_2samp 5 | 6 | from rl_algorithms.common.buffer.replay_buffer import ReplayBuffer 7 | from rl_algorithms.common.buffer.wrapper import PrioritizedBufferWrapper 8 | 9 | 10 | def generate_prioritized_buffer( 11 | buffer_length: int, batch_size: int, idx_lst=None, prior_lst=None 12 | ) -> Tuple[PrioritizedBufferWrapper, List]: 13 | """Generate Prioritized Replay Buffer with random Prior.""" 14 | buffer = ReplayBuffer(max_len=buffer_length, batch_size=batch_size) 15 | prioritized_buffer = PrioritizedBufferWrapper(buffer) 16 | priority = np.random.randint(10, size=buffer_length) 17 | for i, j in enumerate(priority): 18 | prioritized_buffer.sum_tree[i] = j 19 | if idx_lst: 20 | for i, j in list(zip(idx_lst, prior_lst)): 21 | priority[i] = j 22 | prioritized_buffer.sum_tree[i] = j 23 | 24 | prop_lst = [i / sum(priority) for i in priority] 25 | prioritized_buffer.buffer.length = buffer_length 26 | 27 | return prioritized_buffer, prop_lst 28 | 29 | 30 | def sample_dummy(prioritized_buffer: PrioritizedBufferWrapper, times: int) -> List: 31 | """Sample from prioritized buffer and Return indices.""" 32 | assert isinstance(prioritized_buffer, PrioritizedBufferWrapper) 33 | 34 | sampled_lst = [0] * prioritized_buffer.buffer.max_len 35 | for _ in range(times): 36 | indices = prioritized_buffer._sample_proportional( 37 | 
prioritized_buffer.buffer.batch_size 38 | ) 39 | for idx in indices: 40 | sampled_lst[idx] += 1 / (times * prioritized_buffer.buffer.batch_size) 41 | return sampled_lst 42 | 43 | 44 | def check_prioritized(prop_lst: List, sampled_lst: List) -> bool: 45 | """Check two input lists have same distribution by kstest. 46 | 47 | Reference: 48 | https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test 49 | """ 50 | res = ks_2samp(prop_lst, sampled_lst) 51 | return res[1] >= 0.05 52 | 53 | 54 | def test_prioritized(buffer_length=32, batch_size=4): 55 | """Test whether transitions are prioritized sampled from replay buffer.""" 56 | 57 | n_repeat = 1000 58 | idx_lst = [0, 1, 2, 3] 59 | prior_lst = [100, 10, 1, 1] 60 | 61 | # generate prioitized buffer, return buffer and its proportion 62 | buffer, prop = generate_prioritized_buffer( 63 | buffer_length, batch_size, idx_lst, prior_lst 64 | ) 65 | assert isinstance(buffer, PrioritizedBufferWrapper) 66 | sampled_lst = [0] * buffer.buffer.max_len 67 | # sample index from buffer 68 | for _ in range(n_repeat): 69 | indices = buffer._sample_proportional(buffer.buffer.batch_size) 70 | for idx in indices: 71 | sampled_lst[idx] += 1 / (n_repeat * buffer.buffer.batch_size) 72 | 73 | assert check_prioritized(prop, sampled_lst), "Two distributions are different." 74 | 75 | 76 | if __name__ == "__main__": 77 | test_prioritized() 78 | -------------------------------------------------------------------------------- /tests/integration/test_run_distillation_agent.py: -------------------------------------------------------------------------------- 1 | """Test only one step of distillation file for training.""" 2 | 3 | import os 4 | import pickle 5 | import re 6 | import shutil 7 | import subprocess 8 | 9 | 10 | def check_distillation_agent(config: str, run_file: str): 11 | """Test that 1 episode of run file works well.""" 12 | cmd = ( 13 | f"python {run_file} --cfg-path {config} --integration-test " 14 | + f"--episode-num 1 --interim-test 1 --off-render" 15 | ) 16 | 17 | p = subprocess.Popen( 18 | cmd, 19 | stdout=subprocess.PIPE, 20 | stderr=subprocess.STDOUT, 21 | universal_newlines=True, 22 | shell=True, 23 | ) 24 | output, _ = p.communicate() 25 | print(str(output)) 26 | assert p.returncode == 0 27 | 28 | # Find saved checkpoint path and data path. 29 | pattern = r"./checkpoint/.+/" 30 | data_pattern = r"data/.+/" 31 | checkpoint_path = re.findall(pattern, str(output))[0] 32 | full_data_path, n_frame_from_last_path = re.findall(data_pattern, str(output)) 33 | 34 | try: 35 | num_episode_step = re.findall(r"episode step: \d+", str(output))[0] 36 | num_episode_step = int(re.findall(r"\d+", num_episode_step)[0]) 37 | 38 | # Check if the number of data is same with iterated episode step. 39 | saved_data_list = os.listdir(full_data_path) 40 | assert ( 41 | len(saved_data_list) == num_episode_step 42 | ), "The number of data does not match the number of iterated episode steps." 43 | 44 | # Check if n_frame_from_last works well. 45 | n_frame_from_last_data_list = os.listdir(n_frame_from_last_path) 46 | assert 3 == len( 47 | n_frame_from_last_data_list 48 | ), f"n_frame_from_last doesn't work properly(expected num of data: 3, num of data: {len(n_frame_from_last_data_list)})." 49 | 50 | # Check if train-phase data only contaions state, not state & q value. 51 | with open(full_data_path + saved_data_list[0], "rb") as f: 52 | datum = pickle.load(f) 53 | assert ( 54 | len(datum) == 1 55 | ), "The length of the data is not appropriate(length must be 1, state only)." 
56 | 57 | except Exception as e: 58 | raise e 59 | 60 | finally: 61 | """Delete generated directories.""" 62 | delete_path(checkpoint_path) 63 | delete_path(full_data_path) 64 | delete_path(n_frame_from_last_path) 65 | 66 | 67 | def delete_path(path: str): 68 | """Delete directory.""" 69 | shutil.rmtree(path) 70 | 71 | 72 | # TODO: Add student training test code. 73 | def test_distillation(): 74 | """Test distillation agent.""" 75 | check_distillation_agent( 76 | "configs/pong_no_frameskip_v4/distillation_dqn.yaml", 77 | "run_pong_no_frameskip_v4.py", 78 | ) 79 | check_distillation_agent( 80 | "configs/lunarlander_v2/distillation_dqn.yaml", "run_lunarlander_v2.py" 81 | ) 82 | 83 | 84 | if __name__ == "__main__": 85 | test_distillation() 86 | -------------------------------------------------------------------------------- /tests/test_config_registry.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import gym 5 | 6 | from rl_algorithms import build_agent 7 | from rl_algorithms.common.abstract.agent import Agent 8 | from rl_algorithms.utils import YamlConfig 9 | 10 | 11 | def parse_args(args: list): 12 | parser = argparse.ArgumentParser(description="Pytorch RL rl_algorithms") 13 | parser.add_argument( 14 | "--cfg-path", 15 | type=str, 16 | default="./configs/lunarlander_continuous_v2/ddpg.yaml", 17 | help="config path", 18 | ) 19 | parser.add_argument( 20 | "--test", dest="test", action="store_true", help="test mode (no training)" 21 | ) 22 | parser.add_argument( 23 | "--load-from", 24 | type=str, 25 | default=None, 26 | help="load the saved model and optimizer at the beginning", 27 | ) 28 | parser.add_argument( 29 | "--off-render", dest="render", action="store_false", help="turn off rendering" 30 | ) 31 | parser.add_argument( 32 | "--render-after", 33 | type=int, 34 | default=0, 35 | help="start rendering after the input number of episode", 36 | ) 37 | parser.add_argument( 38 | "--log", dest="log", action="store_true", help="turn on logging" 39 | ) 40 | parser.add_argument( 41 | "--save-period", type=int, default=100, help="save model period" 42 | ) 43 | parser.add_argument( 44 | "--episode-num", type=int, default=1500, help="total episode num" 45 | ) 46 | parser.add_argument( 47 | "--max-episode-steps", type=int, default=300, help="max episode step" 48 | ) 49 | parser.add_argument( 50 | "--interim-test-num", 51 | type=int, 52 | default=10, 53 | help="number of test during training", 54 | ) 55 | return parser.parse_args(args) 56 | 57 | 58 | def test_config_registry(): 59 | # configurations 60 | args = parse_args(["--test"]) 61 | 62 | # set env 63 | env = gym.make("LunarLanderContinuous-v2") 64 | 65 | # check start time 66 | NOWTIMES = datetime.datetime.now() 67 | curr_time = NOWTIMES.strftime("%y%m%d_%H%M%S") 68 | 69 | cfg = YamlConfig(dict(agent=args.cfg_path)).get_config_dict() 70 | env_info = dict( 71 | name=env.spec.id, 72 | observation_space=env.observation_space, 73 | action_space=env.action_space, 74 | is_atari=False, 75 | ) 76 | log_cfg = dict(agent=cfg.agent.type, curr_time=curr_time, cfg_path=args.cfg_path) 77 | build_args = dict( 78 | env=env, 79 | env_info=env_info, 80 | log_cfg=log_cfg, 81 | is_test=args.test, 82 | load_from=args.load_from, 83 | is_render=args.render, 84 | render_after=args.render_after, 85 | is_log=args.log, 86 | save_period=args.save_period, 87 | episode_num=args.episode_num, 88 | max_episode_steps=args.max_episode_steps, 89 | interim_test_num=args.interim_test_num, 90 | ) 91 | 
agent = build_agent(cfg.agent, build_args) 92 | assert isinstance(agent, Agent) 93 | 94 | 95 | if __name__ == "__main__": 96 | test_config_registry() 97 | -------------------------------------------------------------------------------- /.all-contributorsrc: -------------------------------------------------------------------------------- 1 | { 2 | "files": [ 3 | "README.md" 4 | ], 5 | "imageSize": 100, 6 | "commit": false, 7 | "contributors": [ 8 | { 9 | "login": "Curt-Park", 10 | "name": "Jinwoo Park (Curt)", 11 | "avatar_url": "https://avatars3.githubusercontent.com/u/14961526?v=4", 12 | "profile": "https://github.com/Curt-Park", 13 | "contributions": [ 14 | "code" 15 | ] 16 | }, 17 | { 18 | "login": "MrSyee", 19 | "name": "Kyunghwan Kim", 20 | "avatar_url": "https://avatars3.githubusercontent.com/u/17582508?v=4", 21 | "profile": "https://github.com/MrSyee", 22 | "contributions": [ 23 | "code" 24 | ] 25 | }, 26 | { 27 | "login": "darthegg", 28 | "name": "darthegg", 29 | "avatar_url": "https://avatars3.githubusercontent.com/u/16010242?v=4", 30 | "profile": "https://github.com/darthegg", 31 | "contributions": [ 32 | "code" 33 | ] 34 | }, 35 | { 36 | "login": "mclearning2", 37 | "name": "Mincheol Kim", 38 | "avatar_url": "https://avatars3.githubusercontent.com/u/43226417?v=4", 39 | "profile": "https://github.com/mclearning2", 40 | "contributions": [ 41 | "code" 42 | ] 43 | }, 44 | { 45 | "login": "minseop4898", 46 | "name": "김민섭", 47 | "avatar_url": "https://avatars1.githubusercontent.com/u/34338299?v=4", 48 | "profile": "https://github.com/minseop4898", 49 | "contributions": [ 50 | "code" 51 | ] 52 | }, 53 | { 54 | "login": "jinPrelude", 55 | "name": "Leejin Jung", 56 | "avatar_url": "https://avatars1.githubusercontent.com/u/16518993?v=4", 57 | "profile": "https://github.com/jinPrelude", 58 | "contributions": [ 59 | "code" 60 | ] 61 | }, 62 | { 63 | "login": "cyoon1729", 64 | "name": "Chris Yoon", 65 | "avatar_url": "https://avatars2.githubusercontent.com/u/33583101?v=4", 66 | "profile": "https://github.com/cyoon1729", 67 | "contributions": [ 68 | "code" 69 | ] 70 | }, 71 | { 72 | "login": "jiseongHAN", 73 | "name": "Jiseong Han", 74 | "avatar_url": "https://avatars2.githubusercontent.com/u/48741026?v=4", 75 | "profile": "https://jiseonghan.github.io/", 76 | "contributions": [ 77 | "code" 78 | ] 79 | }, 80 | { 81 | "login": "sehyun-hwang", 82 | "name": "Sehyun Hwang", 83 | "avatar_url": "https://avatars3.githubusercontent.com/u/23437715?v=4", 84 | "profile": "https://github.com/sehyun-hwang", 85 | "contributions": [ 86 | "maintenance" 87 | ] 88 | }, 89 | { 90 | "login": "isk03276", 91 | "name": "eunjin", 92 | "avatar_url": "https://avatars.githubusercontent.com/u/23740495?v=4", 93 | "profile": "https://github.com/isk03276", 94 | "contributions": [ 95 | "code" 96 | ] 97 | } 98 | ], 99 | "contributorsPerLine": 7, 100 | "projectName": "rl_algorithms", 101 | "projectOwner": "medipixel", 102 | "repoType": "github", 103 | "repoHost": "https://github.com", 104 | "skipCi": true 105 | } 106 | -------------------------------------------------------------------------------- /tests/test_cnn_cfg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from rl_algorithms.common.networks.backbones import CNN, ResNet 5 | from rl_algorithms.common.networks.brain import Brain 6 | from rl_algorithms.utils.config import ConfigDict 7 | 8 | cnn_cfg = ConfigDict( 9 | type="CNN", 10 | configs=dict( 11 | input_sizes=[3, 32, 32], 12 | 
output_sizes=[32, 32, 64], 13 | kernel_sizes=[5, 3, 3], 14 | strides=[4, 3, 2], 15 | paddings=[2, 0, 1], 16 | ), 17 | ) 18 | 19 | resnet_cfg = ConfigDict( 20 | type="ResNet", 21 | configs=dict( 22 | use_bottleneck=False, 23 | num_blocks=[1, 1, 1, 1], 24 | block_output_sizes=[32, 32, 64, 64], 25 | block_strides=[1, 2, 2, 2], 26 | first_input_size=3, 27 | first_output_size=32, 28 | expansion=4, 29 | channel_compression=4, 30 | ), 31 | ) 32 | 33 | head_cfg = ConfigDict( 34 | type="IQNMLP", 35 | configs=dict( 36 | hidden_sizes=[512], 37 | n_tau_samples=64, 38 | n_tau_prime_samples=64, 39 | n_quantile_samples=32, 40 | quantile_embedding_dim=64, 41 | kappa=1.0, 42 | output_activation="identity", 43 | # NoisyNet 44 | use_noisy_net=True, 45 | std_init=0.5, 46 | ), 47 | ) 48 | 49 | test_state_dim = (3, 256, 256) 50 | 51 | 52 | def test_brain(): 53 | """Test wheter brain make fc layer based on backbone's output size.""" 54 | 55 | head_cfg.configs.state_size = test_state_dim 56 | head_cfg.configs.output_size = 8 57 | 58 | model = Brain(resnet_cfg, head_cfg) 59 | assert model.head.input_size == 16384 60 | 61 | 62 | def test_cnn_with_config(): 63 | """Test whether CNN module can make proper model according to the configs given.""" 64 | conv_layer_size = [[1, 32, 64, 64], [1, 32, 21, 21], [1, 64, 11, 11]] 65 | test_cnn_model = CNN(configs=cnn_cfg.configs) 66 | conv_layers = [ 67 | module for module in test_cnn_model.modules() if isinstance(module, nn.Conv2d) 68 | ] 69 | x = torch.zeros(test_state_dim).unsqueeze(0) 70 | for i, layer in enumerate(conv_layers): 71 | layer_output = layer(x) 72 | x = layer_output 73 | assert list(x.shape) == conv_layer_size[i] 74 | 75 | 76 | def test_resnet_with_config(): 77 | """Test whether ResNet module can make proper model according to the configs given.""" 78 | conv_layer_size = [ 79 | [1, 32, 256, 256], 80 | [1, 32, 256, 256], 81 | [1, 128, 256, 256], 82 | [1, 128, 256, 256], 83 | [1, 32, 128, 128], 84 | [1, 128, 128, 128], 85 | [1, 128, 128, 128], 86 | [1, 64, 64, 64], 87 | [1, 256, 64, 64], 88 | [1, 256, 64, 64], 89 | [1, 64, 32, 32], 90 | [1, 256, 32, 32], 91 | [1, 256, 32, 32], 92 | [1, 16, 32, 32], 93 | ] 94 | test_resnet_model = ResNet(configs=resnet_cfg.configs) 95 | conv_layers = [ 96 | module 97 | for module in test_resnet_model.modules() 98 | if isinstance(module, nn.Conv2d) 99 | ] 100 | x = torch.zeros(test_state_dim).unsqueeze(0) 101 | skip_x = x 102 | for i, layer in enumerate(conv_layers): 103 | if i % 3 == 0: 104 | layer_output = layer(skip_x) 105 | skip_x = layer_output 106 | x = layer_output 107 | else: 108 | layer_output = layer(x) 109 | x = layer_output 110 | assert list(x.shape) == conv_layer_size[i] 111 | 112 | 113 | if __name__ == "__main__": 114 | test_brain() 115 | test_cnn_with_config() 116 | -------------------------------------------------------------------------------- /rl_algorithms/common/abstract/her.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Abstract class used for Hindsight Experience Replay. 3 | 4 | - Author: Kyunghwan Kim 5 | - Contact: kh.kim@medipixel.io 6 | - Paper: https://arxiv.org/pdf/1707.01495.pdf 7 | """ 8 | 9 | from abc import ABC, abstractmethod 10 | from typing import Callable, Tuple 11 | 12 | import numpy as np 13 | 14 | 15 | class HER(ABC): 16 | """Abstract class for HER (final strategy). 
17 | 18 | Attributes: 19 | reward_fn (Callable): returns reward from state, action, next_state 20 | 21 | """ 22 | 23 | def __init__(self, reward_fn: Callable[[tuple, np.ndarray], np.float64]): 24 | """Initialize. 25 | 26 | Args: 27 | reward_fn (Callable): returns reward from state, action, next_state 28 | 29 | """ 30 | self.reward_fn = reward_fn 31 | 32 | @abstractmethod 33 | def fetch_desired_states_from_demo(self, demo: list): 34 | pass 35 | 36 | @abstractmethod 37 | def get_desired_state(self, *args) -> np.ndarray: 38 | pass 39 | 40 | @abstractmethod 41 | def generate_demo_transitions(self, demo: list) -> list: 42 | pass 43 | 44 | @abstractmethod 45 | def _get_final_state(self, transition: tuple) -> np.ndarray: 46 | pass 47 | 48 | def _append_origin_transitions( 49 | self, origin_transitions: list, transition: tuple, desired_state: np.ndarray 50 | ): 51 | """Append original transitions adding goal state for training.""" 52 | origin_transitions.append(self._get_transition(transition, desired_state)) 53 | 54 | def _append_new_transitions( 55 | self, new_transitions: list, transition: tuple, final_state: np.ndarray 56 | ): 57 | """Append new transitions made by HER strategy (final) for training.""" 58 | new_transitions.append(self._get_transition(transition, final_state)) 59 | 60 | def _get_transition( 61 | self, transition: tuple, goal_state: np.ndarray 62 | ) -> Tuple[np.ndarray, np.ndarray, np.float64, np.ndarray, bool]: 63 | """Get a single transition concatenated with a goal state.""" 64 | state, action, _, next_state, done = transition 65 | 66 | done = np.array_equal(next_state, goal_state) 67 | reward = self.reward_fn(transition, goal_state) 68 | state = np.concatenate((state, goal_state), axis=-1) 69 | next_state = np.concatenate((next_state, goal_state), axis=-1) 70 | 71 | return state, action, reward, next_state, done 72 | 73 | def generate_transitions( 74 | self, 75 | transitions: list, 76 | desired_state: np.ndarray, 77 | success_score: float, 78 | is_demo: bool = False, 79 | ) -> list: 80 | """Generate new transitions concatenated with desired states.""" 81 | origin_transitions: list = list() 82 | new_transitions: list = list() 83 | final_state = self._get_final_state(transitions[-1]) 84 | score = np.sum(np.array(transitions), axis=0)[2] 85 | 86 | for transition in transitions: 87 | # process transitions with the initial goal state 88 | self._append_origin_transitions( 89 | origin_transitions, transition, desired_state 90 | ) 91 | 92 | # do not need to append new transitions if sum of reward is big enough 93 | if not is_demo and score <= success_score: 94 | self._append_new_transitions(new_transitions, transition, final_state) 95 | 96 | return origin_transitions + new_transitions 97 | 98 | def __str__(self): 99 | return self.__class__.__name__ 100 | -------------------------------------------------------------------------------- /rl_algorithms/common/saliency_map.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Functions for Saliency map. 
3 | 4 | - Author: Euijin Jeong 5 | - Contact: euijin.jeong@medipixel.io 6 | """ 7 | 8 | import os 9 | import pickle 10 | import shutil 11 | 12 | from PIL import Image 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | import torch 16 | 17 | plt.rcParams["figure.figsize"] = (10.0, 8.0) # set default size of plots 18 | plt.rcParams["image.interpolation"] = "nearest" 19 | plt.rcParams["image.cmap"] = "gray" 20 | 21 | 22 | def make_saliency_dir(date_time: str) -> str: 23 | """Make directories for saving saliency map result.""" 24 | save_dir = f"./data/saliency_map/{date_time}" 25 | if os.path.exists(save_dir): 26 | shutil.rmtree(save_dir) 27 | os.makedirs(save_dir) 28 | os.makedirs(f"./data/saliency_map/{date_time}/input_image") 29 | os.makedirs(f"./data/saliency_map/{date_time}/state") 30 | os.makedirs(f"./data/saliency_map/{date_time}/saliency") 31 | os.makedirs(f"./data/saliency_map/{date_time}/overlay") 32 | saliency_map_dir = f"./data/saliency_map/{date_time}/" 33 | return saliency_map_dir 34 | 35 | 36 | def compute_saliency_maps(X, y, model, device): 37 | """Compute a class saliency map using the model for images X and labels y.""" 38 | 39 | # Make input tensor require gradient 40 | if isinstance(X, list): # For R2D1 41 | input_list = [] 42 | for x in X: 43 | input_list.append(x.requires_grad_()) 44 | 45 | saliency = None 46 | X = input_list 47 | scores, _ = model(X[0], X[1], X[2], X[3]) 48 | scores = (scores.gather(1, y.unsqueeze(0))).squeeze(0) 49 | scores.backward(torch.FloatTensor([1.0]).to(device)) 50 | saliency, _ = torch.max(X[0].grad.data.abs(), dim=1) 51 | else: 52 | X.requires_grad_() 53 | 54 | saliency = None 55 | scores = model(X) 56 | scores = (scores.gather(1, y.unsqueeze(0))).squeeze(0) 57 | scores.backward(torch.FloatTensor([1.0]).to(device)) 58 | saliency, _ = torch.max(X.grad.data.abs(), dim=1) 59 | 60 | return saliency 61 | 62 | 63 | def save_saliency_maps(i, X, y, model, device, saliency_map_dir): 64 | """Make and save saliency maps in directory.""" 65 | 66 | # Convert X and y from numpy arrays to Torch Tensors 67 | if isinstance(X, tuple): # For R2D1 68 | input_image = X[0][-1] 69 | X_tensor = [] 70 | for x in X: 71 | if not isinstance(x, torch.Tensor): 72 | X_tensor.append(torch.Tensor(x).float().to(device).unsqueeze(0)) 73 | else: 74 | X_tensor.append(x) 75 | else: 76 | input_image = X[-1] 77 | X_tensor = torch.Tensor(X).float().to(device).unsqueeze(0) 78 | y = int(y) 79 | y_tensor = torch.LongTensor([y]).to(device) 80 | 81 | # Compute saliency maps for images in X 82 | saliency = compute_saliency_maps(X_tensor, y_tensor, model, device) 83 | 84 | # image 85 | saliency = saliency.cpu().numpy() 86 | saliency = np.flip(saliency, axis=1) 87 | input_image = np.rot90(input_image, 3) 88 | input_image = Image.fromarray(np.uint8(input_image * 255.0)) 89 | input_image.save(saliency_map_dir + "/input_image/{}.png".format(i)) 90 | 91 | # numpy array 92 | with open(saliency_map_dir + "/state/{}.pkl".format(i), "wb") as f: 93 | pickle.dump(X, f) 94 | 95 | cmap = plt.cm.hot 96 | norm = plt.Normalize(saliency.min(), saliency.max()) 97 | saliency = cmap(norm(saliency[0])) 98 | saliency = np.rot90(saliency, 3) 99 | saliency = Image.fromarray(np.uint8(saliency * 255.0)) 100 | saliency.save(saliency_map_dir + "/saliency/{}.png".format(i)) 101 | 102 | overlay = Image.blend(input_image.convert("RGBA"), saliency, alpha=0.5) 103 | overlay.save(saliency_map_dir + "/overlay/{}.png".format(i)) 104 | return saliency 105 | 
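A minimal usage sketch for the saliency helpers above, assuming a randomly initialized stand-in Q-network and a random 4x84x84 frame stack; the network architecture, state shape, and action count below are illustrative assumptions, not code from this repository:

import datetime

import numpy as np
import torch
import torch.nn as nn

from rl_algorithms.common.saliency_map import make_saliency_dir, save_saliency_maps

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Stand-in Q-network: 4 stacked 84x84 frames -> 6 action values (assumed shapes).
model = nn.Sequential(
    nn.Conv2d(4, 8, kernel_size=8, stride=4),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(8 * 20 * 20, 6),
).to(device)

state = np.random.rand(4, 84, 84).astype(np.float32)  # fake preprocessed state

# Pick the greedy action; its score is what the saliency map is computed against.
with torch.no_grad():
    q_values = model(torch.from_numpy(state).unsqueeze(0).to(device))
action = int(q_values.argmax(dim=-1).item())

# Writes input_image/, state/, saliency/ and overlay/ outputs for step 0.
saliency_map_dir = make_saliency_dir(datetime.datetime.now().strftime("%y%m%d_%H%M%S"))
save_saliency_maps(0, state, action, model, device, saliency_map_dir)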
-------------------------------------------------------------------------------- /rl_algorithms/acer/buffer.py: -------------------------------------------------------------------------------- 1 | # TODO : Move to common buffer 2 | import random 3 | from typing import Tuple 4 | 5 | import numpy as np 6 | import torch 7 | 8 | from rl_algorithms.common.abstract.buffer import BaseBuffer 9 | 10 | 11 | class ReplayMemory(BaseBuffer): 12 | """ReplayMemory for ACER. 13 | 14 | Attributes: 15 | obs_buf (np.ndarray): observations 16 | acts_buf (np.ndarray): actions 17 | rews_buf (np.ndarray): rewards 18 | probs_buf (np.ndarray): probability of actions 19 | done_buf (np.ndarray): dones 20 | buffer_size (int): size of buffers 21 | n_rollout (int): number of steps per rollout 22 | num_in_buffer (int): amount of memory filled 23 | idx (int): memory index to add the next incoming transition 24 | """ 25 | 26 | def __init__(self, buffer_size: int, n_rollout: int): 27 | """Initialize a ReplayMemory object.""" 28 | self.obs_buf = None 29 | self.acts_buf = None 30 | self.rews_buf = None 31 | self.probs_buf = None 32 | self.done_buf = None 33 | self.buffer_size = buffer_size 34 | self.idx = 0 35 | self.num_in_buffer = 0 36 | self.n_rollout = n_rollout 37 | 38 | def add(self, seq_data: list): 39 | """Add a new experience to memory. 40 | If the buffer is empty, it is first initialized to match the shapes of the given arguments. 41 | """ 42 | if self.num_in_buffer == 0: 43 | state, action, reward, prob, done_mask = seq_data[0] 44 | self._initialize_buffers(state, prob) 45 | 46 | self.idx = (self.idx + 1) % (self.buffer_size - 1) 47 | 48 | for i, transition in enumerate(seq_data): 49 | state, action, reward, prob, done_mask = transition 50 | self.obs_buf[self.idx][i] = state 51 | self.acts_buf[self.idx][i] = action 52 | self.rews_buf[self.idx][i] = reward 53 | self.probs_buf[self.idx][i] = prob 54 | self.done_buf[self.idx][i] = done_mask 55 | 56 | self.num_in_buffer += 1 57 | self.num_in_buffer = min(self.buffer_size - 1, self.num_in_buffer) 58 | 59 | def _initialize_buffers(self, state: np.ndarray, probs: np.ndarray): 60 | """Initialize buffers for state, action, reward, prob, done.""" 61 | self.obs_buf = np.zeros( 62 | [self.buffer_size, self.n_rollout] + list(state.shape), dtype=state.dtype 63 | ) 64 | self.acts_buf = np.zeros([self.buffer_size, self.n_rollout, 1], dtype=np.uint8) 65 | self.rews_buf = np.zeros( 66 | [self.buffer_size, self.n_rollout, 1], dtype=np.float64 67 | ) 68 | self.probs_buf = np.zeros( 69 | [self.buffer_size, self.n_rollout] + list(probs.shape), dtype=probs.dtype 70 | ) 71 | self.done_buf = np.zeros([self.buffer_size, self.n_rollout, 1]) 72 | 73 | def sample(self, on_policy=False) -> Tuple[torch.Tensor, ...]: 74 | """Randomly sample a batch of experiences from memory.
75 | If on_policy, using last experience.""" 76 | 77 | if on_policy: 78 | state = self.obs_buf[self.idx] 79 | action = self.acts_buf[self.idx] 80 | reward = self.rews_buf[self.idx] 81 | prob = self.probs_buf[self.idx] 82 | done = self.done_buf[self.idx] 83 | 84 | else: 85 | idx = random.randint(1, self.num_in_buffer) 86 | state = self.obs_buf[idx] 87 | action = self.acts_buf[idx] 88 | reward = self.rews_buf[idx] 89 | prob = self.probs_buf[idx] 90 | done = self.done_buf[idx] 91 | 92 | state = torch.FloatTensor(state) 93 | action = torch.LongTensor(action) 94 | reward = torch.FloatTensor(reward) 95 | prob = torch.FloatTensor(prob) 96 | done = torch.FloatTensor(done) 97 | 98 | return state, action, reward, prob, done 99 | 100 | def __len__(self) -> int: 101 | """Return the current size of internal memory.""" 102 | return self.num_in_buffer 103 | -------------------------------------------------------------------------------- /run_lunarlander_v2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Train or test algorithms on LunarLander-v2. 3 | 4 | - Author: Curt Park 5 | - Contact: curt.park@medipixel.io 6 | """ 7 | 8 | import argparse 9 | import datetime 10 | 11 | import gym 12 | 13 | from rl_algorithms import build_agent 14 | import rl_algorithms.common.env.utils as env_utils 15 | import rl_algorithms.common.helper_functions as common_utils 16 | from rl_algorithms.utils import YamlConfig 17 | 18 | 19 | def parse_args() -> argparse.Namespace: 20 | # configurations 21 | parser = argparse.ArgumentParser(description="Pytorch RL algorithms") 22 | parser.add_argument( 23 | "--seed", type=int, default=777, help="random seed for reproducibility" 24 | ) 25 | parser.add_argument( 26 | "--integration-test", 27 | dest="integration_test", 28 | action="store_true", 29 | help="for integration test", 30 | ) 31 | parser.add_argument( 32 | "--cfg-path", 33 | type=str, 34 | default="./configs/lunarlander_v2/dqn.yaml", 35 | help="config path", 36 | ) 37 | parser.add_argument( 38 | "--test", dest="test", action="store_true", help="test mode (no training)" 39 | ) 40 | parser.add_argument( 41 | "--load-from", 42 | type=str, 43 | default=None, 44 | help="load the saved model and optimizer at the beginning", 45 | ) 46 | parser.add_argument( 47 | "--off-render", dest="render", action="store_false", help="turn off rendering" 48 | ) 49 | parser.add_argument( 50 | "--render-after", 51 | type=int, 52 | default=0, 53 | help="start rendering after the input number of episode", 54 | ) 55 | parser.add_argument( 56 | "--log", dest="log", action="store_true", help="turn on logging" 57 | ) 58 | parser.add_argument( 59 | "--save-period", type=int, default=100, help="save model period" 60 | ) 61 | parser.add_argument( 62 | "--episode-num", type=int, default=1500, help="total episode num" 63 | ) 64 | parser.add_argument( 65 | "--max-episode-steps", type=int, default=300, help="max episode step" 66 | ) 67 | parser.add_argument( 68 | "--interim-test-num", 69 | type=int, 70 | default=10, 71 | help="number of test during training", 72 | ) 73 | 74 | return parser.parse_args() 75 | 76 | 77 | def main(): 78 | """Main.""" 79 | args = parse_args() 80 | 81 | # env initialization 82 | env_name = "LunarLander-v2" 83 | env = gym.make(env_name) 84 | env, max_episode_steps = env_utils.set_env(env, args.max_episode_steps) 85 | 86 | # set a random seed 87 | common_utils.set_random_seed(args.seed, env) 88 | 89 | # run 90 | NOWTIMES = datetime.datetime.now() 91 | curr_time = 
NOWTIMES.strftime("%y%m%d_%H%M%S") 92 | 93 | cfg = YamlConfig(dict(agent=args.cfg_path)).get_config_dict() 94 | 95 | # If running integration test, simplify experiment 96 | if args.integration_test: 97 | cfg = common_utils.set_cfg_for_intergration_test(cfg) 98 | 99 | env_info = dict( 100 | name=env.spec.id, 101 | observation_space=env.observation_space, 102 | action_space=env.action_space, 103 | is_atari=False, 104 | ) 105 | log_cfg = dict(agent=cfg.agent.type, curr_time=curr_time, cfg_path=args.cfg_path) 106 | build_args = dict( 107 | env=env, 108 | env_info=env_info, 109 | log_cfg=log_cfg, 110 | is_test=args.test, 111 | load_from=args.load_from, 112 | is_render=args.render, 113 | render_after=args.render_after, 114 | is_log=args.log, 115 | save_period=args.save_period, 116 | episode_num=args.episode_num, 117 | max_episode_steps=max_episode_steps, 118 | interim_test_num=args.interim_test_num, 119 | ) 120 | agent = build_agent(cfg.agent, build_args) 121 | 122 | if not args.test: 123 | agent.train() 124 | else: 125 | agent.test() 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /rl_algorithms/fd/dqn_learner.py: -------------------------------------------------------------------------------- 1 | """Learner for DQfD Agent. 2 | 3 | - Author: Kyunghwan Kim 4 | - Contact: kh.kim@medipixel.io 5 | """ 6 | 7 | from typing import Tuple, Union 8 | 9 | import numpy as np 10 | import torch 11 | from torch.nn.utils import clip_grad_norm_ 12 | 13 | from rl_algorithms.common.abstract.learner import TensorTuple 14 | import rl_algorithms.common.helper_functions as common_utils 15 | from rl_algorithms.dqn.learner import DQNLearner 16 | from rl_algorithms.registry import LEARNERS 17 | 18 | 19 | @LEARNERS.register_module 20 | class DQfDLearner(DQNLearner): 21 | """Learner for DQfD Agent.""" 22 | 23 | def update_model( 24 | self, experience: Union[TensorTuple, Tuple[TensorTuple]] 25 | ) -> TensorTuple: # type: ignore 26 | """Train the model after each episode.""" 27 | if self.use_n_step: 28 | experience_1, experience_n = experience 29 | else: 30 | experience_1 = experience 31 | 32 | weights, indices, eps_d = experience_1[-3:] 33 | actions = experience_1[1] 34 | 35 | # 1 step loss 36 | gamma = self.hyper_params.gamma 37 | dq_loss_element_wise, q_values = self.loss_fn( 38 | self.dqn, self.dqn_target, experience_1, gamma, self.head_cfg 39 | ) 40 | dq_loss = torch.mean(dq_loss_element_wise * weights) 41 | 42 | # n step loss 43 | if self.use_n_step: 44 | gamma = self.hyper_params.gamma ** self.hyper_params.n_step 45 | dq_loss_n_element_wise, q_values_n = self.loss_fn( 46 | self.dqn, self.dqn_target, experience_n, gamma, self.head_cfg 47 | ) 48 | 49 | # to update loss and priorities 50 | q_values = 0.5 * (q_values + q_values_n) 51 | dq_loss_element_wise += dq_loss_n_element_wise * self.hyper_params.lambda1 52 | dq_loss = torch.mean(dq_loss_element_wise * weights) 53 | 54 | # supervised (large-margin) loss, applied only to demo transitions 55 | demo_idxs = np.where(eps_d != 0.0) 56 | n_demo = demo_idxs[0].size 57 | if n_demo != 0: # if 1 or more demos are sampled 58 | # get margin for each demo transition 59 | action_idxs = actions[demo_idxs].long() 60 | margin = torch.ones(q_values.size()) * self.hyper_params.margin 61 | margin[demo_idxs, action_idxs] = 0.0 # demo actions have 0 margins 62 | margin = margin.to(self.device) 63 | 64 | # calculate supervised loss 65 | demo_q_values = q_values[demo_idxs, action_idxs].squeeze() 66 |
supervised_loss = torch.max(q_values + margin, dim=-1)[0] 67 | supervised_loss = supervised_loss[demo_idxs] - demo_q_values 68 | supervised_loss = torch.mean(supervised_loss) * self.hyper_params.lambda2 69 | else: # no demo sampled 70 | supervised_loss = torch.zeros(1, device=self.device) 71 | 72 | # q_value regularization 73 | q_regular = torch.norm(q_values, 2).mean() * self.hyper_params.w_q_reg 74 | 75 | # total loss 76 | loss = dq_loss + supervised_loss + q_regular 77 | 78 | # train dqn 79 | self.dqn_optim.zero_grad() 80 | loss.backward() 81 | clip_grad_norm_(self.dqn.parameters(), self.hyper_params.gradient_clip) 82 | self.dqn_optim.step() 83 | 84 | # update target networks 85 | common_utils.soft_update(self.dqn, self.dqn_target, self.hyper_params.tau) 86 | 87 | # update priorities in PER 88 | loss_for_prior = dq_loss_element_wise.detach().cpu().numpy().squeeze() 89 | new_priorities = loss_for_prior + self.hyper_params.per_eps 90 | new_priorities += eps_d 91 | 92 | if self.head_cfg.configs.use_noisy_net: 93 | self.dqn.head.reset_noise() 94 | self.dqn_target.head.reset_noise() 95 | 96 | return ( 97 | loss.item(), 98 | dq_loss.item(), 99 | supervised_loss.item(), 100 | q_values.mean().item(), 101 | n_demo, 102 | indices, 103 | new_priorities, 104 | ) 105 | -------------------------------------------------------------------------------- /run_lunarlander_continuous_v2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Train or test algorithms on LunarLanderContinuous-v2. 3 | 4 | - Author: Curt Park 5 | - Contact: curt.park@medipixel.io 6 | """ 7 | 8 | import argparse 9 | import datetime 10 | 11 | import gym 12 | 13 | from rl_algorithms import build_agent 14 | import rl_algorithms.common.env.utils as env_utils 15 | import rl_algorithms.common.helper_functions as common_utils 16 | from rl_algorithms.utils import YamlConfig 17 | 18 | 19 | def parse_args() -> argparse.Namespace: 20 | # configurations 21 | parser = argparse.ArgumentParser(description="Pytorch RL algorithms") 22 | parser.add_argument( 23 | "--seed", type=int, default=777, help="random seed for reproducibility" 24 | ) 25 | parser.add_argument( 26 | "--integration-test", 27 | dest="integration_test", 28 | action="store_true", 29 | help="for integration test", 30 | ) 31 | parser.add_argument( 32 | "--cfg-path", 33 | type=str, 34 | default="./configs/lunarlander_continuous_v2/ddpg.yaml", 35 | help="config path", 36 | ) 37 | parser.add_argument( 38 | "--test", dest="test", action="store_true", help="test mode (no training)" 39 | ) 40 | parser.add_argument( 41 | "--load-from", 42 | type=str, 43 | default=None, 44 | help="load the saved model and optimizer at the beginning", 45 | ) 46 | parser.add_argument( 47 | "--off-render", dest="render", action="store_false", help="turn off rendering" 48 | ) 49 | parser.add_argument( 50 | "--render-after", 51 | type=int, 52 | default=0, 53 | help="start rendering after the input number of episode", 54 | ) 55 | parser.add_argument( 56 | "--log", dest="log", action="store_true", help="turn on logging" 57 | ) 58 | parser.add_argument( 59 | "--save-period", type=int, default=100, help="save model period" 60 | ) 61 | parser.add_argument( 62 | "--episode-num", type=int, default=1500, help="total episode num" 63 | ) 64 | parser.add_argument( 65 | "--max-episode-steps", type=int, default=300, help="max episode step" 66 | ) 67 | parser.add_argument( 68 | "--interim-test-num", 69 | type=int, 70 | default=10, 71 | help="number of test 
during training", 72 | ) 73 | 74 | return parser.parse_args() 75 | 76 | 77 | def main(): 78 | """Main.""" 79 | args = parse_args() 80 | 81 | # env initialization 82 | env_name = "LunarLanderContinuous-v2" 83 | env = gym.make(env_name) 84 | env, max_episode_steps = env_utils.set_env(env, args.max_episode_steps) 85 | 86 | # set a random seed 87 | common_utils.set_random_seed(args.seed, env) 88 | 89 | # run 90 | NOWTIMES = datetime.datetime.now() 91 | curr_time = NOWTIMES.strftime("%y%m%d_%H%M%S") 92 | 93 | cfg = YamlConfig(dict(agent=args.cfg_path)).get_config_dict() 94 | 95 | # If running integration test, simplify experiment 96 | if args.integration_test: 97 | cfg = common_utils.set_cfg_for_intergration_test(cfg) 98 | 99 | env_info = dict( 100 | name=env.spec.id, 101 | observation_space=env.observation_space, 102 | action_space=env.action_space, 103 | is_atari=False, 104 | ) 105 | log_cfg = dict(agent=cfg.agent.type, curr_time=curr_time, cfg_path=args.cfg_path) 106 | build_args = dict( 107 | env=env, 108 | env_info=env_info, 109 | log_cfg=log_cfg, 110 | is_test=args.test, 111 | load_from=args.load_from, 112 | is_render=args.render, 113 | render_after=args.render_after, 114 | is_log=args.log, 115 | save_period=args.save_period, 116 | episode_num=args.episode_num, 117 | max_episode_steps=max_episode_steps, 118 | interim_test_num=args.interim_test_num, 119 | ) 120 | agent = build_agent(cfg.agent, build_args) 121 | 122 | if not args.test: 123 | agent.train() 124 | else: 125 | agent.test() 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /rl_algorithms/utils/registry.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | import ray 4 | 5 | from rl_algorithms.utils.config import ConfigDict 6 | 7 | 8 | class Registry: 9 | def __init__(self, name): 10 | self._name = name 11 | self._module_dict = dict() 12 | 13 | def __repr__(self): 14 | format_str = self.__class__.__name__ + "(name={}, items={})".format( 15 | self._name, list(self._module_dict.keys()) 16 | ) 17 | return format_str 18 | 19 | @property 20 | def name(self): 21 | return self._name 22 | 23 | @property 24 | def module_dict(self): 25 | return self._module_dict 26 | 27 | def get(self, key): 28 | return self._module_dict.get(key, None) 29 | 30 | def _register_module(self, module_class): 31 | """Register a module. 32 | Args: 33 | module (:obj:`nn.Module`): Module to be registered. 34 | """ 35 | if not inspect.isclass(module_class): 36 | raise TypeError( 37 | "module must be a class, but got {}".format(type(module_class)) 38 | ) 39 | module_name = module_class.__name__ 40 | if module_name in self._module_dict: 41 | raise KeyError( 42 | "{} is already registered in {}".format(module_name, self.name) 43 | ) 44 | self._module_dict[module_name] = module_class 45 | 46 | def register_module(self, cls): 47 | self._register_module(cls) 48 | return cls 49 | 50 | 51 | def build_from_cfg(cfg: ConfigDict, registry: Registry, default_args: dict = None): 52 | """Build a module from config dict. 53 | Args: 54 | cfg (:obj: `ConfigDict`): Config dict. It should at least contain the key "type". 55 | registry (:obj:`Registry`): The registry to search the type from. 56 | default_args (dict, optional): Default initialization arguments. 57 | Returns: 58 | obj: The constructed object. 
59 | """ 60 | assert isinstance(cfg, dict) and "type" in cfg 61 | assert isinstance(default_args, dict) or default_args is None 62 | args = cfg.copy() 63 | obj_type = args.pop("type") 64 | if isinstance(obj_type, str): 65 | obj_cls = registry.get(obj_type) 66 | if obj_cls is None: 67 | raise KeyError( 68 | "{} is not in the {} registry".format(obj_type, registry.name) 69 | ) 70 | elif inspect.isclass(obj_type): 71 | obj_cls = obj_type 72 | else: 73 | raise TypeError( 74 | "type must be a str or valid type, but got {}".format(type(obj_type)) 75 | ) 76 | 77 | if default_args is not None: 78 | for name, value in default_args.items(): 79 | args.setdefault(name, value) 80 | return obj_cls(**args) 81 | 82 | 83 | def build_ray_obj_from_cfg( 84 | cfg: ConfigDict, registry: Registry, default_args: dict = None 85 | ): 86 | """Build a module from config dict. 87 | Args: 88 | cfg (:obj: `ConfigDict`): Config dict. It should at least contain the key "type". 89 | registry (:obj:`Registry`): The registry to search the type from. 90 | default_args (dict, optional): Default initialization arguments. 91 | Returns: 92 | obj: The constructed object. 93 | """ 94 | assert isinstance(cfg, dict) and "type" in cfg 95 | assert isinstance(default_args, dict) or default_args is None 96 | args = cfg.copy() 97 | obj_type = args.pop("type") 98 | if isinstance(obj_type, str): 99 | obj_cls = registry.get(obj_type) 100 | if obj_cls is None: 101 | raise KeyError( 102 | "{} is not in the {} registry".format(obj_type, registry.name) 103 | ) 104 | elif inspect.isclass(obj_type): 105 | obj_cls = obj_type 106 | else: 107 | raise TypeError( 108 | "type must be a str or valid type, but got {}".format(type(obj_type)) 109 | ) 110 | 111 | if default_args is not None: 112 | for name, value in default_args.items(): 113 | args.setdefault(name, value) 114 | return ray.remote(num_cpus=1)(obj_cls).remote(**args) 115 | -------------------------------------------------------------------------------- /rl_algorithms/bc/ddpg_learner.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.nn.utils import clip_grad_norm_ 4 | 5 | from rl_algorithms.common.abstract.learner import TensorTuple 6 | import rl_algorithms.common.helper_functions as common_utils 7 | from rl_algorithms.ddpg.learner import DDPGLearner 8 | from rl_algorithms.registry import LEARNERS 9 | 10 | 11 | @LEARNERS.register_module 12 | class BCDDPGLearner(DDPGLearner): 13 | """Learner for BCDDPG Agent. 
14 | 15 | Attributes: 16 | hyper_params (ConfigDict): hyper-parameters 17 | optim_cfg (ConfigDict): config of optimizer 18 | log_cfg (ConfigDict): configuration for saving log and checkpoint 19 | actor (nn.Module): actor model to select actions 20 | actor_target (nn.Module): target actor model to select actions 21 | critic (nn.Module): critic model to predict state values 22 | critic_target (nn.Module): target critic model to predict state values 23 | actor_optim (Optimizer): optimizer for training actor 24 | critic_optim (Optimizer): optimizer for training critic 25 | 26 | """ 27 | 28 | def update_model( 29 | self, experience: TensorTuple, demos: TensorTuple 30 | ) -> TensorTuple: # type: ignore 31 | """Update actor and critic networks.""" 32 | exp_states, exp_actions, exp_rewards, exp_next_states, exp_dones = experience 33 | demo_states, demo_actions, demo_rewards, demo_next_states, demo_dones = demos 34 | 35 | states = torch.cat((exp_states, demo_states), dim=0) 36 | actions = torch.cat((exp_actions, demo_actions), dim=0) 37 | rewards = torch.cat((exp_rewards, demo_rewards), dim=0) 38 | next_states = torch.cat((exp_next_states, demo_next_states), dim=0) 39 | dones = torch.cat((exp_dones, demo_dones), dim=0) 40 | 41 | # G_t = r + gamma * v(s_{t+1}) if state != Terminal 42 | # = r otherwise 43 | masks = 1 - dones 44 | next_actions = self.actor_target(next_states) 45 | next_values = self.critic_target(torch.cat((next_states, next_actions), dim=-1)) 46 | curr_returns = rewards + (self.hyper_params.gamma * next_values * masks) 47 | curr_returns = curr_returns.to(self.device) 48 | 49 | # critic loss 50 | gradient_clip_ac = self.hyper_params.gradient_clip_ac 51 | gradient_clip_cr = self.hyper_params.gradient_clip_cr 52 | 53 | values = self.critic(torch.cat((states, actions), dim=-1)) 54 | critic_loss = F.mse_loss(values, curr_returns) 55 | 56 | # train critic 57 | self.critic_optim.zero_grad() 58 | critic_loss.backward() 59 | clip_grad_norm_(self.critic.parameters(), gradient_clip_cr) 60 | self.critic_optim.step() 61 | 62 | # policy loss 63 | actions = self.actor(states) 64 | policy_loss = -self.critic(torch.cat((states, actions), dim=-1)).mean() 65 | 66 | # bc loss 67 | pred_actions = self.actor(demo_states) 68 | qf_mask = torch.gt( 69 | self.critic(torch.cat((demo_states, demo_actions), dim=-1)), 70 | self.critic(torch.cat((demo_states, pred_actions), dim=-1)), 71 | ).to(self.device) 72 | qf_mask = qf_mask.float() 73 | n_qf_mask = int(qf_mask.sum().item()) 74 | 75 | if n_qf_mask == 0: 76 | bc_loss = torch.zeros(1, device=self.device) 77 | else: 78 | bc_loss = ( 79 | torch.mul(pred_actions, qf_mask) - torch.mul(demo_actions, qf_mask) 80 | ).pow(2).sum() / n_qf_mask 81 | 82 | # train actor: pg loss + BC loss 83 | actor_loss = ( 84 | self.hyper_params.lambda1 * policy_loss 85 | + self.hyper_params.lambda2 * bc_loss 86 | ) 87 | self.actor_optim.zero_grad() 88 | actor_loss.backward() 89 | clip_grad_norm_(self.actor.parameters(), gradient_clip_ac) 90 | self.actor_optim.step() 91 | 92 | # update target networks 93 | common_utils.soft_update(self.actor, self.actor_target, self.hyper_params.tau) 94 | common_utils.soft_update(self.critic, self.critic_target, self.hyper_params.tau) 95 | 96 | return actor_loss.item(), critic_loss.item(), n_qf_mask 97 | -------------------------------------------------------------------------------- /run_reacher_v2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Train or test algorithms on Reacher-v2 
of Mujoco. 3 | 4 | - Author: Kyunghwan Kim 5 | - Contact: kh.kim@medipixel.io 6 | """ 7 | 8 | import argparse 9 | import datetime 10 | 11 | import gym 12 | 13 | from rl_algorithms import build_agent 14 | import rl_algorithms.common.env.utils as env_utils 15 | import rl_algorithms.common.helper_functions as common_utils 16 | from rl_algorithms.utils import YamlConfig 17 | 18 | 19 | def parse_args() -> argparse.Namespace: 20 | # configurations 21 | parser = argparse.ArgumentParser(description="Pytorch RL rl_algorithms") 22 | parser.add_argument( 23 | "--seed", type=int, default=777, help="random seed for reproducibility" 24 | ) 25 | parser.add_argument("--algo", type=str, default="ddpg", help="choose an algorithm") 26 | parser.add_argument( 27 | "--cfg-path", 28 | type=str, 29 | default="./configs/reacher_v2/ddpg.yaml", 30 | help="config path", 31 | ) 32 | parser.add_argument( 33 | "--integration-test", 34 | dest="integration_test", 35 | action="store_true", 36 | help="for integration test", 37 | ) 38 | parser.add_argument( 39 | "--test", dest="test", action="store_true", help="test mode (no training)" 40 | ) 41 | parser.add_argument( 42 | "--load-from", 43 | type=str, 44 | default=None, 45 | help="load the saved model and optimizer at the beginning", 46 | ) 47 | parser.add_argument( 48 | "--off-render", dest="render", action="store_false", help="turn off rendering" 49 | ) 50 | parser.add_argument( 51 | "--render-after", 52 | type=int, 53 | default=0, 54 | help="start rendering after the input number of episode", 55 | ) 56 | parser.add_argument( 57 | "--log", dest="log", action="store_true", help="turn on logging" 58 | ) 59 | parser.add_argument( 60 | "--save-period", type=int, default=200, help="save model period" 61 | ) 62 | parser.add_argument( 63 | "--episode-num", type=int, default=20000, help="total episode num" 64 | ) 65 | parser.add_argument( 66 | "--max-episode-steps", type=int, default=-1, help="max episode step" 67 | ) 68 | parser.add_argument( 69 | "--interim-test-num", 70 | type=int, 71 | default=10, 72 | help="number of test during training", 73 | ) 74 | 75 | return parser.parse_args() 76 | 77 | 78 | def main(): 79 | """Main.""" 80 | args = parse_args() 81 | 82 | # env initialization 83 | env_name = "Reacher-v2" 84 | env = gym.make(env_name) 85 | env, max_episode_steps = env_utils.set_env(env, args.max_episode_steps) 86 | 87 | # set a random seed 88 | common_utils.set_random_seed(args.seed, env) 89 | 90 | # run 91 | NOWTIMES = datetime.datetime.now() 92 | curr_time = NOWTIMES.strftime("%y%m%d_%H%M%S") 93 | 94 | cfg = YamlConfig(dict(agent=args.cfg_path)).get_config_dict() 95 | 96 | # If running integration test, simplify experiment 97 | if args.integration_test: 98 | cfg = common_utils.set_cfg_for_intergration_test(cfg) 99 | 100 | env_info = dict( 101 | name=env.spec.id, 102 | observation_space=env.observation_space, 103 | action_space=env.action_space, 104 | is_atari=False, 105 | ) 106 | log_cfg = dict(agent=cfg.agent.type, curr_time=curr_time, cfg_path=args.cfg_path) 107 | build_args = dict( 108 | env=env, 109 | env_info=env_info, 110 | log_cfg=log_cfg, 111 | is_test=args.test, 112 | load_from=args.load_from, 113 | is_render=args.render, 114 | render_after=args.render_after, 115 | is_log=args.log, 116 | save_period=args.save_period, 117 | episode_num=args.episode_num, 118 | max_episode_steps=max_episode_steps, 119 | interim_test_num=args.interim_test_num, 120 | ) 121 | agent = build_agent(cfg.agent, build_args) 122 | 123 | if not args.test: 124 | agent.train() 125 | else: 126 
| agent.test() 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /rl_algorithms/common/buffer/distillation_buffer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Distillation buffer.""" 3 | 4 | import os 5 | import pickle 6 | from typing import List 7 | 8 | import torch 9 | from torch.utils.data import DataLoader, Dataset 10 | 11 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 12 | 13 | 14 | class DistillationBuffer: 15 | """Class for managing reading and writing of distillation data. 16 | Distillation data is stored on disk at the dataset_path locations. 17 | The data collected by the teacher is stored as individual pickle files. 18 | It is read back in batches through the PyTorch DataLoader class. 19 | 20 | Attributes: 21 | batch_size (int): batch size sampled from the distillation buffer for training 22 | dataset_path (list): list of distillation buffer path 23 | curr_time (str): program's start time to distinguish between teacher agents 24 | idx (int): index of data 25 | buffer_size (int): distillation buffer size 26 | dataloader (DataLoader): pytorch library for random batch data sampling 27 | 28 | """ 29 | 30 | def __init__( 31 | self, 32 | batch_size: int, 33 | dataset_path: List[str], 34 | ): 35 | """Initialize a DistillationBuffer object. 36 | 37 | Args: 38 | batch_size (int): size of a batch sampled from the distillation buffer for training 39 | dataset_path (list): list of distillation buffer path 40 | curr_time (str): program's start time to distinguish between teacher agents 41 | 42 | """ 43 | self.batch_size = batch_size 44 | self.dataset_path = dataset_path 45 | self.idx = 0 46 | self.buffer_size = 0 47 | self.dataloader = None 48 | self.is_contain_q = False 49 | 50 | def reset_dataloader(self): 51 | """Initialize and reset DataLoader class. 52 | DataLoader class must be reset for every epoch. 53 | """ 54 | dataset = DistillationDataset(self.dataset_path) 55 | self.is_contain_q = dataset.is_contain_q 56 | self.buffer_size = len(dataset) 57 | self.dataloader = iter( 58 | DataLoader(dataset, batch_size=self.batch_size, shuffle=True, num_workers=4) 59 | ) 60 | 61 | def sample_for_diltillation(self): 62 | """Sample a batch of state and Q-value for student's learning.""" 63 | assert ( 64 | self.buffer_size >= self.batch_size 65 | ), f"buffer size({self.buffer_size}) < ({self.batch_size})" 66 | 67 | return next(self.dataloader) 68 | 69 | 70 | class DistillationDataset(Dataset): 71 | """Pytorch Dataset class for random batch data sampling. 72 | 73 | Attributes: 74 | dataset_path (str): distillation buffer path 75 | 76 | """ 77 | 78 | def __init__(self, dataset_path: List[str]): 79 | """Initialize a DistillationDataset object.
80 | 81 | Args: 82 | dataset_path (str): distillation buffer path 83 | file_name_list (list): transition's file name list in distillation buffer path 84 | 85 | """ 86 | super().__init__() 87 | self.dataset_path = dataset_path 88 | self.file_name_list = [] 89 | 90 | sum_data_len = 0 91 | for _dir in self.dataset_path: 92 | tmp = os.listdir(_dir) 93 | self.file_name_list += [os.path.join(_dir, x) for x in tmp] 94 | with open(self.file_name_list[-1], "rb") as f: 95 | data = pickle.load(f) 96 | sum_data_len += int(len(data) == 2) 97 | 98 | if sum_data_len == len(self.dataset_path): 99 | self.is_contain_q = True 100 | elif sum_data_len == 0: 101 | self.is_contain_q = False 102 | else: 103 | raise AssertionError( 104 | "There is a mixture of data with q present and non-existent ones" 105 | + "in buffer-path." 106 | ) 107 | 108 | def __len__(self): 109 | """Denotes the total number of samples.""" 110 | return len(self.file_name_list) 111 | 112 | def __getitem__(self, index): 113 | """Generates one sample of data.""" 114 | with open(self.file_name_list[index], "rb") as f: 115 | transition = pickle.load(f) 116 | return transition 117 | -------------------------------------------------------------------------------- /rl_algorithms/dqn/linear.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Linear module for dqn algorithms 3 | 4 | - Author: Kyunghwan Kim 5 | - Contact: kh.kim@medipixel.io 6 | """ 7 | 8 | import math 9 | 10 | import numpy as np 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | from rl_algorithms.common.helper_functions import numpy2floattensor 16 | 17 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 18 | 19 | 20 | # TODO: Remove it when upgrade torch>=1.7 21 | # pylint: disable=abstract-method 22 | class NoisyLinear(nn.Module): 23 | """Noisy linear module for NoisyNet. 
24 | 25 | References: 26 | https://github.com/higgsfield/RL-Adventure/blob/master/5.noisy%20dqn.ipynb 27 | https://github.com/Kaixhin/Rainbow/blob/master/model.py 28 | 29 | Attributes: 30 | in_features (int): input size of linear module 31 | out_features (int): output size of linear module 32 | std_init (float): initial std value 33 | weight_mu (nn.Parameter): mean value weight parameter 34 | weight_sigma (nn.Parameter): std value weight parameter 35 | bias_mu (nn.Parameter): mean value bias parameter 36 | bias_sigma (nn.Parameter): std value bias parameter 37 | 38 | """ 39 | 40 | def __init__(self, in_features: int, out_features: int, std_init: float = 0.5): 41 | """Initialize.""" 42 | super(NoisyLinear, self).__init__() 43 | self.in_features = in_features 44 | self.out_features = out_features 45 | self.std_init = std_init 46 | 47 | self.weight_mu = nn.Parameter(torch.Tensor(out_features, in_features)) 48 | self.weight_sigma = nn.Parameter(torch.Tensor(out_features, in_features)) 49 | self.register_buffer("weight_epsilon", torch.Tensor(out_features, in_features)) 50 | 51 | self.bias_mu = nn.Parameter(torch.Tensor(out_features)) 52 | self.bias_sigma = nn.Parameter(torch.Tensor(out_features)) 53 | self.register_buffer("bias_epsilon", torch.Tensor(out_features)) 54 | 55 | self.reset_parameters() 56 | self.reset_noise() 57 | 58 | def reset_parameters(self): 59 | """Reset trainable network parameters (factorized gaussian noise).""" 60 | mu_range = 1 / math.sqrt(self.in_features) 61 | self.weight_mu.data.uniform_(-mu_range, mu_range) 62 | self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.in_features)) 63 | self.bias_mu.data.uniform_(-mu_range, mu_range) 64 | self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.out_features)) 65 | 66 | @staticmethod 67 | def scale_noise(size: int) -> torch.Tensor: 68 | """Set scale to make noise (factorized gaussian noise).""" 69 | x = numpy2floattensor(np.random.normal(loc=0.0, scale=1.0, size=size), device) 70 | 71 | return x.sign().mul(x.abs().sqrt()) 72 | 73 | def reset_noise(self): 74 | """Make new noise.""" 75 | epsilon_in = self.scale_noise(self.in_features) 76 | epsilon_out = self.scale_noise(self.out_features) 77 | 78 | # outer product 79 | self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in)) 80 | self.bias_epsilon.copy_(epsilon_out) 81 | 82 | def forward(self, x: torch.Tensor) -> torch.Tensor: 83 | """Forward method implementation. 84 | 85 | We don't use separate statements on train / eval mode. 86 | It doesn't show remarkable difference of performance. 87 | """ 88 | return F.linear( 89 | x, 90 | self.weight_mu + self.weight_sigma * self.weight_epsilon, 91 | self.bias_mu + self.bias_sigma * self.bias_epsilon, 92 | ) 93 | 94 | 95 | class NoisyLinearConstructor: 96 | """Constructor class for changing hyper parameters of NoisyLinear. 
97 | 98 | Attributes: 99 | std_init (float): initial std value 100 | 101 | """ 102 | 103 | def __init__(self, std_init: float = 0.5): 104 | """Initialize.""" 105 | self.std_init = std_init 106 | 107 | def __call__(self, in_features: int, out_features: int) -> NoisyLinear: 108 | """Return NoisyLinear instance set hyper parameters""" 109 | return NoisyLinear(in_features, out_features, self.std_init) 110 | 111 | 112 | class NoisyMLPHandler: 113 | """Includes methods to handle noisy linear.""" 114 | 115 | def reset_noise(self): 116 | """Re-sample noise""" 117 | for _, module in self.named_children(): 118 | module.reset_noise() 119 | -------------------------------------------------------------------------------- /rl_algorithms/fd/ddpg_learner.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Union 2 | 3 | import torch 4 | from torch.nn.utils import clip_grad_norm_ 5 | 6 | from rl_algorithms.common.abstract.learner import TensorTuple 7 | import rl_algorithms.common.helper_functions as common_utils 8 | from rl_algorithms.ddpg.learner import DDPGLearner 9 | from rl_algorithms.registry import LEARNERS 10 | 11 | 12 | @LEARNERS.register_module 13 | class DDPGfDLearner(DDPGLearner): 14 | """Learner for DDPGfD Agent. 15 | 16 | Attributes: 17 | hyper_params (ConfigDict): hyper-parameters 18 | optim_cfg (ConfigDict): config of optimizer 19 | log_cfg (ConfigDict): configuration for saving log and checkpoint 20 | actor (nn.Module): actor model to select actions 21 | actor_target (nn.Module): target actor model to select actions 22 | critic (nn.Module): critic model to predict state values 23 | critic_target (nn.Module): target critic model to predict state values 24 | actor_optim (Optimizer): optimizer for training actor 25 | critic_optim (Optimizer): optimizer for training critic 26 | 27 | """ 28 | 29 | def _get_critic_loss( 30 | self, experiences: Tuple[TensorTuple, ...], gamma: float 31 | ) -> torch.Tensor: 32 | """Return element-wise critic loss.""" 33 | states, actions, rewards, next_states, dones = experiences[:5] 34 | 35 | # G_t = r + gamma * v(s_{t+1}) if state != Terminal 36 | # = r otherwise 37 | masks = 1 - dones 38 | next_actions = self.actor_target(next_states) 39 | next_states_actions = torch.cat((next_states, next_actions), dim=-1) 40 | next_values = self.critic_target(next_states_actions) 41 | curr_returns = rewards + gamma * next_values * masks 42 | curr_returns = curr_returns.to(self.device).detach() 43 | 44 | # train critic 45 | values = self.critic(torch.cat((states, actions), dim=-1)) 46 | critic_loss_element_wise = (values - curr_returns).pow(2) 47 | 48 | return critic_loss_element_wise 49 | 50 | def update_model( 51 | self, experience: Union[TensorTuple, Tuple[TensorTuple]] 52 | ) -> TensorTuple: # type: ignore 53 | """Train the model after each episode.""" 54 | use_n_step = self.hyper_params.n_step > 1 55 | if use_n_step: 56 | experience_1, experience_n = experience 57 | else: 58 | experience_1 = experience 59 | 60 | states, actions = experience_1[:2] 61 | weights, indices, eps_d = experience_1[-3:] 62 | gamma = self.hyper_params.gamma 63 | 64 | # train critic 65 | gradient_clip_ac = self.hyper_params.gradient_clip_ac 66 | gradient_clip_cr = self.hyper_params.gradient_clip_cr 67 | 68 | critic_loss_element_wise = self._get_critic_loss(experience_1, gamma) 69 | critic_loss = torch.mean(critic_loss_element_wise * weights) 70 | 71 | if use_n_step: 72 | gamma = gamma ** self.hyper_params.n_step 73 | 74 | 
critic_loss_n_element_wise = self._get_critic_loss(experience_n, gamma) 75 | # to update loss and priorities 76 | critic_loss_element_wise += ( 77 | critic_loss_n_element_wise * self.hyper_params.lambda1 78 | ) 79 | critic_loss = torch.mean(critic_loss_element_wise * weights) 80 | 81 | self.critic_optim.zero_grad() 82 | critic_loss.backward() 83 | clip_grad_norm_(self.critic.parameters(), gradient_clip_cr) 84 | self.critic_optim.step() 85 | 86 | # train actor 87 | actions = self.actor(states) 88 | actor_loss_element_wise = -self.critic(torch.cat((states, actions), dim=-1)) 89 | actor_loss = torch.mean(actor_loss_element_wise * weights) 90 | self.actor_optim.zero_grad() 91 | actor_loss.backward() 92 | clip_grad_norm_(self.actor.parameters(), gradient_clip_ac) 93 | self.actor_optim.step() 94 | 95 | # update target networks 96 | common_utils.soft_update(self.actor, self.actor_target, self.hyper_params.tau) 97 | common_utils.soft_update(self.critic, self.critic_target, self.hyper_params.tau) 98 | 99 | # update priorities 100 | new_priorities = critic_loss_element_wise 101 | new_priorities += self.hyper_params.lambda3 * actor_loss_element_wise.pow(2) 102 | new_priorities += self.hyper_params.per_eps 103 | new_priorities = new_priorities.data.cpu().numpy().squeeze() 104 | new_priorities += eps_d 105 | 106 | return ( 107 | actor_loss.item(), 108 | critic_loss.item(), 109 | indices, 110 | new_priorities, 111 | ) 112 | -------------------------------------------------------------------------------- /rl_algorithms/common/buffer/segment_tree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Segment tree for Prioritized Replay Buffer.""" 3 | 4 | import operator 5 | from typing import Callable 6 | 7 | 8 | class SegmentTree: 9 | """Create SegmentTree. 10 | 11 | Taken from OpenAI baselines github repository: 12 | https://github.com/openai/baselines/blob/master/baselines/common/segment_tree.py 13 | 14 | Attributes: 15 | capacity (int) 16 | tree (list) 17 | operation (function) 18 | 19 | """ 20 | 21 | def __init__(self, capacity: int, operation: Callable, init_value: float): 22 | """Initialize. 23 | 24 | Args: 25 | capacity (int) 26 | operation (function) 27 | init_value (float) 28 | 29 | """ 30 | assert ( 31 | capacity > 0 and capacity & (capacity - 1) == 0 32 | ), "capacity must be positive and a power of 2."
33 | self.capacity = capacity 34 | self.tree = [init_value for _ in range(2 * capacity)] 35 | self.operation = operation 36 | 37 | def _operate_helper( 38 | self, start: int, end: int, node: int, node_start: int, node_end: int 39 | ) -> float: 40 | """Returns result of operation in segment.""" 41 | if start == node_start and end == node_end: 42 | return self.tree[node] 43 | mid = (node_start + node_end) // 2 44 | if end <= mid: 45 | return self._operate_helper(start, end, 2 * node, node_start, mid) 46 | else: 47 | if mid + 1 <= start: 48 | return self._operate_helper(start, end, 2 * node + 1, mid + 1, node_end) 49 | else: 50 | return self.operation( 51 | self._operate_helper(start, mid, 2 * node, node_start, mid), 52 | self._operate_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end), 53 | ) 54 | 55 | def operate(self, start: int = 0, end: int = 0) -> float: 56 | """Returns result of applying `self.operation`.""" 57 | if end <= 0: 58 | end += self.capacity 59 | end -= 1 60 | 61 | return self._operate_helper(start, end, 1, 0, self.capacity - 1) 62 | 63 | def __setitem__(self, idx: int, val: float): 64 | """Set value in tree.""" 65 | idx += self.capacity 66 | self.tree[idx] = val 67 | 68 | idx //= 2 69 | while idx >= 1: 70 | self.tree[idx] = self.operation(self.tree[2 * idx], self.tree[2 * idx + 1]) 71 | idx //= 2 72 | 73 | def __getitem__(self, idx: int) -> float: 74 | """Get real value in leaf node of tree.""" 75 | assert 0 <= idx < self.capacity 76 | 77 | return self.tree[self.capacity + idx] 78 | 79 | 80 | class SumSegmentTree(SegmentTree): 81 | """Create SumSegmentTree. 82 | 83 | Taken from OpenAI baselines github repository: 84 | https://github.com/openai/baselines/blob/master/baselines/common/segment_tree.py 85 | 86 | """ 87 | 88 | def __init__(self, capacity: int): 89 | """Initialize. 90 | 91 | Args: 92 | capacity (int) 93 | 94 | """ 95 | super(SumSegmentTree, self).__init__( 96 | capacity=capacity, operation=operator.add, init_value=0.0 97 | ) 98 | 99 | def sum(self, start: int = 0, end: int = 0) -> float: 100 | """Returns arr[start] + ... + arr[end].""" 101 | return super(SumSegmentTree, self).operate(start, end) 102 | 103 | def retrieve(self, upperbound: float) -> int: 104 | """Find the highest index `i` about upper bound in the tree.""" 105 | # TODO: Check assert case and fix bug 106 | assert 0 <= upperbound <= self.sum() + 1e-5, "upperbound: {}".format(upperbound) 107 | 108 | idx = 1 109 | 110 | while idx < self.capacity: # while non-leaf 111 | left = 2 * idx 112 | right = left + 1 113 | if self.tree[left] > upperbound: 114 | idx = 2 * idx 115 | else: 116 | upperbound -= self.tree[left] 117 | idx = right 118 | return idx - self.capacity 119 | 120 | 121 | class MinSegmentTree(SegmentTree): 122 | """Create SegmentTree. 123 | 124 | Taken from OpenAI baselines github repository: 125 | https://github.com/openai/baselines/blob/master/baselines/common/segment_tree.py 126 | 127 | """ 128 | 129 | def __init__(self, capacity: int): 130 | """Initialize. 
131 | 132 | Args: 133 | capacity (int) 134 | 135 | """ 136 | super(MinSegmentTree, self).__init__( 137 | capacity=capacity, operation=min, init_value=float("inf") 138 | ) 139 | 140 | def min(self, start: int = 0, end: int = 0) -> float: 141 | """Returns min(arr[start], ..., arr[end]).""" 142 | return super(MinSegmentTree, self).operate(start, end) 143 | -------------------------------------------------------------------------------- /rl_algorithms/bc/sac_learner.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | from rl_algorithms.common.abstract.learner import TensorTuple 5 | import rl_algorithms.common.helper_functions as common_utils 6 | from rl_algorithms.registry import LEARNERS 7 | from rl_algorithms.sac.learner import SACLearner 8 | 9 | 10 | @LEARNERS.register_module 11 | class BCSACLearner(SACLearner): 12 | """Learner for BCSAC Agent. 13 | 14 | Attributes: 15 | hyper_params (ConfigDict): hyper-parameters 16 | log_cfg (ConfigDict): configuration for saving log and checkpoint 17 | """ 18 | 19 | def update_model( 20 | self, experience: TensorTuple, demos: TensorTuple 21 | ) -> TensorTuple: # type: ignore 22 | """Train the model after each episode.""" 23 | self.update_step += 1 24 | 25 | states, actions, rewards, next_states, dones = experience 26 | demo_states, demo_actions, _, _, _ = demos 27 | new_actions, log_prob, pre_tanh_value, mu, std = self.actor(states) 28 | pred_actions, _, _, _, _ = self.actor(demo_states) 29 | 30 | # train alpha 31 | if self.hyper_params.auto_entropy_tuning: 32 | alpha_loss = ( 33 | -self.log_alpha * (log_prob + self.target_entropy).detach() 34 | ).mean() 35 | 36 | self.alpha_optim.zero_grad() 37 | alpha_loss.backward() 38 | self.alpha_optim.step() 39 | 40 | alpha = self.log_alpha.exp() 41 | else: 42 | alpha_loss = torch.zeros(1) 43 | alpha = self.hyper_params.w_entropy 44 | 45 | # Q function loss 46 | masks = 1 - dones 47 | states_actions = torch.cat((states, actions), dim=-1) 48 | q_1_pred = self.qf_1(states_actions) 49 | q_2_pred = self.qf_2(states_actions) 50 | v_target = self.vf_target(next_states) 51 | q_target = rewards + self.hyper_params.gamma * v_target * masks 52 | qf_1_loss = F.mse_loss(q_1_pred, q_target.detach()) 53 | qf_2_loss = F.mse_loss(q_2_pred, q_target.detach()) 54 | 55 | # V function loss 56 | states_actions = torch.cat((states, new_actions), dim=-1) 57 | v_pred = self.vf(states) 58 | q_pred = torch.min(self.qf_1(states_actions), self.qf_2(states_actions)) 59 | v_target = q_pred - alpha * log_prob 60 | vf_loss = F.mse_loss(v_pred, v_target.detach()) 61 | 62 | # update actor 63 | actor_loss = torch.zeros(1) 64 | n_qf_mask = 0 65 | if self.update_step % self.hyper_params.policy_update_freq == 0: 66 | # bc loss 67 | qf_mask = torch.gt( 68 | self.qf_1(torch.cat((demo_states, demo_actions), dim=-1)), 69 | self.qf_1(torch.cat((demo_states, pred_actions), dim=-1)), 70 | ).to(self.device) 71 | qf_mask = qf_mask.float() 72 | n_qf_mask = int(qf_mask.sum().item()) 73 | 74 | if n_qf_mask == 0: 75 | bc_loss = torch.zeros(1, device=self.device) 76 | else: 77 | bc_loss = ( 78 | torch.mul(pred_actions, qf_mask) - torch.mul(demo_actions, qf_mask) 79 | ).pow(2).sum() / n_qf_mask 80 | 81 | # actor loss 82 | advantage = q_pred - v_pred.detach() 83 | actor_loss = (alpha * log_prob - advantage).mean() 84 | actor_loss = ( 85 | self.hyper_params.lambda1 * actor_loss 86 | + self.hyper_params.lambda2 * bc_loss 87 | ) 88 | 89 | # regularization 90 | mean_reg, std_reg 
= ( 91 | self.hyper_params.w_mean_reg * mu.pow(2).mean(), 92 | self.hyper_params.w_std_reg * std.pow(2).mean(), 93 | ) 94 | pre_activation_reg = self.hyper_params.w_pre_activation_reg * ( 95 | pre_tanh_value.pow(2).sum(dim=-1).mean() 96 | ) 97 | actor_reg = mean_reg + std_reg + pre_activation_reg 98 | 99 | # actor loss + regularization 100 | actor_loss += actor_reg 101 | 102 | # train actor 103 | self.actor_optim.zero_grad() 104 | actor_loss.backward() 105 | self.actor_optim.step() 106 | 107 | # update target networks 108 | common_utils.soft_update(self.vf, self.vf_target, self.hyper_params.tau) 109 | 110 | # train Q functions 111 | self.qf_1_optim.zero_grad() 112 | qf_1_loss.backward() 113 | self.qf_1_optim.step() 114 | 115 | self.qf_2_optim.zero_grad() 116 | qf_2_loss.backward() 117 | self.qf_2_optim.step() 118 | 119 | # train V function 120 | self.vf_optim.zero_grad() 121 | vf_loss.backward() 122 | self.vf_optim.step() 123 | 124 | return ( 125 | actor_loss.item(), 126 | qf_1_loss.item(), 127 | qf_2_loss.item(), 128 | vf_loss.item(), 129 | alpha_loss.item(), 130 | n_qf_mask, 131 | ) 132 | -------------------------------------------------------------------------------- /run_pong_no_frameskip_v4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Train or test algorithms on PongNoFrameskip-v4. 3 | 4 | - Author: Curt Park 5 | - Contact: curt.park@medipixel.io 6 | """ 7 | 8 | import argparse 9 | import datetime 10 | 11 | from rl_algorithms import build_agent 12 | from rl_algorithms.common.env.atari_wrappers import atari_env_generator 13 | import rl_algorithms.common.helper_functions as common_utils 14 | from rl_algorithms.utils import YamlConfig 15 | 16 | 17 | def parse_args() -> argparse.Namespace: 18 | # configurations 19 | parser = argparse.ArgumentParser(description="Pytorch RL algorithms") 20 | parser.add_argument( 21 | "--seed", type=int, default=161, help="random seed for reproducibility" 22 | ) 23 | parser.add_argument( 24 | "--cfg-path", 25 | type=str, 26 | default="./configs/pong_no_frameskip_v4/dqn.yaml", 27 | help="config path", 28 | ) 29 | parser.add_argument( 30 | "--integration-test", 31 | dest="integration_test", 32 | action="store_true", 33 | help="for integration test", 34 | ) 35 | parser.add_argument( 36 | "--grad-cam", 37 | dest="grad_cam", 38 | action="store_true", 39 | help="test mode with viewing Grad-CAM", 40 | ) 41 | parser.add_argument( 42 | "--test", dest="test", action="store_true", help="test mode (no training)" 43 | ) 44 | parser.add_argument( 45 | "--load-from", 46 | type=str, 47 | default=None, 48 | help="load the saved model and optimizer at the beginning", 49 | ) 50 | parser.add_argument( 51 | "--off-render", dest="render", action="store_false", help="turn off rendering" 52 | ) 53 | parser.add_argument( 54 | "--render-after", 55 | type=int, 56 | default=0, 57 | help="start rendering after the input number of episode", 58 | ) 59 | parser.add_argument( 60 | "--log", dest="log", action="store_true", help="turn on logging" 61 | ) 62 | parser.add_argument("--save-period", type=int, default=20, help="save model period") 63 | parser.add_argument( 64 | "--episode-num", type=int, default=500, help="total episode num" 65 | ) 66 | parser.add_argument( 67 | "--max-episode-steps", type=int, default=None, help="max episode step" 68 | ) 69 | parser.add_argument( 70 | "--interim-test-num", type=int, default=5, help="interim test number" 71 | ) 72 | parser.add_argument( 73 | "--off-framestack", 74 | 
dest="framestack", 75 | action="store_false", 76 | help="turn off framestack", 77 | ) 78 | parser.add_argument( 79 | "--saliency-map", 80 | action="store_true", 81 | help="save saliency map", 82 | ) 83 | 84 | return parser.parse_args() 85 | 86 | 87 | def env_generator(env_name, max_episode_steps, frame_stack): 88 | def _thunk(rank: int): 89 | env = atari_env_generator(env_name, max_episode_steps, frame_stack=frame_stack) 90 | env.seed(777 + rank + 1) 91 | return env 92 | 93 | return _thunk 94 | 95 | 96 | def main(): 97 | """Main.""" 98 | args = parse_args() 99 | 100 | # env initialization 101 | env_name = "PongNoFrameskip-v4" 102 | env_gen = env_generator( 103 | env_name, args.max_episode_steps, frame_stack=args.framestack 104 | ) 105 | env = env_gen(0) 106 | 107 | # set a random seed 108 | common_utils.set_random_seed(args.seed, env) 109 | 110 | # run 111 | NOWTIMES = datetime.datetime.now() 112 | curr_time = NOWTIMES.strftime("%y%m%d_%H%M%S") 113 | 114 | cfg = YamlConfig(dict(agent=args.cfg_path)).get_config_dict() 115 | 116 | # If running integration test, simplify experiment 117 | if args.integration_test: 118 | cfg = common_utils.set_cfg_for_intergration_test(cfg) 119 | 120 | env_info = dict( 121 | name=env.spec.id, 122 | observation_space=env.observation_space, 123 | action_space=env.action_space, 124 | is_atari=True, 125 | env_generator=env_gen, 126 | ) 127 | log_cfg = dict(agent=cfg.agent.type, curr_time=curr_time, cfg_path=args.cfg_path) 128 | build_args = dict( 129 | env=env, 130 | env_info=env_info, 131 | log_cfg=log_cfg, 132 | is_test=args.test, 133 | load_from=args.load_from, 134 | is_render=args.render, 135 | render_after=args.render_after, 136 | is_log=args.log, 137 | save_period=args.save_period, 138 | episode_num=args.episode_num, 139 | max_episode_steps=env.spec.max_episode_steps, 140 | interim_test_num=args.interim_test_num, 141 | ) 142 | agent = build_agent(cfg.agent, build_args) 143 | 144 | if not args.test: 145 | agent.train() 146 | elif args.test and args.grad_cam: 147 | agent.test_with_gradcam() 148 | elif args.test and args.saliency_map: 149 | agent.test_with_saliency_map() 150 | else: 151 | agent.test() 152 | 153 | 154 | if __name__ == "__main__": 155 | main() 156 | -------------------------------------------------------------------------------- /rl_algorithms/fd/sac_learner.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torch 4 | 5 | from rl_algorithms.common.abstract.learner import TensorTuple 6 | import rl_algorithms.common.helper_functions as common_utils 7 | from rl_algorithms.registry import LEARNERS 8 | from rl_algorithms.sac.learner import SACLearner 9 | 10 | 11 | @LEARNERS.register_module 12 | class SACfDLearner(SACLearner): 13 | """Learner for BCSAC Agent.""" 14 | 15 | # pylint: disable=too-many-statements 16 | def update_model(self, experience: Tuple[TensorTuple, ...]) -> TensorTuple: 17 | """Train the model after each episode.""" 18 | use_n_step = self.hyper_params.n_step > 1 19 | 20 | if use_n_step: 21 | experience_1, experience_n = experience 22 | else: 23 | experience_1 = experience 24 | 25 | states, actions, rewards, next_states, dones = experience_1[:-3] 26 | weights, indices, eps_d = experience_1[-3:] 27 | new_actions, log_prob, pre_tanh_value, mu, std = self.actor(states) 28 | 29 | # train alpha 30 | if self.hyper_params.auto_entropy_tuning: 31 | alpha_loss = torch.mean( 32 | (-self.log_alpha * (log_prob + self.target_entropy).detach()) * weights 33 | ) 34 | 35 | 
self.alpha_optim.zero_grad() 36 | alpha_loss.backward() 37 | self.alpha_optim.step() 38 | 39 | alpha = self.log_alpha.exp() 40 | else: 41 | alpha_loss = torch.zeros(1) 42 | alpha = self.hyper_params.w_entropy 43 | 44 | # Q function loss 45 | masks = 1 - dones 46 | gamma = self.hyper_params.gamma 47 | states_actions = torch.cat((states, actions), dim=-1) 48 | q_1_pred = self.qf_1(states_actions) 49 | q_2_pred = self.qf_2(states_actions) 50 | v_target = self.vf_target(next_states) 51 | q_target = rewards + self.hyper_params.gamma * v_target * masks 52 | qf_1_loss = torch.mean((q_1_pred - q_target.detach()).pow(2) * weights) 53 | qf_2_loss = torch.mean((q_2_pred - q_target.detach()).pow(2) * weights) 54 | 55 | if use_n_step: 56 | _, _, rewards, next_states, dones = experience_n 57 | 58 | gamma = gamma ** self.hyper_params.n_step 59 | masks = 1 - dones 60 | 61 | v_target = self.vf_target(next_states) 62 | q_target = rewards + gamma * v_target * masks 63 | qf_1_loss_n = torch.mean((q_1_pred - q_target.detach()).pow(2) * weights) 64 | qf_2_loss_n = torch.mean((q_2_pred - q_target.detach()).pow(2) * weights) 65 | 66 | # to update loss and priorities 67 | qf_1_loss = qf_1_loss + qf_1_loss_n * self.hyper_params.lambda1 68 | qf_2_loss = qf_2_loss + qf_2_loss_n * self.hyper_params.lambda1 69 | 70 | # V function loss 71 | states_actions = torch.cat((states, new_actions), dim=-1) 72 | v_pred = self.vf(states) 73 | q_pred = torch.min(self.qf_1(states_actions), self.qf_2(states_actions)) 74 | v_target = (q_pred - alpha * log_prob).detach() 75 | vf_loss_element_wise = (v_pred - v_target).pow(2) 76 | vf_loss = torch.mean(vf_loss_element_wise * weights) 77 | 78 | # actor loss 79 | advantage = q_pred - v_pred.detach() 80 | actor_loss_element_wise = alpha * log_prob - advantage 81 | actor_loss = torch.mean(actor_loss_element_wise * weights) 82 | 83 | # regularization 84 | mean_reg = self.hyper_params.w_mean_reg * mu.pow(2).mean() 85 | std_reg = self.hyper_params.w_std_reg * std.pow(2).mean() 86 | pre_activation_reg = self.hyper_params.w_pre_activation_reg * ( 87 | pre_tanh_value.pow(2).sum(dim=-1).mean() 88 | ) 89 | actor_reg = mean_reg + std_reg + pre_activation_reg 90 | 91 | # actor loss + regularization 92 | actor_loss += actor_reg 93 | 94 | # train actor 95 | self.actor_optim.zero_grad() 96 | actor_loss.backward() 97 | self.actor_optim.step() 98 | 99 | # update target networks 100 | common_utils.soft_update(self.vf, self.vf_target, self.hyper_params.tau) 101 | 102 | # update priorities 103 | new_priorities = vf_loss_element_wise 104 | new_priorities += self.hyper_params.lambda3 * actor_loss_element_wise.pow(2) 105 | new_priorities += self.hyper_params.per_eps 106 | new_priorities = new_priorities.data.cpu().numpy().squeeze() 107 | new_priorities += eps_d 108 | 109 | # train Q functions 110 | self.qf_1_optim.zero_grad() 111 | qf_1_loss.backward() 112 | self.qf_1_optim.step() 113 | 114 | self.qf_2_optim.zero_grad() 115 | qf_2_loss.backward() 116 | self.qf_2_optim.step() 117 | 118 | # train V function 119 | self.vf_optim.zero_grad() 120 | vf_loss.backward() 121 | self.vf_optim.step() 122 | 123 | return ( 124 | actor_loss.item(), 125 | qf_1_loss.item(), 126 | qf_2_loss.item(), 127 | vf_loss.item(), 128 | alpha_loss.item(), 129 | indices, 130 | new_priorities, 131 | ) 132 | -------------------------------------------------------------------------------- /rl_algorithms/common/grad_cam.py: -------------------------------------------------------------------------------- 1 | """Grad-CAM class for analyzing CNN 
network. 2 | 3 | - Author: Kyunghwan Kim 4 | - Contact: kh.kim@medipixel.io 5 | - Paper: https://arxiv.org/pdf/1610.02391v1.pdf 6 | - Reference: https://github.com/RRoundTable/XAI 7 | """ 8 | 9 | from collections import OrderedDict 10 | from typing import Callable 11 | 12 | import numpy as np 13 | import torch 14 | import torch.nn as nn 15 | from torch.nn import functional as F 16 | 17 | 18 | # pylint: disable=attribute-defined-outside-init 19 | class CAMBaseWrapper: 20 | """Base Wrapping module for CAM.""" 21 | 22 | def __init__(self, model: nn.Module): 23 | """Initialize.""" 24 | super(CAMBaseWrapper, self).__init__() 25 | self.device = next(model.parameters()).device 26 | self.model = model 27 | self.handlers = [] # a set of hook function handlers 28 | 29 | def _encode_one_hot(self, ids: torch.Tensor) -> torch.Tensor: 30 | """Convert input to one-hot.""" 31 | one_hot = torch.zeros_like(self.logits).to(self.device) 32 | one_hot[0][ids] = 1 33 | return one_hot 34 | 35 | def forward(self, image: torch.Tensor) -> torch.Tensor: 36 | """ 37 | Simple classification 38 | """ 39 | self.model.zero_grad() 40 | self.logits = self.model(image) 41 | return self.logits 42 | 43 | def backward(self, ids: torch.Tensor) -> torch.Tensor: 44 | """ 45 | Class-specific backpropagation. 46 | Either way works: 47 | 1. self.logits.backward(gradient=one_hot, retain_graph=True) 48 | 2. (self.logits * one_hot).sum().backward(retain_graph=True) 49 | """ 50 | 51 | one_hot = self._encode_one_hot(ids) 52 | self.logits.backward(gradient=one_hot, retain_graph=True) 53 | 54 | def generate(self, target_layer: str): 55 | raise NotImplementedError 56 | 57 | def remove_hook(self): 58 | """ 59 | Remove all the forward/backward hook functions 60 | """ 61 | for handle in self.handlers: 62 | handle.remove() 63 | 64 | 65 | # pylint: disable=attribute-defined-outside-init 66 | class GradCAM(CAMBaseWrapper): 67 | """ 68 | "Grad-CAM: Visual Explanations from Deep Networks via Gradient-based Localization" 69 | https://arxiv.org/pdf/1610.02391.pdf 70 | Look at Figure 2 on page 4 71 | """ 72 | 73 | def __init__(self, model: nn.Module, candidate_layers: list = None): 74 | """Initialize.""" 75 | super(GradCAM, self).__init__(model) 76 | self.fmap_pool = OrderedDict() 77 | self.grad_pool = OrderedDict() 78 | self.candidate_layers = candidate_layers # list 79 | 80 | def forward_hook(key: str) -> Callable: 81 | def forward_hook_(_, __, output: torch.Tensor): 82 | # Save featuremaps 83 | self.fmap_pool[key] = output.detach() 84 | 85 | return forward_hook_ 86 | 87 | def backward_hook(key: str) -> Callable: 88 | def backward_hook_(_, __, grad_out: tuple): 89 | # Save the gradients corresponding to the featuremaps 90 | self.grad_pool[key] = grad_out[0].detach() 91 | 92 | return backward_hook_ 93 | 94 | # If no candidate layers are specified, the hooks are registered to all the layers.
95 | for name, module in self.model.named_modules(): 96 | print(name, module) 97 | if self.candidate_layers is None or name in self.candidate_layers: 98 | self.handlers.append(module.register_forward_hook(forward_hook(name))) 99 | self.handlers.append(module.register_backward_hook(backward_hook(name))) 100 | 101 | @staticmethod 102 | def _find(pool: OrderedDict, target_layer: str) -> torch.Tensor: 103 | """Get designated layer from model.""" 104 | if target_layer in pool.keys(): 105 | return pool[target_layer] 106 | else: 107 | raise ValueError("Invalid layer name: {}".format(target_layer)) 108 | 109 | @staticmethod 110 | def _compute_grad_weights(grads: torch.Tensor) -> torch.Tensor: 111 | """Compute gradient weight with average pooling.""" 112 | return F.adaptive_avg_pool2d(grads, 1) 113 | 114 | def forward(self, image: np.ndarray) -> torch.Tensor: 115 | """Forward method implementation.""" 116 | self.image_shape = image.shape[1:] 117 | return super(GradCAM, self).forward(image) 118 | 119 | def generate(self, target_layer: str) -> torch.Tensor: 120 | """Generate feature map of target layer with Grad-CAM.""" 121 | fmaps = self._find(self.fmap_pool, target_layer) 122 | grads = self._find(self.grad_pool, target_layer) 123 | weights = self._compute_grad_weights(grads) 124 | 125 | gcam = torch.mul(fmaps, weights).sum(dim=1, keepdim=True) 126 | gcam = F.relu(gcam) 127 | 128 | gcam = F.interpolate( 129 | gcam, self.image_shape, mode="bilinear", align_corners=False 130 | ) 131 | 132 | B, C, H, W = gcam.shape 133 | gcam = gcam.view(B, -1) 134 | gcam -= gcam.min(dim=1, keepdim=True)[0] 135 | gcam /= gcam.max(dim=1, keepdim=True)[0] + 1e-7 136 | gcam = gcam.view(B, C, H, W) 137 | 138 | return gcam 139 | -------------------------------------------------------------------------------- /rl_algorithms/dqn/distributed_worker.py: -------------------------------------------------------------------------------- 1 | """DQN worker for distributed training. 2 | 3 | - Author: Chris Yoon 4 | - Contact: chris.yoon@medipixel.io 5 | """ 6 | 7 | from collections import OrderedDict 8 | from typing import Dict, Tuple 9 | 10 | import numpy as np 11 | import torch 12 | 13 | from rl_algorithms.common.abstract.distributed_worker import DistributedWorker 14 | from rl_algorithms.common.helper_functions import numpy2floattensor 15 | from rl_algorithms.common.networks.brain import Brain 16 | from rl_algorithms.registry import WORKERS, build_loss 17 | from rl_algorithms.utils.config import ConfigDict 18 | 19 | 20 | @WORKERS.register_module 21 | class DQNWorker(DistributedWorker): 22 | """DQN worker for distributed training. 
23 | 24 | Attributes: 25 | backbone (ConfigDict): backbone configs for building network 26 | head (ConfigDict): head configs for building network 27 | state_dict (OrderedDict): initial network state dict received from learner 28 | device (str): literal to indicate cpu/cuda use 29 | 30 | """ 31 | 32 | def __init__( 33 | self, 34 | rank: int, 35 | device: str, 36 | hyper_params: ConfigDict, 37 | env_name: str, 38 | is_atari: bool, 39 | max_episode_steps: int, 40 | loss_type: ConfigDict, 41 | state_dict: OrderedDict, 42 | backbone: ConfigDict, 43 | head: ConfigDict, 44 | state_size: int, 45 | output_size: int, 46 | ): 47 | DistributedWorker.__init__( 48 | self, rank, device, hyper_params, env_name, is_atari, max_episode_steps 49 | ) 50 | 51 | self.loss_fn = build_loss(loss_type) 52 | self.backbone_cfg = backbone 53 | self.head_cfg = head 54 | self.head_cfg.configs.state_size = state_size 55 | self.head_cfg.configs.output_size = output_size 56 | 57 | self.use_n_step = self.hyper_params.n_step > 1 58 | 59 | self.max_epsilon = self.hyper_params.max_epsilon 60 | self.min_epsilon = self.hyper_params.min_epsilon 61 | self.epsilon = self.hyper_params.max_epsilon 62 | 63 | self._init_networks(state_dict) 64 | 65 | # pylint: disable=attribute-defined-outside-init 66 | def _init_networks(self, state_dict: OrderedDict): 67 | """Initialize DQN policy with learner state dict.""" 68 | self.dqn = Brain(self.backbone_cfg, self.head_cfg).to(self.device) 69 | self.dqn.load_state_dict(state_dict) 70 | self.dqn.eval() 71 | 72 | def load_params(self, path: str): 73 | """Load model and optimizer parameters.""" 74 | DistributedWorker.load_params(self, path) 75 | 76 | params = torch.load(path) 77 | self.dqn.load_state_dict(params["dqn_state_dict"]) 78 | print("[INFO] loaded the model and optimizer from", path) 79 | 80 | def select_action(self, state: np.ndarray) -> np.ndarray: 81 | """Select an action from the input space.""" 82 | # epsilon greedy policy 83 | # pylint: disable=comparison-with-callable 84 | if self.epsilon > np.random.random(): 85 | selected_action = np.array(self.env.action_space.sample()) 86 | else: 87 | with torch.no_grad(): 88 | state = self._preprocess_state(state, self.device) 89 | selected_action = self.dqn(state).argmax() 90 | selected_action = selected_action.cpu().numpy() 91 | 92 | # Decay epsilon 93 | self.epsilon = max( 94 | self.epsilon 95 | - (self.max_epsilon - self.min_epsilon) * self.hyper_params.epsilon_decay, 96 | self.min_epsilon, 97 | ) 98 | 99 | return selected_action 100 | 101 | def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool, dict]: 102 | """Take an action and return the response of the env.""" 103 | next_state, reward, done, info = self.env.step(action) 104 | return next_state, reward, done, info 105 | 106 | def compute_priorities(self, memory: Dict[str, np.ndarray]) -> np.ndarray: 107 | """Compute initial priority values of experiences in local memory.""" 108 | states = numpy2floattensor(memory["states"], self.device) 109 | actions = numpy2floattensor(memory["actions"], self.device).long() 110 | rewards = numpy2floattensor(memory["rewards"].reshape(-1, 1), self.device) 111 | next_states = numpy2floattensor(memory["next_states"], self.device) 112 | dones = numpy2floattensor(memory["dones"].reshape(-1, 1), self.device) 113 | memory_tensors = (states, actions, rewards, next_states, dones) 114 | 115 | with torch.no_grad(): 116 | dq_loss_element_wise, _ = self.loss_fn( 117 | self.dqn, 118 | self.dqn, 119 | memory_tensors, 120 | self.hyper_params.gamma, 121
| self.head_cfg, 122 | ) 123 | loss_for_prior = dq_loss_element_wise.detach().cpu().numpy() 124 | new_priorities = loss_for_prior + self.hyper_params.per_eps 125 | return new_priorities 126 | 127 | def synchronize(self, new_state_dict: Dict[str, np.ndarray]): 128 | """Synchronize worker dqn with learner dqn.""" 129 | self._synchronize(self.dqn, new_state_dict) 130 | -------------------------------------------------------------------------------- /rl_algorithms/common/apex/learner.py: -------------------------------------------------------------------------------- 1 | """Learner Wrapper to enable Ape-X distributed training. 2 | 3 | - Author: Chris Yoon 4 | - Contact: chris.yoon@medipixel.io 5 | """ 6 | 7 | from typing import Dict, List 8 | 9 | import numpy as np 10 | import pyarrow as pa 11 | import ray 12 | import zmq 13 | 14 | from rl_algorithms.common.abstract.learner import DistributedLearnerWrapper, Learner 15 | from rl_algorithms.common.helper_functions import numpy2floattensor, state_dict2numpy 16 | from rl_algorithms.utils.config import ConfigDict 17 | 18 | 19 | @ray.remote(num_gpus=1) 20 | class ApeXLearnerWrapper(DistributedLearnerWrapper): 21 | """Learner Wrapper to enable Ape-X distributed training. 22 | 23 | Attributes: 24 | learner (Learner): learner 25 | comm_cfg (ConfigDict): configs for communication 26 | update_step (int): counts update steps 27 | pub_socket (zmq.Socket): publisher socket for broadcasting params 28 | rep_socket (zmq.Socket): reply socket for receiving replay data & sending new priorities 29 | push_socket (zmq.Socket): push socket for sending log info to logger 30 | max_update_step (int): maximum update steps per run 31 | worker_update_interval (int): num update steps between worker synchronization 32 | logger_interval (int): num update steps between logging 33 | 34 | """ 35 | 36 | def __init__(self, learner: Learner, comm_cfg: ConfigDict): 37 | """Initialize.""" 38 | DistributedLearnerWrapper.__init__(self, learner, comm_cfg) 39 | self.update_step = 0 40 | self.max_update_step = self.learner.hyper_params.max_update_step 41 | self.worker_update_interval = self.learner.hyper_params.worker_update_interval 42 | self.logger_interval = self.learner.hyper_params.logger_interval 43 | 44 | # NOTE: disable because learner uses preprocessed n_step experience 45 | self.learner.use_n_step = False 46 | 47 | # pylint: disable=attribute-defined-outside-init 48 | def init_communication(self): 49 | """Initialize sockets for communication.""" 50 | ctx = zmq.Context() 51 | # Socket to send updated network parameters to worker 52 | self.pub_socket = ctx.socket(zmq.PUB) 53 | self.pub_socket.setsockopt(zmq.SNDHWM, 2) 54 | self.pub_socket.bind(f"tcp://127.0.0.1:{self.comm_cfg.learner_worker_port}") 55 | 56 | # Socket to receive replay data and send new priorities to buffer 57 | self.rep_socket = ctx.socket(zmq.REP) 58 | self.rep_socket.bind(f"tcp://127.0.0.1:{self.comm_cfg.learner_buffer_port}") 59 | 60 | # Socket to send logging data to logger 61 | self.push_socket = ctx.socket(zmq.PUSH) 62 | self.push_socket.connect(f"tcp://127.0.0.1:{self.comm_cfg.learner_logger_port}") 63 | 64 | def recv_replay_data(self): 65 | """Receive replay data from global buffer.""" 66 | replay_data_id = self.rep_socket.recv() 67 | replay_data = pa.deserialize(replay_data_id) 68 | return replay_data 69 | 70 | def send_new_priorities(self, indices: np.ndarray, priorities: np.ndarray): 71 | """Send new priority values and corresponding indices to buffer.""" 72 | new_priors = [indices, priorities] 73 | new_priors_id =
pa.serialize(new_priors).to_buffer() 74 | self.rep_socket.send(new_priors_id) 75 | 76 | def publish_params(self, update_step: int, np_state_dict: Dict[str, np.ndarray]): 77 | """Broadcast updated params to all workers.""" 78 | param_info = [update_step, np_state_dict] 79 | new_params_id = pa.serialize(param_info).to_buffer() 80 | self.pub_socket.send(new_params_id) 81 | 82 | def send_info_to_logger( 83 | self, 84 | np_state_dict: List[np.ndarray], 85 | step_info: list, 86 | ): 87 | """Send new params and log info to logger.""" 88 | log_value = dict(update_step=self.update_step, step_info=step_info) 89 | log_info = dict(log_value=log_value, state_dict=np_state_dict) 90 | log_info_id = pa.serialize(log_info).to_buffer() 91 | self.push_socket.send(log_info_id) 92 | 93 | def run(self): 94 | """Run main training loop.""" 95 | self.telapsed = 0 96 | while self.update_step < self.max_update_step: 97 | replay_data = self.recv_replay_data() 98 | if replay_data is not None: 99 | replay_data = ( 100 | numpy2floattensor(replay_data[:6], self.learner.device) 101 | + replay_data[6:] 102 | ) 103 | info = self.update_model(replay_data) 104 | indices, new_priorities = info[-2:] 105 | step_info = info[:-2] 106 | self.update_step = self.update_step + 1 107 | 108 | self.send_new_priorities(indices, new_priorities) 109 | 110 | if self.update_step % self.worker_update_interval == 0: 111 | state_dict = self.get_state_dict() 112 | np_state_dict = state_dict2numpy(state_dict) 113 | self.publish_params(self.update_step, np_state_dict) 114 | 115 | if self.update_step % self.logger_interval == 0: 116 | state_dict = self.get_state_dict() 117 | np_state_dict = state_dict2numpy(state_dict) 118 | self.send_info_to_logger(np_state_dict, step_info) 119 | self.learner.save_params(self.update_step) 120 | -------------------------------------------------------------------------------- /rl_algorithms/common/abstract/learner.py: -------------------------------------------------------------------------------- 1 | """Base Learner & LearnerWrapper class. 2 | 3 | - Author: Chris Yoon 4 | - Contact: chris.yoon@medipixel.io 5 | """ 6 | 7 | from abc import ABC, abstractmethod 8 | from collections import OrderedDict 9 | import os 10 | import shutil 11 | from typing import Tuple, Union 12 | 13 | import torch 14 | import torch.nn as nn 15 | 16 | from rl_algorithms.utils.config import ConfigDict 17 | 18 | TensorTuple = Tuple[torch.Tensor, ...] 19 | 20 | 21 | class BaseLearner(ABC): 22 | """Abstract class for all learner objects.""" 23 | 24 | @abstractmethod 25 | def update_model(self, experience: Union[TensorTuple, Tuple[TensorTuple]]) -> tuple: 26 | pass 27 | 28 | @abstractmethod 29 | def save_params(self, n_episode: int): 30 | pass 31 | 32 | @abstractmethod 33 | def load_params(self, path: str): 34 | if not os.path.exists(path): 35 | raise Exception( 36 | f"[ERROR] the input path does not exist. Wrong path: {path}" 37 | ) 38 | 39 | @abstractmethod 40 | def get_state_dict(self) -> Union[OrderedDict, Tuple[OrderedDict]]: 41 | pass 42 | 43 | 44 | class Learner(BaseLearner): 45 | """Base class for all base learners. 
46 | 47 | Attributes: 48 | args (argparse.Namespace): arguments including hyperparameters and training settings 49 | hyper_params (ConfigDict): hyper-parameters 50 | log_cfg (ConfigDict): configuration for saving log 51 | sha (str): sha code of current git commit 52 | 53 | """ 54 | 55 | def __init__( 56 | self, 57 | hyper_params: ConfigDict, 58 | log_cfg: ConfigDict, 59 | env_name: str, 60 | is_test: bool, 61 | ): 62 | """Initialize.""" 63 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 64 | self.hyper_params = hyper_params 65 | 66 | if not is_test: 67 | self.ckpt_path = ( 68 | f"./checkpoint/{env_name}/{log_cfg.agent}/{log_cfg.curr_time}" 69 | ) 70 | os.makedirs(self.ckpt_path, exist_ok=True) 71 | 72 | # save configuration 73 | shutil.copy(log_cfg.cfg_path, os.path.join(self.ckpt_path, "config.yaml")) 74 | 75 | @abstractmethod 76 | def _init_network(self): 77 | pass 78 | 79 | @abstractmethod 80 | def update_model(self, experience: Union[TensorTuple, Tuple[TensorTuple]]) -> tuple: 81 | pass 82 | 83 | @abstractmethod 84 | def save_params(self, n_episode: int): 85 | pass 86 | 87 | def _save_params(self, params: dict, n_episode: int): 88 | """Save parameters of networks.""" 89 | os.makedirs(self.ckpt_path, exist_ok=True) 90 | 91 | path = os.path.join(self.ckpt_path, f"ep_{str(n_episode)}.pt") 92 | torch.save(params, path) 93 | 94 | print(f"[INFO] Saved the model and optimizer to {path} \n") 95 | 96 | @abstractmethod 97 | def load_params(self, path: str): 98 | if not os.path.exists(path): 99 | raise Exception( 100 | f"[ERROR] the input path does not exist. Wrong path: {path}" 101 | ) 102 | 103 | @abstractmethod 104 | def get_state_dict(self) -> Union[OrderedDict, Tuple[OrderedDict]]: 105 | pass 106 | 107 | @abstractmethod 108 | def get_policy(self) -> nn.Module: 109 | pass 110 | 111 | 112 | class LearnerWrapper(BaseLearner): 113 | """Base class for all learner wrappers.""" 114 | 115 | def __init__(self, learner: BaseLearner): 116 | """Initialize.""" 117 | self.learner = learner 118 | 119 | def update_model(self, experience: Union[TensorTuple, Tuple[TensorTuple]]) -> tuple: 120 | return self.learner.update_model(experience) 121 | 122 | def save_params(self, n_episode: int): 123 | self.learner.save_params(n_episode) 124 | 125 | def load_params(self, path: str): 126 | self.learner.load_params(path) 127 | 128 | def get_state_dict(self) -> Union[OrderedDict, Tuple[OrderedDict]]: 129 | return self.learner.get_state_dict() 130 | 131 | 132 | class DistributedLearnerWrapper(LearnerWrapper): 133 | """Base wrapper class for distributed learners. 
134 | 135 | Attributes: 136 | learner (Learner): learner 137 | comm_cfg (ConfigDict): configs for communication 138 | 139 | """ 140 | 141 | def __init__(self, learner: Learner, comm_cfg: ConfigDict): 142 | LearnerWrapper.__init__(self, learner) 143 | self.comm_cfg = comm_cfg 144 | 145 | @abstractmethod 146 | def init_communication(self): 147 | pass 148 | 149 | def update_model(self, experience: Union[TensorTuple, Tuple[TensorTuple]]) -> tuple: 150 | """Run one step of learner model update.""" 151 | return self.learner.update_model(experience) 152 | 153 | def save_params(self, n_update_step: int): 154 | """Save learner params at defined directory.""" 155 | self.learner.save_params(n_update_step) 156 | 157 | def load_params(self, path: str): 158 | """Load params at start.""" 159 | self.learner.load_params(path) 160 | 161 | def get_policy(self): 162 | """Return model (policy) used for action selection, used only in grad cam.""" 163 | return self.learner.get_policy() 164 | 165 | def get_state_dict(self): 166 | """Return state dicts.""" 167 | return self.learner.get_state_dict() 168 | 169 | @abstractmethod 170 | def run(self): 171 | pass 172 | -------------------------------------------------------------------------------- /rl_algorithms/bc/her.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """HER class and reward function for Behavior Cloning. 3 | 4 | - Author: Kyunghwan Kim 5 | - Contact: kh.kim@medipixel.io 6 | """ 7 | from typing import Callable, Tuple 8 | 9 | import numpy as np 10 | 11 | from rl_algorithms.common.abstract.her import HER 12 | from rl_algorithms.common.abstract.reward_fn import RewardFn 13 | from rl_algorithms.registry import HERS 14 | 15 | 16 | class L1DistanceRewardFn(RewardFn): 17 | def __call__(self, transition: tuple, goal_state: np.ndarray) -> np.float64: 18 | """L1 Distance reward function.""" 19 | next_state = transition[3] 20 | eps = 1e-6 21 | if np.abs(next_state - goal_state).sum() < eps: 22 | return np.float64(0.0) 23 | else: 24 | return np.float64(-1.0) 25 | 26 | 27 | l1_distance_reward_fn = L1DistanceRewardFn() 28 | 29 | 30 | @HERS.register_module 31 | class LunarLanderContinuousHER(HER): 32 | """HER for LunarLanderContinuous-v2 environment. 
33 | 34 | Attributes: 35 | demo_goal_indices (np.ndarray): indices about goal of demo list 36 | desired_states (np.ndarray): desired states from demonstration 37 | 38 | """ 39 | 40 | def __init__( 41 | self, 42 | reward_fn: Callable[[tuple, np.ndarray], np.float64] = l1_distance_reward_fn, 43 | ): 44 | """Initialize.""" 45 | HER.__init__(self, reward_fn=reward_fn) 46 | self.is_goal_in_state = False 47 | 48 | # pylint: disable=attribute-defined-outside-init 49 | def fetch_desired_states_from_demo(self, demo: list): 50 | """Return desired goal states from demonstration data.""" 51 | np_demo: np.ndarray = np.array(demo) 52 | self.demo_goal_indices: np.ndarray = np.where(np_demo[:, 4])[0] 53 | self.desired_states: np.ndarray = np_demo[self.demo_goal_indices][:, 0] 54 | 55 | def get_desired_state(self, *args) -> np.ndarray: 56 | """Sample one of the desired states.""" 57 | return np.random.choice(self.desired_states, 1).item() 58 | 59 | def _get_final_state(self, transition: tuple) -> np.ndarray: 60 | """Get final state from transitions for making HER transitions.""" 61 | return transition[0] 62 | 63 | def generate_demo_transitions(self, demo: list) -> list: 64 | """Return generated demo transitions for HER.""" 65 | new_demo: list = list() 66 | 67 | # generate demo transitions 68 | prev_idx = 0 69 | for idx in self.demo_goal_indices: 70 | demo_final_state = self._get_final_state(demo[idx]) 71 | transitions = [demo[i] for i in range(prev_idx, idx + 1)] 72 | prev_idx = idx + 1 73 | 74 | transitions = self.generate_transitions( 75 | transitions, demo_final_state, 0, is_demo=True 76 | ) 77 | 78 | new_demo.extend(transitions) 79 | 80 | return new_demo 81 | 82 | 83 | class ReacherRewardFn(RewardFn): 84 | def __call__(self, transition: tuple, _) -> np.float64: 85 | """Reward function for Reacher-v2 environment.""" 86 | state, action = transition[0:2] 87 | diff_vec = state[-3:] 88 | reward_dist = -1 * np.linalg.norm(diff_vec) 89 | reward_ctrl = -np.square(action).sum() 90 | 91 | return reward_dist + reward_ctrl 92 | 93 | 94 | reacher_reward_fn = ReacherRewardFn() 95 | 96 | 97 | @HERS.register_module 98 | class ReacherHER(HER): 99 | """HER for Reacher-v2 environment.""" 100 | 101 | def __init__( 102 | self, reward_fn: Callable[[tuple, np.ndarray], np.float64] = reacher_reward_fn 103 | ): 104 | """Initialize.""" 105 | HER.__init__(self, reward_fn=reward_fn) 106 | self.is_goal_in_state = True 107 | 108 | def fetch_desired_states_from_demo(self, _: list): 109 | """Return desired goal states from demonstration data. 110 | 111 | DO NOT use this method because demo states have a goal position. 112 | """ 113 | raise Exception("Do not use this method.") 114 | 115 | def get_desired_state(self, *args) -> np.ndarray: 116 | """Sample one of the desired states. 117 | 118 | Returns an empty array since demo states have a goal position. 119 | """ 120 | return np.array([]) 121 | 122 | def _get_final_state(self, transition_final: tuple) -> np.ndarray: 123 | """Get a finger-tip position from the final transition.""" 124 | return transition_final[0][8:10] + transition_final[0][2:4] 125 | 126 | def generate_demo_transitions(self, demo: list) -> list: 127 | """Return generated demo transitions for HER. 128 | 129 | Works as an identity function in this class. 
130 | """ 131 | return demo 132 | 133 | def _append_origin_transitions( 134 | self, origin_transitions: list, transition: tuple, _: np.ndarray 135 | ): 136 | """Append original transitions for training.""" 137 | origin_transitions.append(transition) 138 | 139 | def _get_transition( 140 | self, transition: tuple, goal_state: np.ndarray 141 | ) -> Tuple[np.ndarray, np.ndarray, np.float64, np.ndarray, bool]: 142 | """Get a single transition concatenated with a goal state.""" 143 | state, action, _, next_state, done = transition 144 | 145 | reward = self.reward_fn(transition, goal_state) 146 | state_ = state 147 | state_[4:6] = goal_state 148 | next_state_ = next_state 149 | next_state_[4:6] = goal_state 150 | 151 | return state_, action, reward, next_state_, done 152 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/learner.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from typing import Tuple 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.nn.utils import clip_grad_norm_ 8 | import torch.optim as optim 9 | 10 | from rl_algorithms.common.abstract.learner import Learner, TensorTuple 11 | from rl_algorithms.common.helper_functions import numpy2floattensor 12 | from rl_algorithms.common.networks.brain import Brain 13 | from rl_algorithms.registry import LEARNERS 14 | from rl_algorithms.utils.config import ConfigDict 15 | 16 | 17 | @LEARNERS.register_module 18 | class A2CLearner(Learner): 19 | """Learner for A2C Agent. 20 | 21 | Attributes: 22 | hyper_params (ConfigDict): hyper-parameters 23 | log_cfg (ConfigDict): configuration for saving log and checkpoint 24 | actor (nn.Module): actor model to select actions 25 | critic (nn.Module): critic model to predict state values 26 | actor_optim (Optimizer): optimizer for training actor 27 | critic_optim (Optimizer): optimizer for training critic 28 | 29 | """ 30 | 31 | def __init__( 32 | self, 33 | hyper_params: ConfigDict, 34 | log_cfg: ConfigDict, 35 | backbone: ConfigDict, 36 | head: ConfigDict, 37 | optim_cfg: ConfigDict, 38 | env_name: str, 39 | state_size: tuple, 40 | output_size: int, 41 | is_test: bool, 42 | load_from: str, 43 | ): 44 | Learner.__init__(self, hyper_params, log_cfg, env_name, is_test) 45 | 46 | self.backbone_cfg = backbone 47 | self.head_cfg = head 48 | self.head_cfg.actor.configs.state_size = ( 49 | self.head_cfg.critic.configs.state_size 50 | ) = state_size 51 | self.head_cfg.actor.configs.output_size = output_size 52 | self.optim_cfg = optim_cfg 53 | self.load_from = load_from 54 | 55 | self._init_network() 56 | 57 | def _init_network(self): 58 | """Initialize networks and optimizers.""" 59 | self.actor = Brain(self.backbone_cfg.actor, self.head_cfg.actor).to(self.device) 60 | self.critic = Brain(self.backbone_cfg.critic, self.head_cfg.critic).to( 61 | self.device 62 | ) 63 | 64 | # create optimizer 65 | self.actor_optim = optim.Adam( 66 | self.actor.parameters(), 67 | lr=self.optim_cfg.lr_actor, 68 | weight_decay=self.optim_cfg.weight_decay, 69 | ) 70 | 71 | self.critic_optim = optim.Adam( 72 | self.critic.parameters(), 73 | lr=self.optim_cfg.lr_critic, 74 | weight_decay=self.optim_cfg.weight_decay, 75 | ) 76 | 77 | if self.load_from is not None: 78 | self.load_params(self.load_from) 79 | 80 | def update_model(self, experience: TensorTuple) -> TensorTuple: 81 | """Update A2C actor and critic networks""" 82 | 83 | log_prob, pred_value, 
next_state, reward, done = experience 84 | next_state = numpy2floattensor(next_state, self.device) 85 | 86 | # Q_t = r + gamma * V(s_{t+1}) if state != Terminal 87 | # = r otherwise 88 | mask = 1 - done 89 | next_value = self.critic(next_state).detach() 90 | q_value = reward + self.hyper_params.gamma * next_value * mask 91 | q_value = q_value.to(self.device) 92 | 93 | # advantage = Q_t - V(s_t) 94 | advantage = q_value - pred_value 95 | 96 | # calculate loss at the current step 97 | policy_loss = -advantage.detach() * log_prob # adv. is not backpropagated 98 | policy_loss += self.hyper_params.w_entropy * -log_prob # entropy 99 | value_loss = F.smooth_l1_loss(pred_value, q_value.detach()) 100 | 101 | # train 102 | gradient_clip_ac = self.hyper_params.gradient_clip_ac 103 | gradient_clip_cr = self.hyper_params.gradient_clip_cr 104 | 105 | self.actor_optim.zero_grad() 106 | policy_loss.backward() 107 | clip_grad_norm_(self.actor.parameters(), gradient_clip_ac) 108 | self.actor_optim.step() 109 | 110 | self.critic_optim.zero_grad() 111 | value_loss.backward() 112 | clip_grad_norm_(self.critic.parameters(), gradient_clip_cr) 113 | self.critic_optim.step() 114 | 115 | return policy_loss.item(), value_loss.item() 116 | 117 | def save_params(self, n_episode: int): 118 | """Save model and optimizer parameters.""" 119 | params = { 120 | "actor_state_dict": self.actor.state_dict(), 121 | "critic_state_dict": self.critic.state_dict(), 122 | "actor_optim_state_dict": self.actor_optim.state_dict(), 123 | "critic_optim_state_dict": self.critic_optim.state_dict(), 124 | } 125 | 126 | Learner._save_params(self, params, n_episode) 127 | 128 | def load_params(self, path: str): 129 | """Load model and optimizer parameters.""" 130 | Learner.load_params(self, path) 131 | 132 | params = torch.load(path) 133 | self.actor.load_state_dict(params["actor_state_dict"]) 134 | self.critic.load_state_dict(params["critic_state_dict"]) 135 | self.actor_optim.load_state_dict(params["actor_optim_state_dict"]) 136 | self.critic_optim.load_state_dict(params["critic_optim_state_dict"]) 137 | print("[INFO] Loaded the model and optimizer from", path) 138 | 139 | def get_state_dict(self) -> Tuple[OrderedDict]: 140 | """Return state dicts, mainly for distributed worker.""" 141 | return (self.critic.state_dict(), self.actor.state_dict()) 142 | 143 | def get_policy(self) -> nn.Module: 144 | """Return model (policy) used for action selection.""" 145 | return self.actor 146 | -------------------------------------------------------------------------------- /rl_algorithms/common/abstract/distributed_worker.py: -------------------------------------------------------------------------------- 1 | """Worker classes for distributed training. 
2 | 3 | - Author: Chris Yoon 4 | - Contact: chris.yoon@medipixel.io 5 | """ 6 | 7 | from abc import ABC, abstractmethod 8 | import os 9 | import random 10 | from typing import Deque, Dict, Tuple 11 | 12 | import gym 13 | import numpy as np 14 | import torch 15 | 16 | from rl_algorithms.common.env.atari_wrappers import atari_env_generator 17 | import rl_algorithms.common.env.utils as env_utils 18 | from rl_algorithms.common.helper_functions import numpy2floattensor, set_random_seed 19 | from rl_algorithms.common.networks.brain import Brain 20 | from rl_algorithms.utils.config import ConfigDict 21 | 22 | 23 | class BaseDistributedWorker(ABC): 24 | """Base class for Worker classes.""" 25 | 26 | @abstractmethod 27 | def select_action(self, state: np.ndarray) -> np.ndarray: 28 | pass 29 | 30 | @abstractmethod 31 | def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool, dict]: 32 | pass 33 | 34 | @abstractmethod 35 | def synchronize(self, new_state_dict: Dict[str, np.ndarray]): 36 | pass 37 | 38 | # pylint: disable=no-self-use 39 | def _synchronize(self, network: Brain, new_state_dict: Dict[str, np.ndarray]): 40 | """Copy parameters from numpy arrays.""" 41 | param_name_list = list(new_state_dict.keys()) 42 | for worker_named_param in network.named_parameters(): 43 | worker_param_name = worker_named_param[0] 44 | if worker_param_name in param_name_list: 45 | new_param = numpy2floattensor( 46 | new_state_dict[worker_param_name], self.device 47 | ) 48 | worker_named_param[1].data.copy_(new_param) 49 | 50 | 51 | class DistributedWorker(BaseDistributedWorker): 52 | """Base class for all functioning RL workers. 53 | 54 | Attributes: 55 | rank (int): rank (ID) of worker 56 | hyper_params (ConfigDict): algorithm hyperparameters 57 | device (torch.device): device on which worker process runs 58 | env (gym.Env): gym environment 59 | """ 60 | 61 | def __init__( 62 | self, 63 | rank: int, 64 | device: str, 65 | hyper_params: ConfigDict, 66 | env_name: str, 67 | is_atari: bool, 68 | max_episode_steps: int, 69 | ): 70 | """Initialize.""" 71 | self.rank = rank 72 | self.device = torch.device(device) 73 | 74 | self.hyper_params = hyper_params 75 | self.env_name = env_name 76 | self.is_atari = is_atari 77 | self.max_episode_steps = max_episode_steps 78 | 79 | self._init_env() 80 | 81 | # pylint: disable=attribute-defined-outside-init, no-self-use 82 | def _init_env(self): 83 | """Initialize worker local environment.""" 84 | if self.is_atari: 85 | self.env = atari_env_generator( 86 | self.env_name, self.max_episode_steps, frame_stack=True 87 | ) 88 | else: 89 | self.env = gym.make(self.env_name) 90 | env_utils.set_env(self.env, self.max_episode_steps) 91 | 92 | random.seed(self.rank) 93 | env_seed = random.randint(0, 999) 94 | set_random_seed(env_seed, self.env) 95 | 96 | @abstractmethod 97 | def load_params(self, path: str): 98 | if not os.path.exists(path): 99 | raise Exception( 100 | f"[ERROR] the input path does not exist.
Wrong path: {path}" 101 | ) 102 | 103 | @abstractmethod 104 | def select_action(self, state: np.ndarray) -> np.ndarray: 105 | pass 106 | 107 | @abstractmethod 108 | def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool, dict]: 109 | pass 110 | 111 | # NOTE: No need to explicitly implement for non-PER/non-Ape-X workers 112 | @abstractmethod 113 | def compute_priorities(self, experience: Dict[str, np.ndarray]): 114 | pass 115 | 116 | @abstractmethod 117 | def synchronize(self, new_state_dict: Dict[str, np.ndarray]): 118 | pass 119 | 120 | @staticmethod 121 | def _preprocess_state(state: np.ndarray, device: torch.device) -> torch.Tensor: 122 | """Preprocess state so that actor selects an action.""" 123 | state = numpy2floattensor(state, device) 124 | return state 125 | 126 | 127 | class DistributedWorkerWrapper(BaseDistributedWorker): 128 | """Base wrapper class for distributed worker wrappers.""" 129 | 130 | def __init__(self, worker: DistributedWorker, comm_cfg: ConfigDict): 131 | self.worker = worker 132 | self.comm_cfg = comm_cfg 133 | 134 | @abstractmethod 135 | def init_communication(self): 136 | pass 137 | 138 | def select_action(self, state: np.ndarray) -> np.ndarray: 139 | """Select an action from the input space.""" 140 | return self.worker.select_action(state) 141 | 142 | def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool, dict]: 143 | """Take an action and return the response of the env.""" 144 | return self.worker.step(action) 145 | 146 | def synchronize(self, new_state_dict: Dict[str, np.ndarray]): 147 | """Synchronize worker brain with learner brain.""" 148 | self.worker.synchronize(new_state_dict) 149 | 150 | @abstractmethod 151 | def collect_data(self) -> Dict[str, np.ndarray]: 152 | pass 153 | 154 | @abstractmethod 155 | def run(self): 156 | pass 157 | 158 | def preprocess_nstep(self, nstepqueue: Deque) -> Tuple[np.ndarray, ...]: 159 | """Return n-step transition with discounted reward.""" 160 | discounted_reward = 0 161 | _, _, _, last_state, done = nstepqueue[-1] 162 | for transition in list(reversed(nstepqueue)): 163 | state, action, reward, _, _ = transition 164 | discounted_reward = reward + self.hyper_params.gamma * discounted_reward 165 | nstep_data = (state, action, discounted_reward, last_state, done) 166 | 167 | return nstep_data 168 | --------------------------------------------------------------------------------
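Note on the n-step preprocessing above: DistributedWorkerWrapper.preprocess_nstep folds a queue of 1-step transitions into a single transition whose reward is the discounted sum of the queued rewards, keyed to the first state/action and the last next-state/done flag. A minimal standalone sketch of that computation, using hypothetical toy transitions and an illustrative gamma of 0.99 (not taken from any config in this repository):

from collections import deque

gamma = 0.99  # illustrative discount factor (assumption, not a repository config value)
nstepqueue = deque(
    [
        # (state, action, reward, next_state, done)
        ("s0", 0, 1.0, "s1", False),
        ("s1", 1, 2.0, "s2", False),
        ("s2", 0, 3.0, "s3", True),
    ]
)

# Accumulate the discounted return from the newest transition backwards,
# mirroring the loop in DistributedWorkerWrapper.preprocess_nstep.
discounted_reward = 0.0
_, _, _, last_state, done = nstepqueue[-1]
for state, action, reward, _, _ in reversed(nstepqueue):
    discounted_reward = reward + gamma * discounted_reward

# After the loop, state/action come from the oldest queued transition.
nstep_data = (state, action, discounted_reward, last_state, done)
print(nstep_data)  # ('s0', 0, 1.0 + 0.99 * (2.0 + 0.99 * 3.0), 's3', True)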