├── rl_algorithms ├── version ├── acer │ ├── __init__.py │ └── buffer.py ├── common │ ├── apex │ │ ├── __init__.py │ │ └── learner.py │ ├── abstract │ │ ├── __init__.py │ │ ├── architecture.py │ │ ├── reward_fn.py │ │ ├── buffer.py │ │ ├── her.py │ │ ├── learner.py │ │ └── distributed_worker.py │ ├── __init__.py │ ├── env │ │ ├── __init__.py │ │ ├── normalizers.py │ │ └── utils.py │ ├── buffer │ │ ├── __init__.py │ │ ├── gail_buffer.py │ │ ├── distillation_buffer.py │ │ └── segment_tree.py │ ├── networks │ │ ├── __init__.py │ │ └── backbones │ │ │ ├── __init__.py │ │ │ └── cnn.py │ ├── noise.py │ ├── saliency_map.py │ └── grad_cam.py ├── a2c │ ├── __init__.py │ └── learner.py ├── bc │ ├── __init__.py │ ├── ddpg_learner.py │ ├── sac_learner.py │ └── her.py ├── ddpg │ └── __init__.py ├── dqn │ ├── __init__.py │ ├── distributed_logger.py │ ├── linear.py │ └── distributed_worker.py ├── fd │ ├── __init__.py │ ├── dqn_learner.py │ ├── ddpg_learner.py │ └── sac_learner.py ├── gail │ ├── __init__.py │ ├── utils.py │ └── networks.py ├── ppo │ ├── __init__.py │ └── utils.py ├── sac │ └── __init__.py ├── td3 │ └── __init__.py ├── recurrent │ └── __init__.py ├── distillation │ ├── __init__.py │ └── README.md ├── utils │ ├── __init__.py │ ├── config.py │ └── registry.py ├── registry.py └── __init__.py ├── configs ├── reacher_v2 │ ├── __init__.py │ ├── td3.yaml │ ├── ddpg.yaml │ ├── bc_ddpg.yaml │ ├── sac.yaml │ └── bc_sac.yaml ├── lunarlander_v2 │ ├── __init__.py │ ├── acer.yaml │ ├── ppo.yaml │ ├── dqn.yaml │ ├── r2d1.yaml │ ├── distillation_dqn.yaml │ └── dqfd.yaml ├── pong_no_frameskip_v4 │ ├── __init__.py │ ├── ppo.yaml │ ├── r2d1.yaml │ ├── dqn.yaml │ ├── dqn_resnet.yaml │ ├── apex_dqn.yaml │ └── distillation_dqn.yaml └── lunarlander_continuous_v2 │ ├── __init__.py │ ├── a2c.yaml │ ├── td3.yaml │ ├── ddpg.yaml │ ├── ppo.yaml │ ├── bc_ddpg.yaml │ ├── ddpgfd.yaml │ ├── sac.yaml │ ├── bc_sac.yaml │ ├── gail_ppo.yaml │ └── sacfd.yaml ├── MANIFEST.in ├── tools ├── run_test.sh ├── check_version.sh ├── run_reacher_v2.sh ├── run_descrete_env.sh └── run_lunarlander_continuous_v2.sh ├── data ├── reacher_demo.pkl ├── lunarlander_discrete_demo.pkl └── lunarlander_continuous_demo.pkl ├── mypy.ini ├── .isort.cfg ├── .gitignore ├── .flake8 ├── .pre-commit-config.yaml ├── .github ├── CODEOWNERS └── workflows │ └── python-publish.yaml ├── requirements.txt ├── Jenkinsfile ├── requirements-dev.txt ├── Makefile ├── Dockerfile ├── LICENSE.md ├── setup.py ├── tests ├── buffer │ ├── test_uniform_buffer.py │ ├── test_distillation_buffer.py │ └── test_prioritized_buffer.py ├── test_helper_funcion.py ├── integration │ ├── test_run_apex.py │ ├── test_run_agent.py │ └── test_run_distillation_agent.py ├── test_config_registry.py └── test_cnn_cfg.py ├── .all-contributorsrc ├── run_lunarlander_v2.py ├── run_lunarlander_continuous_v2.py ├── run_reacher_v2.py └── run_pong_no_frameskip_v4.py /rl_algorithms/version: -------------------------------------------------------------------------------- 1 | 1.2.0 -------------------------------------------------------------------------------- /rl_algorithms/acer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_algorithms/common/apex/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_algorithms/common/abstract/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /configs/reacher_v2/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/bc/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/ddpg/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/dqn/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/fd/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/gail/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/sac/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/td3/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /configs/lunarlander_v2/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/common/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/common/env/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/recurrent/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include */version -------------------------------------------------------------------------------- /configs/pong_no_frameskip_v4/__init__.py: -------------------------------------------------------------------------------- 1 | 
"""Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/common/buffer/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/common/networks/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /rl_algorithms/distillation/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/__init__.py: -------------------------------------------------------------------------------- 1 | """Empty.""" 2 | -------------------------------------------------------------------------------- /tools/run_test.sh: -------------------------------------------------------------------------------- 1 | sh ./tools/run_lunarlander_continuous_v2.sh 2 | sh ./tools/run_descrete_env.sh -------------------------------------------------------------------------------- /data/reacher_demo.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/medipixel/rl_algorithms/HEAD/data/reacher_demo.pkl -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | # Global options: 2 | 3 | [mypy] 4 | python_version = 3.6 5 | ignore_missing_imports = True 6 | -------------------------------------------------------------------------------- /data/lunarlander_discrete_demo.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/medipixel/rl_algorithms/HEAD/data/lunarlander_discrete_demo.pkl -------------------------------------------------------------------------------- /data/lunarlander_continuous_demo.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/medipixel/rl_algorithms/HEAD/data/lunarlander_continuous_demo.pkl -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | multi_line_output=3 3 | include_trailing_comma=True 4 | force_grid_wrap=0 5 | combine_as_imports=True 6 | line_length=88 7 | force_sort_within_sections=True 8 | known_third_party=wandb, ray 9 | -------------------------------------------------------------------------------- /rl_algorithms/common/networks/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from rl_algorithms.common.networks.backbones.cnn import CNN 2 | from rl_algorithms.common.networks.backbones.resnet import ResNet 3 | 4 | __all__ = [ 5 | "CNN", 6 | "ResNet", 7 | ] 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | wandb* 3 | checkpoint 4 | .idea 5 | .mypy_cache 6 | .vscode 7 | MUJOCO_LOG.TXT 8 | .coverage 9 | 10 | # build 11 | build 12 | dist 13 | *.egg-info 14 | 15 | # data 16 | data/distillation_buffer 17 | data/saliency_map/ 18 | 
-------------------------------------------------------------------------------- /rl_algorithms/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import YamlConfig 2 | from .registry import Registry, build_from_cfg, build_ray_obj_from_cfg 3 | 4 | __all__ = [ 5 | "Registry", 6 | "build_from_cfg", 7 | "build_ray_obj_from_cfg", 8 | "YamlConfig", 9 | ] 10 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | # This is an example .flake8 config, used when developing *Black* itself. 2 | # Keep in sync with setup.cfg which is used for source packages. 3 | 4 | [flake8] 5 | ignore = E203, E266, E501, W503 6 | max-line-length = 88 7 | max-complexity = 18 8 | select = C,E,F,W,B,B950 9 | -------------------------------------------------------------------------------- /rl_algorithms/gail/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def compute_gail_reward(discriminator_score: torch.Tensor): 5 | """Compute gail(imitation) reward of data generated by policy.""" 6 | return ( 7 | -torch.log(torch.sigmoid(discriminator_score) + 1e-8).detach().cpu().numpy()[0] 8 | ) 9 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: format 5 | name: format 6 | language: system 7 | entry: make format 8 | types: [python] 9 | - id: test 10 | name: test 11 | language: system 12 | entry: make test 13 | types: [python] 14 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Lines starting with '#' are comments. 2 | # Reference: https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners 3 | # Each line is a file pattern followed by one or more owners. 4 | 5 | # These owners will be the default owners for everything in the repo. 6 | * @isk03276 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.18.0 2 | torch==1.6.0 3 | gym==0.17.3 4 | atari-py==0.2.6 5 | box2d-py==2.3.8 6 | scipy==1.5.4 7 | opencv-python==4.4.0.46 8 | addict==2.4.0 9 | tqdm 10 | 11 | # for distributed learning 12 | redis==3.5.3 # for ray 13 | ray==1.3.0 14 | pyzmq==20.0.0 15 | pyarrow==3.0.0 16 | 17 | # for log 18 | six>=1.13.0 19 | wandb==0.10.11 20 | matplotlib==3.3.3 21 | plotly==4.13.0 22 | -------------------------------------------------------------------------------- /Jenkinsfile: -------------------------------------------------------------------------------- 1 | pipeline { 2 | agent { 3 | dockerfile{ 4 | filename "Dockerfile" 5 | args "-v /home/mpadmin/.ssh/:/root/.ssh/" 6 | } 7 | } 8 | stages { 9 | stage('Test') { 10 | steps { 11 | echo 'Testing...' 
12 | sh 'make jenkins-dev' 13 | sh 'make test' 14 | sh 'make integration-test' 15 | } 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pre-commit 2 | 3 | # formatting 4 | black == 21.6b0 5 | isort == 5.6.4 6 | 7 | # testing 8 | pylint == 2.6.0 9 | mypy == 0.761 10 | flake8 == 3.8.4 11 | flake8-bugbear == 20.11.1 12 | flake8-docstrings == 1.5.0 13 | pluggy == 0.13.1 14 | pytest == 6.1.2 15 | pytest-pylint == 0.18.0 16 | pytest-flake8 == 1.0.6 17 | pytest-mypy == 0.4.2 18 | pytest-cov == 2.10.1 19 | 20 | # for jenkins 21 | setuptools==40.3.0 -------------------------------------------------------------------------------- /tools/check_version.sh: -------------------------------------------------------------------------------- 1 | TAG=$1 2 | TAG=`echo $TAG | sed -e "s/v//g"` 3 | 4 | PACKAGE_NAME="rl_algorithms" 5 | 6 | echo "A tag triggered build. $TAG" 7 | 8 | VERSION=`cat "$PACKAGE_NAME"/version` 9 | echo "The version is $VERSION" 10 | 11 | if [ "$VERSION" = "$TAG" ]; then 12 | echo "Version is correct! Deploy to local server." 13 | 14 | else 15 | echo "Tag and version are not same. Check again." 16 | exit 1 17 | fi; 18 | -------------------------------------------------------------------------------- /tools/run_reacher_v2.sh: -------------------------------------------------------------------------------- 1 | python run_reacher_v2.py --cfg-path ./configs/reacher_v2/ddpg.yaml --off-render --log 2 | python run_reacher_v2.py --cfg-path ./configs/reacher_v2/sac.yaml --off-render --log 3 | python run_reacher_v2.py --cfg-path ./configs/reacher_v2/td3.yaml --off-render --log 4 | python run_reacher_v2.py --cfg-path ./configs/reacher_v2/bc_ddpg.yaml --off-render --log 5 | python run_reacher_v2.py --cfg-path ./configs/reacher_v2/bc_sac.yaml --off-render --log -------------------------------------------------------------------------------- /rl_algorithms/common/abstract/architecture.py: -------------------------------------------------------------------------------- 1 | """Abstract class for distributed architectures. 2 | 3 | - Author: Chris Yoon 4 | - Contact: chris.yoon@medipixel.io 5 | """ 6 | 7 | from abc import ABC, abstractmethod 8 | 9 | 10 | class Architecture(ABC): 11 | """Abstract class for distributed architectures""" 12 | 13 | @abstractmethod 14 | def _spawn(self): 15 | pass 16 | 17 | @abstractmethod 18 | def train(self): 19 | pass 20 | 21 | @abstractmethod 22 | def test(self): 23 | pass 24 | -------------------------------------------------------------------------------- /rl_algorithms/common/abstract/reward_fn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Abstract class for computing reward. 3 | 4 | - Author: Kyunghwan Kim 5 | - Contact: kh.kim@medipixel.io 6 | """ 7 | from abc import ABC, abstractmethod 8 | 9 | import numpy as np 10 | 11 | 12 | class RewardFn(ABC): 13 | """Abstract class for computing reward. 
14 | New compute_reward class should redefine __call__() 15 | 16 | """ 17 | 18 | @abstractmethod 19 | def __call__(self, transition: tuple, goal_state: np.ndarray) -> np.float64: 20 | pass 21 | -------------------------------------------------------------------------------- /configs/lunarlander_v2/acer.yaml: -------------------------------------------------------------------------------- 1 | type: ACERAgent 2 | hyper_params: 3 | gamma: 0.98 4 | c: 1 5 | buffer_size: 5000 6 | n_rollout: 10 7 | replay_ratio: 16 8 | start_from: 100 9 | gradient_clip: 10 10 | tau: 0.005 11 | 12 | learner_cfg: 13 | type: ACERLearner 14 | backbone: 15 | actor: 16 | critic: 17 | shared_actor_critic: 18 | head: 19 | actor: 20 | type: MLP 21 | configs: 22 | hidden_sizes: [256, 256] 23 | output_activation: identity 24 | critic: 25 | type: MLP 26 | configs: 27 | hidden_sizes: [256, 256] 28 | output_activation: identity 29 | optim_cfg: 30 | lr: 0.0002 31 | weight_decay: 0.0 32 | adam_eps: 0.00000001 33 | trust_region: 34 | use_trust_region: true 35 | delta: 1 36 | -------------------------------------------------------------------------------- /tools/run_descrete_env.sh: -------------------------------------------------------------------------------- 1 | python run_lunarlander_v2.py --cfg-path ./configs/lunarlander_v2/dqn.yaml --off-render --log 2 | python run_lunarlander_v2.py --cfg-path ./configs/lunarlander_v2/dqfd.yaml --off-render --log 3 | python run_lunarlander_v2.py --cfg-path ./configs/lunarlander_v2/r2d1.yaml --off-render --log 4 | 5 | python run_pong_no_frameskip_v4.py --cfg-path ./configs/pong_no_frameskip_v4/dqn.yaml --off-render --log 6 | python run_pong_no_frameskip_v4.py --cfg-path ./configs/pong_no_frameskip_v4/r2d1.yaml --off-render --log 7 | python run_pong_no_frameskip_v4.py --cfg-path configs/pong_no_frameskip_v4/apex_dqn.yaml --off-render --log 8 | 9 | python run_pong_no_frameskip_v4.py --cfg-path ./configs/pong_no_frameskip_v4/dqn_resnet.yaml --off-render --log -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/a2c.yaml: -------------------------------------------------------------------------------- 1 | type: "A2CAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | w_entropy: 0.001 5 | gradient_clip_ac: 0.1 6 | gradient_clip_cr: 0.5 7 | 8 | learner_cfg: 9 | type: "A2CLearner" 10 | backbone: 11 | actor: 12 | critic: 13 | shared_actor_critic: 14 | head: 15 | actor: 16 | type: "GaussianDist" 17 | configs: 18 | hidden_sizes: [256, 256] 19 | output_activation: "identity" 20 | fixed_logstd: True 21 | critic: 22 | type: "MLP" 23 | configs: 24 | hidden_sizes: [256, 256] 25 | output_size: 1 26 | output_activation: "identity" 27 | optim_cfg: 28 | lr_actor: 0.00004 29 | lr_critic: 0.0003 30 | weight_decay: 0.0 31 | -------------------------------------------------------------------------------- /configs/reacher_v2/td3.yaml: -------------------------------------------------------------------------------- 1 | type: "TD3Agent" 2 | hyper_params: 3 | gamma: 0.95 4 | tau: 0.005 5 | buffer_size: 100000 6 | batch_size: 100 7 | initial_random_action: 10000 8 | policy_update_freq: 2 9 | 10 | learner_cfg: 11 | type: "TD3Learner" 12 | backbone: 13 | actor: 14 | critic: 15 | head: 16 | actor: 17 | type: "MLP" 18 | configs: 19 | hidden_sizes: [400, 300] 20 | output_activation: "tanh" 21 | critic: 22 | type: "MLP" 23 | configs: 24 | hidden_sizes: [400, 300] 25 | output_size: 1 26 | output_activation: "identity" 27 | optim_cfg: 28 | lr_actor: 0.001 29 | lr_critic: 
0.001 30 | weight_decay: 0.0 31 | 32 | noise_cfg: 33 | exploration_noise: 0.1 34 | target_policy_noise: 0.2 35 | target_policy_noise_clip: 0.5 36 | -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/td3.yaml: -------------------------------------------------------------------------------- 1 | type: "TD3Agent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 100000 6 | batch_size: 100 7 | initial_random_action: 10000 8 | policy_update_freq: 2 9 | 10 | learner_cfg: 11 | type: "TD3Learner" 12 | backbone: 13 | actor: 14 | critic: 15 | shared_actor_critic: 16 | head: 17 | actor: 18 | type: "MLP" 19 | configs: 20 | hidden_sizes: [400, 300] 21 | output_activation: "tanh" 22 | critic: 23 | type: "MLP" 24 | configs: 25 | hidden_sizes: [400, 300] 26 | output_size: 1 27 | output_activation: "identity" 28 | optim_cfg: 29 | lr_actor: 0.001 30 | lr_critic: 0.001 31 | weight_decay: 0.0 32 | 33 | noise_cfg: 34 | exploration_noise: 0.1 35 | target_policy_noise: 0.2 36 | target_policy_noise_clip: 0.5 37 | -------------------------------------------------------------------------------- /configs/reacher_v2/ddpg.yaml: -------------------------------------------------------------------------------- 1 | type: "DDPGAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.001 5 | buffer_size: 10000 6 | batch_size: 128 7 | initial_random_action: 10000 8 | multiple_update: 1 # multiple learning updates 9 | gradient_clip_ac: 0.5 10 | gradient_clip_cr: 1.0 11 | 12 | learner_cfg: 13 | type: "DDPGLearner" 14 | backbone: 15 | actor: 16 | critic: 17 | head: 18 | actor: 19 | type: "MLP" 20 | configs: 21 | hidden_sizes: [256, 256] 22 | output_activation: "tanh" 23 | critic: 24 | type: "MLP" 25 | configs: 26 | hidden_sizes: [256, 256] 27 | output_size: 1 28 | output_activation: "identity" 29 | optim_cfg: 30 | lr_actor: 0.001 31 | lr_critic: 0.001 32 | weight_decay: 0.000001 33 | 34 | noise_cfg: 35 | ou_noise_theta: 0.0 36 | ou_noise_sigma: 0.0 37 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | format: 2 | black . --exclude checkpoint wandb 3 | isort . --skip checkpoint --skip wandb --skip data 4 | 5 | test: 6 | black . --check 7 | isort . --check --diff --skip checkpoint --skip wandb --skip data 8 | env PYTHONPATH=. pytest --pylint --flake8 --cov=tests --ignore=checkpoint --ignore=data --ignore=wandb --ignore tests/integration 9 | 10 | integration-test: 11 | env PYTHONPATH=. pytest tests/integration --cov=tests 12 | 13 | docker-push: 14 | docker build -t medipixel/rl_algorithms . 
15 | docker push medipixel/rl_algorithms 16 | 17 | dev: 18 | pip install -U -r requirements.txt 19 | pip install -U -r requirements-dev.txt 20 | pre-commit install 21 | python setup.py develop 22 | 23 | dep: 24 | pip install -U -r requirements.txt 25 | python setup.py install 26 | 27 | jenkins-dev: 28 | pip install -U -r requirements-dev.txt 29 | python setup.py develop -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/ddpg.yaml: -------------------------------------------------------------------------------- 1 | type: "DDPGAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 10000 6 | batch_size: 64 7 | initial_random_action: 10000 8 | multiple_update: 1 # multiple learning updates 9 | gradient_clip_ac: 0.5 10 | gradient_clip_cr: 1.0 11 | 12 | learner_cfg: 13 | type: "DDPGLearner" 14 | backbone: 15 | actor: 16 | critic: 17 | shared_actor_critic: 18 | head: 19 | actor: 20 | type: "MLP" 21 | configs: 22 | hidden_sizes: [256, 256] 23 | output_activation: "tanh" 24 | critic: 25 | type: "MLP" 26 | configs: 27 | hidden_sizes: [256, 256] 28 | output_size: 1 29 | output_activation: "identity" 30 | optim_cfg: 31 | lr_actor: 0.0003 32 | lr_critic: 0.0003 33 | weight_decay: 0.000001 34 | 35 | noise_cfg: 36 | ou_noise_theta: 0.0 37 | ou_noise_sigma: 0.0 38 | -------------------------------------------------------------------------------- /configs/lunarlander_v2/ppo.yaml: -------------------------------------------------------------------------------- 1 | type: "PPOAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.95 5 | batch_size: 32 6 | max_epsilon: 0.2 7 | min_epsilon: 0.2 8 | epsilon_decay_period: 1500 9 | w_value: 1.0 10 | w_entropy: 0.001 11 | gradient_clip_ac: 0.5 12 | gradient_clip_cr: 1.0 13 | epoch: 16 14 | rollout_len: 256 15 | n_workers: 12 16 | use_clipped_value_loss: False 17 | standardize_advantage: True 18 | 19 | learner_cfg: 20 | type: "PPOLearner" 21 | backbone: 22 | actor: 23 | critic: 24 | shared_actor_critic: 25 | head: 26 | actor: 27 | type: "CategoricalDist" 28 | configs: 29 | hidden_sizes: [256, 256] 30 | output_activation: "identity" 31 | critic: 32 | type: "MLP" 33 | configs: 34 | hidden_sizes: [256, 256] 35 | output_size: 1 36 | output_activation: "identity" 37 | optim_cfg: 38 | lr_actor: 0.0003 39 | lr_critic: 0.001 40 | weight_decay: 0.0 41 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | 3 | RUN rm /etc/apt/sources.list.d/cuda.list 4 | RUN rm /etc/apt/sources.list.d/nvidia-ml.list 5 | 6 | RUN apt-get update 7 | RUN apt-get install -y software-properties-common vim 8 | RUN apt-get install -y libsm6 libxext6 libxrender-dev libusb-1.0-0-dev && apt-get update 9 | RUN apt-get install -y git 10 | RUN apt-get install -y python3-pip python3-dev \ 11 | && cd /usr/local/bin \ 12 | && ln -s /usr/bin/python3 python \ 13 | && pip3 install --upgrade pip 14 | 15 | # set workspace 16 | RUN mkdir /workspace/ 17 | WORKDIR /workspace 18 | 19 | COPY requirements.txt /workspace/requirements.txt 20 | RUN pip install -U Cython numpy 21 | RUN pip install -U -r requirements.txt 22 | 23 | # set cuda path 24 | ENV CUDA_HOME /usr/local/cuda 25 | ENV PATH "/usr/local/cuda/bin:$PATH" 26 | ENV LD_LIBRARY_PATH "$LD_LIBRARY_PATH:/usr/local/cuda/lib64" 27 | ENV LIBRARY_PATH "$LIBRARY_PATH:/usr/local/cuda/lib64" 28 | 29 | RUN apt-get 
update && apt-get install -y libgl1-mesa-glx -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/ppo.yaml: -------------------------------------------------------------------------------- 1 | type: "PPOAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.95 5 | batch_size: 32 6 | max_epsilon: 0.2 7 | min_epsilon: 0.2 8 | epsilon_decay_period: 1500 9 | w_value: 1.0 10 | w_entropy: 0.001 11 | gradient_clip_ac: 0.5 12 | gradient_clip_cr: 1.0 13 | epoch: 16 14 | rollout_len: 256 15 | n_workers: 12 16 | use_clipped_value_loss: False 17 | standardize_advantage: True 18 | 19 | learner_cfg: 20 | type: "PPOLearner" 21 | backbone: 22 | actor: 23 | critic: 24 | shared_actor_critic: 25 | head: 26 | actor: 27 | type: "GaussianDist" 28 | configs: 29 | hidden_sizes: [256, 256] 30 | hidden_activation: "tanh" 31 | output_activation: "identity" 32 | fixed_logstd: True 33 | critic: 34 | type: "MLP" 35 | configs: 36 | hidden_sizes: [256, 256] 37 | output_size: 1 38 | output_activation: "identity" 39 | optim_cfg: 40 | lr_actor: 0.0003 41 | lr_critic: 0.001 42 | weight_decay: 0.0 43 | -------------------------------------------------------------------------------- /configs/reacher_v2/bc_ddpg.yaml: -------------------------------------------------------------------------------- 1 | type: "BCDDPGAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.001 5 | buffer_size: 100000 6 | batch_size: 512 7 | initial_random_action: 10000 8 | multiple_update: 1 # multiple learning updates 9 | gradient_clip_ac: 0.5 10 | gradient_clip_cr: 1.0 11 | # BC 12 | demo_batch_size: 64 13 | lambda1: 0.001 14 | demo_path: "data/reacher_demo.pkl" 15 | # HER 16 | use_her: False 17 | her: 18 | type: ReacherHER 19 | success_score: -5.0 20 | desired_states_from_demo: False 21 | 22 | learner_cfg: 23 | type: "BCDDPGLearner" 24 | backbone: 25 | actor: 26 | critic: 27 | head: 28 | actor: 29 | type: "MLP" 30 | configs: 31 | hidden_sizes: [256, 256] 32 | output_activation: "tanh" 33 | critic: 34 | type: "MLP" 35 | configs: 36 | hidden_sizes: [256, 256] 37 | output_size: 1 38 | output_activation: "identity" 39 | optim_cfg: 40 | lr_actor: 0.0001 41 | lr_critic: 0.001 42 | weight_decay: 0.000001 43 | 44 | noise_cfg: 45 | ou_noise_theta: 0.0 46 | ou_noise_sigma: 0.0 47 | -------------------------------------------------------------------------------- /configs/lunarlander_v2/dqn.yaml: -------------------------------------------------------------------------------- 1 | type: "DQNAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 100000 # openai baselines: 10000 6 | batch_size: 64 # openai baselines: 32 7 | update_starts_from: 10000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 1 # in openai baselines, train_freq = 4 10 | gradient_clip: 10.0 # dueling: 10.0 11 | n_step: 3 12 | w_n_step: 1.0 13 | w_q_reg: 0.0000001 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.000001 17 | max_epsilon: 1.0 18 | min_epsilon: 0.01 # openai baselines: 0.01 19 | epsilon_decay: 0.00001 # openai baselines: 1e-7 / 1e-1 20 | 21 | learner_cfg: 22 | type: "DQNLearner" 23 | loss_type: 24 | type: "C51Loss" 25 | backbone: 26 | head: 27 | type: "C51DuelingMLP" 28 | configs: 29 | hidden_sizes: [128, 64] 30 | v_min: -300 31 | v_max: 300 32 | atom_size: 1530 33 | output_activation: "identity" 34 | use_noisy_net: False 35 | optim_cfg: 36 | lr_dqn: 0.0001 37 | weight_decay: 0.0000001 38 | adam_eps: 0.00000001 39 | 
-------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 Medipixel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/bc_ddpg.yaml: -------------------------------------------------------------------------------- 1 | type: "BCDDPGAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.001 5 | buffer_size: 100000 6 | batch_size: 512 7 | initial_random_action: 10000 8 | multiple_update: 1 # multiple learning updates 9 | gradient_clip_ac: 0.5 10 | gradient_clip_cr: 0.5 11 | # BC 12 | demo_batch_size: 64 13 | lambda1: 0.001 14 | demo_path: "data/lunarlander_continuous_demo.pkl" 15 | # HER 16 | use_her: False 17 | her: 18 | type: LunarLanderContinuousHER 19 | success_score: 250.0 20 | desired_states_from_demo: True 21 | 22 | learner_cfg: 23 | type: "BCDDPGLearner" 24 | backbone: 25 | actor: 26 | critic: 27 | shared_actor_critic: 28 | head: 29 | actor: 30 | type: "MLP" 31 | configs: 32 | hidden_sizes: [256, 256] 33 | output_activation: "tanh" 34 | critic: 35 | type: "MLP" 36 | configs: 37 | hidden_sizes: [256, 256] 38 | output_size: 1 39 | output_activation: "identity" 40 | optim_cfg: 41 | lr_actor: 0.0001 42 | lr_critic: 0.001 43 | weight_decay: 0.0001 44 | 45 | noise_cfg: 46 | ou_noise_theta: 0.0 47 | ou_noise_sigma: 0.0 48 | -------------------------------------------------------------------------------- /configs/reacher_v2/sac.yaml: -------------------------------------------------------------------------------- 1 | type: "SACAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 100000 6 | batch_size: 512 7 | initial_random_action: 20000 8 | multiple_update: 1 # multiple learning updates 9 | policy_update_freq: 2 10 | w_entropy: 0.001 11 | w_mean_reg: 0.001 12 | w_std_reg: 0.001 13 | w_pre_activation_reg: 0.0 14 | auto_entropy_tuning: True 15 | 16 | learner_cfg: 17 | type: "SACLearner" 18 | backbone: 19 | actor: 20 | critic_vf: 21 | critic_qf: 22 | head: 23 | actor: 24 | type: "TanhGaussianDistParams" 25 | configs: 26 | hidden_sizes: [256, 256] 27 | output_activation: "identity" 28 | critic_vf: 29 | type: "MLP" 30 | configs: 31 | hidden_sizes: [256, 256] 32 | output_size: 1 33 | output_activation: "identity" 34 | critic_qf: 35 | type: "MLP" 36 | configs: 37 
| hidden_sizes: [256, 256] 38 | output_size: 1 39 | output_activation: "identity" 40 | optim_cfg: 41 | lr_actor: 0.0003 42 | lr_vf: 0.0003 43 | lr_qf1: 0.0003 44 | lr_qf2: 0.0003 45 | lr_entropy: 0.0003 46 | weight_decay: 0.0 47 | -------------------------------------------------------------------------------- /configs/pong_no_frameskip_v4/ppo.yaml: -------------------------------------------------------------------------------- 1 | type: "PPOAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.95 5 | batch_size: 32 6 | max_epsilon: 0.2 7 | min_epsilon: 0.2 8 | epsilon_decay_period: 1500 9 | w_value: 1.0 10 | w_entropy: 0.001 11 | gradient_clip_ac: 0.5 12 | gradient_clip_cr: 1.0 13 | epoch: 16 14 | rollout_len: 256 15 | n_workers: 4 16 | use_clipped_value_loss: False 17 | standardize_advantage: True 18 | 19 | learner_cfg: 20 | type: "PPOLearner" 21 | backbone: 22 | actor: 23 | critic: 24 | shared_actor_critic: 25 | type: "CNN" 26 | configs: 27 | input_sizes: [4, 32, 64] 28 | output_sizes: [32, 64, 64] 29 | kernel_sizes: [8, 4, 3] 30 | strides: [4, 2, 1] 31 | paddings: [1, 0, 0] 32 | head: 33 | actor: 34 | type: "CategoricalDist" 35 | configs: 36 | hidden_sizes: [512] 37 | output_activation: "identity" 38 | critic: 39 | type: "MLP" 40 | configs: 41 | hidden_sizes: [512] 42 | output_size: 1 43 | output_activation: "identity" 44 | optim_cfg: 45 | lr_actor: 0.0003 46 | lr_critic: 0.001 47 | weight_decay: 0.0 48 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yaml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload package to local pypi server 5 | 6 | on: 7 | push: 8 | tags: 9 | - "v*" 10 | 11 | jobs: 12 | deploy: 13 | 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up Python 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: '3.x' 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install setuptools wheel twine 26 | - name: Check version 27 | run: | 28 | sh tools/check_version.sh ${GITHUB_REF#refs/*/} 29 | - name: Build and publish 30 | env: 31 | TWINE_REPOSITORY_URL: ${{ secrets.PYPI_REPOSITORY_URL }} 32 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 33 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 34 | run: | 35 | python setup.py sdist 36 | twine upload dist/* 37 | -------------------------------------------------------------------------------- /tools/run_lunarlander_continuous_v2.sh: -------------------------------------------------------------------------------- 1 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/a2c.yaml --off-render --log 2 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/ddpg.yaml --off-render --log 3 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/td3.yaml --off-render --log 4 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/sac.yaml --off-render --log 5 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/ppo.yaml --off-render --log 6 | python run_lunarlander_continuous_v2.py --cfg-path 
./configs/lunarlander_continuous_v2/bc_ddpg.yaml --off-render --log 7 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/bc_sac.yaml --off-render --log 8 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/per_ddpg.yaml --off-render --log 9 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/ddpgfd.yaml --off-render --log 10 | python run_lunarlander_continuous_v2.py --cfg-path ./configs/lunarlander_continuous_v2/sacfd.yaml --off-render --log -------------------------------------------------------------------------------- /rl_algorithms/common/env/normalizers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Collection of normalizers. 3 | 4 | - Author: Curt Park 5 | - Contact: curt.park@medipixel.io 6 | """ 7 | 8 | import gym 9 | import numpy as np 10 | 11 | 12 | class ActionNormalizer(gym.ActionWrapper): 13 | """Rescale and relocate the actions.""" 14 | 15 | def action(self, action: np.ndarray) -> np.ndarray: 16 | """Change the range (-1, 1) to (low, high).""" 17 | low = self.action_space.low 18 | high = self.action_space.high 19 | 20 | scale_factor = (high - low) / 2 21 | reloc_factor = high - scale_factor 22 | 23 | action = action * scale_factor + reloc_factor 24 | action = np.clip(action, low, high) 25 | 26 | return action 27 | 28 | def reverse_action(self, action: np.ndarray) -> np.ndarray: 29 | """Change the range (low, high) to (-1, 1).""" 30 | low = self.action_space.low 31 | high = self.action_space.high 32 | 33 | scale_factor = (high - low) / 2 34 | reloc_factor = high - scale_factor 35 | 36 | action = (action - reloc_factor) / scale_factor 37 | action = np.clip(action, -1.0, 1.0) 38 | 39 | return action 40 | -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/ddpgfd.yaml: -------------------------------------------------------------------------------- 1 | type: "DDPGfDAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 10000 6 | batch_size: 64 7 | initial_random_action: 10000 8 | multiple_update: 1 # multiple learning updates 9 | gradient_clip_ac: 0.5 10 | gradient_clip_cr: 1.0 11 | # fD 12 | per_alpha: 0.3 13 | per_beta: 1.0 14 | per_eps: 0.000006 15 | per_eps_demo: 1.0 16 | n_step: 1 17 | pretrain_step: 5000 18 | lambda1: 1.0 # N-step return weight 19 | # lambda2 = weight_decay 20 | lambda3: 1.0 # actor loss contribution of prior weight 21 | demo_path: "data/lunarlander_continuous_demo.pkl" 22 | 23 | learner_cfg: 24 | type: "DDPGfDLearner" 25 | backbone: 26 | actor: 27 | critic: 28 | shared_actor_critic: 29 | head: 30 | actor: 31 | type: "MLP" 32 | configs: 33 | hidden_sizes: [256, 256] 34 | output_activation: "tanh" 35 | critic: 36 | type: "MLP" 37 | configs: 38 | hidden_sizes: [256, 256] 39 | output_size: 1 40 | output_activation: "identity" 41 | optim_cfg: 42 | lr_actor: 0.0003 43 | lr_critic: 0.0003 44 | weight_decay: 0.0001 45 | 46 | noise_cfg: 47 | ou_noise_theta: 0.0 48 | ou_noise_sigma: 0.0 49 | -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/sac.yaml: -------------------------------------------------------------------------------- 1 | type: "SACAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 100000 6 | batch_size: 128 7 | initial_random_action: 5000 8 | multiple_update: 1 # multiple learning updates 9 | 
policy_update_freq: 2 10 | w_entropy: 0.001 11 | w_mean_reg: 0.0 12 | w_std_reg: 0.0 13 | w_pre_activation_reg: 0.0 14 | auto_entropy_tuning: True 15 | 16 | learner_cfg: 17 | type: "SACLearner" 18 | backbone: 19 | actor: 20 | critic_vf: 21 | critic_qf: 22 | shared_actor_critic: 23 | head: 24 | actor: 25 | type: "TanhGaussianDistParams" 26 | configs: 27 | hidden_sizes: [256, 256] 28 | output_activation: "identity" 29 | fixed_logstd: False 30 | critic_vf: 31 | type: "MLP" 32 | configs: 33 | hidden_sizes: [256, 256] 34 | output_size: 1 35 | output_activation: "identity" 36 | critic_qf: 37 | type: "MLP" 38 | configs: 39 | hidden_sizes: [256, 256] 40 | output_size: 1 41 | output_activation: "identity" 42 | optim_cfg: 43 | lr_actor: 0.0003 44 | lr_vf: 0.0003 45 | lr_qf1: 0.0003 46 | lr_qf2: 0.0003 47 | lr_entropy: 0.0003 48 | weight_decay: 0.0 49 | -------------------------------------------------------------------------------- /configs/lunarlander_v2/r2d1.yaml: -------------------------------------------------------------------------------- 1 | type: "R2D1Agent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 10000 # openai baselines: 10000 6 | batch_size: 64 # openai baselines: 32 7 | update_starts_from: 1000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 1 # in openai baselines, train_freq = 4 10 | gradient_clip: 10.0 # dueling: 10.0 11 | n_step: 3 12 | w_n_step: 1.0 13 | w_q_reg: 0.0 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.000001 17 | max_epsilon: 1.0 18 | min_epsilon: 0.01 # openai baselines: 0.01 19 | epsilon_decay: 0.00002 # openai baselines: 1e-7 / 1e-1 20 | # R2D1 21 | sequence_size: 32 22 | overlap_size: 16 23 | 24 | learner_cfg: 25 | type: "R2D1Learner" 26 | loss_type: 27 | type: "R2D1C51Loss" 28 | backbone: 29 | gru: 30 | rnn_hidden_size: 64 31 | burn_in_step: 16 32 | head: 33 | type: "C51DuelingMLP" 34 | configs: 35 | hidden_sizes: [128, 64] 36 | v_min: -300 37 | v_max: 300 38 | atom_size: 51 39 | output_activation: "identity" 40 | use_noisy_net: False 41 | optim_cfg: 42 | lr_dqn: 0.0001 43 | weight_decay: 0.0000001 44 | adam_eps: 0.00000001 45 | -------------------------------------------------------------------------------- /configs/lunarlander_v2/distillation_dqn.yaml: -------------------------------------------------------------------------------- 1 | type: "DistillationDQNAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 100000 # openai baselines: 10000 6 | batch_size: 64 # openai baselines: 32 7 | update_starts_from: 10000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 1 # in openai baselines, train_freq = 4 10 | gradient_clip: 10.0 # dueling: 10.0 11 | n_step: 3 12 | w_n_step: 1.0 13 | w_q_reg: 0.01 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.000001 17 | max_epsilon: 1.0 18 | min_epsilon: 0.01 # openai baselines: 0.01 19 | epsilon_decay: 0.00001 # openai baselines: 1e-7 / 1e-1 20 | # Distillation 21 | dataset_path: [] 22 | save_dir: "data/" 23 | epochs: 20 24 | n_frame_from_last: 50000 25 | is_student: False 26 | 27 | learner_cfg: 28 | type: "DQNLearner" 29 | loss_type: 30 | type: "C51Loss" 31 | backbone: 32 | head: 33 | type: "C51DuelingMLP" 34 | configs: 35 | hidden_sizes: [128, 64] 36 | v_min: -300 37 | v_max: 300 38 | atom_size: 1530 39 | output_activation: "identity" 40 | use_noisy_net: False 41 | optim_cfg: 42 | lr_dqn: 0.0001 43 | weight_decay: 0.0000001 44 | 
adam_eps: 0.00000001 45 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | with open("./requirements.txt", "r") as f: 7 | required = f.read().splitlines() 8 | 9 | version_file = "rl_algorithms/version" 10 | 11 | 12 | def get_version(): 13 | version = open(version_file, "r", encoding="utf-8").read().strip() 14 | return version 15 | 16 | 17 | setup( 18 | name="rl_algorithms", 19 | version=get_version(), 20 | author="medipixel", 21 | author_email="kh.kim@medipixel.io", 22 | description="Reinforcement Learning algorithms which are being used for research \ 23 | activities at Medipixel.", 24 | long_description=long_description, 25 | long_description_content_type="text/markdown", 26 | url="https://github.com/medipixel/rl_algorithms.git", 27 | keywords="reinforcement-learning python machine learning", 28 | packages=find_packages(), 29 | classifiers=[ 30 | "Programming Language :: Python :: 3", 31 | "Programming Language :: Python :: 3.6", 32 | "License :: OSI Approved :: MIT License", 33 | "Operating System :: OS Independent", 34 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 35 | ], 36 | python_requires=">=3.6", 37 | install_requires=required, 38 | include_package_data=True, 39 | zip_safe=False, 40 | ) 41 | -------------------------------------------------------------------------------- /configs/lunarlander_v2/dqfd.yaml: -------------------------------------------------------------------------------- 1 | type: "DQfDAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 100000 # openai baselines: 10000 6 | batch_size: 64 # openai baselines: 32 7 | update_starts_from: 10000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 8 # in openai baselines, train_freq = 4 10 | gradient_clip: 0.5 # dueling: 10.0 11 | n_step: 3 12 | w_n_step: 1.0 13 | w_q_reg: 0.0000001 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.001 17 | # fD 18 | per_eps_demo: 1.0 19 | lambda1: 1.0 # N-step return weight 20 | lambda2: 1.0 # Supervised loss weight 21 | # lambda3 = weight_decay (l2 regularization weight) 22 | margin: 0.8 23 | pretrain_step: 100 24 | max_epsilon: 1.0 25 | min_epsilon: 0.0 # openai baselines: 0.01 26 | epsilon_decay: 0.00002 # openai baselines: 1e-7 / 1e-1 27 | demo_path: "data/lunarlander_discrete_demo.pkl" 28 | 29 | learner_cfg: 30 | type: "DQfDLearner" 31 | loss_type: 32 | type: "C51Loss" 33 | backbone: 34 | head: 35 | type: "C51DuelingMLP" 36 | configs: 37 | hidden_sizes: [128, 64] 38 | v_min: -300 39 | v_max: 300 40 | atom_size: 1530 41 | output_activation: "identity" 42 | use_noisy_net: False 43 | optim_cfg: 44 | lr_dqn: 0.0001 45 | weight_decay: 0.00001 46 | adam_eps: 0.00000001 47 | -------------------------------------------------------------------------------- /configs/reacher_v2/bc_sac.yaml: -------------------------------------------------------------------------------- 1 | type: "BCSACAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 1000000 6 | batch_size: 512 7 | initial_random_action: 10000 8 | multiple_update: 1 # multiple learning updates 9 | policy_update_freq: 2 10 | w_entropy: 0.001 11 | w_mean_reg: 0.001 12 | w_std_reg: 0.001 13 | w_pre_activation_reg: 0.0 14 | auto_entropy_tuning: True 
15 | # BC 16 | demo_batch_size: 64 17 | lambda1: 0.001 18 | demo_path: "data/reacher_demo.pkl" 19 | # HER 20 | use_her: True 21 | her: 22 | type: ReacherHER 23 | success_score: -5.0 24 | desired_states_from_demo: False 25 | 26 | learner_cfg: 27 | type: "BCSACLearner" 28 | backbone: 29 | actor: 30 | critic_vf: 31 | critic_qf: 32 | head: 33 | actor: 34 | type: "TanhGaussianDistParams" 35 | configs: 36 | hidden_sizes: [256, 256] 37 | output_activation: "identity" 38 | critic_vf: 39 | type: "MLP" 40 | configs: 41 | hidden_sizes: [256, 256] 42 | output_size: 1 43 | output_activation: "identity" 44 | critic_qf: 45 | type: "MLP" 46 | configs: 47 | hidden_sizes: [256, 256] 48 | output_size: 1 49 | output_activation: "identity" 50 | optim_cfg: 51 | lr_actor: 0.0003 52 | lr_vf: 0.0003 53 | lr_qf1: 0.0003 54 | lr_qf2: 0.0003 55 | lr_entropy: 0.0003 56 | weight_decay: 0.0 57 | -------------------------------------------------------------------------------- /rl_algorithms/common/abstract/buffer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Abstract Buffer & BufferWrapper class. 3 | 4 | - Author: Euijin Jeong 5 | - Contact: euijin.jeong@medipixel.io 6 | """ 7 | 8 | from abc import ABC, abstractmethod 9 | from typing import Any, Tuple 10 | 11 | import numpy as np 12 | 13 | 14 | class BaseBuffer(ABC): 15 | """Abstract Buffer used for replay buffer.""" 16 | 17 | @abstractmethod 18 | def add(self, transition: Tuple[Any, ...]) -> Tuple[Any, ...]: 19 | pass 20 | 21 | @abstractmethod 22 | def sample(self) -> Tuple[np.ndarray, ...]: 23 | pass 24 | 25 | @abstractmethod 26 | def __len__(self) -> int: 27 | pass 28 | 29 | 30 | class BufferWrapper(BaseBuffer): 31 | """Abstract BufferWrapper used for buffer wrapper. 32 | 33 | Attributes: 34 | buffer (Buffer): Hold replay buffer as am attribute 35 | """ 36 | 37 | def __init__(self, base_buffer: BaseBuffer): 38 | """Initialize a ReplayBuffer object. 
39 | 40 | Args: 41 | base_buffer (int): ReplayBuffer which should be hold 42 | """ 43 | self.buffer = base_buffer 44 | 45 | def add(self, transition: Tuple[Any, ...]) -> Tuple[Any, ...]: 46 | return self.buffer.add(transition) 47 | 48 | def sample(self) -> Tuple[np.ndarray, ...]: 49 | return self.buffer.sample() 50 | 51 | def __len__(self) -> int: 52 | """Return the current size of internal memory.""" 53 | return len(self.buffer) 54 | -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/bc_sac.yaml: -------------------------------------------------------------------------------- 1 | type: "BCSACAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 1000000 6 | batch_size: 512 7 | initial_random_action: 10000 8 | multiple_update: 1 # multiple learning updates 9 | policy_update_freq: 2 10 | w_entropy: 0.001 11 | w_mean_reg: 0.001 12 | w_std_reg: 0.001 13 | w_pre_activation_reg: 0.0 14 | auto_entropy_tuning: True 15 | # BC 16 | demo_batch_size: 64 17 | lambda1: 0.001 18 | demo_path: "data/lunarlander_continuous_demo.pkl" 19 | # HER 20 | use_her: False 21 | her: 22 | type: LunarLanderContinuousHER 23 | success_score: 250.0 24 | desired_states_from_demo: True 25 | 26 | learner_cfg: 27 | type: "BCSACLearner" 28 | backbone: 29 | actor: 30 | critic_vf: 31 | critic_qf: 32 | shared_actor_critic: 33 | head: 34 | actor: 35 | type: "TanhGaussianDistParams" 36 | configs: 37 | hidden_sizes: [256, 256] 38 | output_activation: "identity" 39 | fixed_logstd: False 40 | critic_vf: 41 | type: "MLP" 42 | configs: 43 | hidden_sizes: [256, 256] 44 | output_size: 1 45 | output_activation: "identity" 46 | critic_qf: 47 | type: "MLP" 48 | configs: 49 | hidden_sizes: [256, 256] 50 | output_size: 1 51 | output_activation: "identity" 52 | optim_cfg: 53 | lr_actor: 0.0003 54 | lr_vf: 0.0003 55 | lr_qf1: 0.0003 56 | lr_qf2: 0.0003 57 | lr_entropy: 0.0003 58 | weight_decay: 0.0 59 | -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/gail_ppo.yaml: -------------------------------------------------------------------------------- 1 | type: "GAILPPOAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.95 5 | batch_size: 128 6 | max_epsilon: 0.2 7 | min_epsilon: 0.2 8 | epsilon_decay_period: 1500 9 | w_value: 1.0 10 | w_entropy: 0.001 11 | gradient_clip_ac: 0.5 12 | gradient_clip_cr: 1.0 13 | epoch: 10 14 | rollout_len: 1024 15 | n_workers: 4 16 | use_clipped_value_loss: False 17 | standardize_advantage: True 18 | gail_reward_weight: 1.0 19 | demo_path: "data/lunarlander_continuous_demo.pkl" 20 | 21 | learner_cfg: 22 | type: "GAILPPOLearner" 23 | backbone: 24 | actor: 25 | critic: 26 | discriminator: 27 | shared_actor_critic: 28 | head: 29 | actor: 30 | type: "GaussianDist" 31 | configs: 32 | hidden_sizes: [256, 256] 33 | output_activation: "identity" 34 | fixed_logstd: True 35 | critic: 36 | type: "MLP" 37 | configs: 38 | hidden_sizes: [256, 256] 39 | output_size: 1 40 | output_activation: "identity" 41 | discriminator: 42 | type: "MLP" 43 | configs: 44 | hidden_sizes: [256, 256] 45 | output_size: 1 46 | output_activation: "identity" 47 | aciton_embedder: 48 | type: "MLP" 49 | configs: 50 | hidden_sizes: [] 51 | output_size: 16 52 | output_activation: "identity" 53 | 54 | optim_cfg: 55 | lr_actor: 0.0003 56 | lr_critic: 0.001 57 | lr_discriminator: 0.0003 58 | weight_decay: 0.0 59 | discriminator_acc_threshold : 0.8 
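
Editor's note: the GAIL PPO config above relies on the imitation reward defined in rl_algorithms/gail/utils.py earlier in this listing, reward = -log(sigmoid(score) + 1e-8). The quick numeric check below copies that function verbatim; the example scores are invented for illustration, and which sign the trained discriminator assigns to expert-like data depends on how it is trained, which is not shown here. The config's gail_reward_weight presumably scales this term against the environment reward.

# Numeric check of the GAIL reward from rl_algorithms/gail/utils.py (shown above).
import torch

def compute_gail_reward(discriminator_score: torch.Tensor):
    return -torch.log(torch.sigmoid(discriminator_score) + 1e-8).detach().cpu().numpy()[0]

for score in (-5.0, 0.0, 5.0):
    r = compute_gail_reward(torch.tensor([score]))
    print(f"score={score:+.1f} -> imitation reward={r:.3f}")
# score=-5.0 -> ~5.007, score=0.0 -> ~0.693, score=+5.0 -> ~0.007
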
-------------------------------------------------------------------------------- /configs/pong_no_frameskip_v4/r2d1.yaml: -------------------------------------------------------------------------------- 1 | type: "R2D1Agent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 4000 # openai baselines: 10000 6 | batch_size: 32 # openai baselines: 32 7 | update_starts_from: 4000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 4 # in openai baselines, train_freq = 4 10 | gradient_clip: 10.0 # dueling: 10.0 11 | n_step: 5 12 | w_n_step: 1.0 13 | w_q_reg: 0.0 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.000001 17 | max_epsilon: 1.0 18 | min_epsilon: 0.01 # openai baselines: 0.01 19 | epsilon_decay: 0.000003 # openai baselines: 1e-7 / 1e-1 20 | # Grad_cam 21 | grad_cam_layer_list: 22 | - "backbone.cnn.cnn_0.cnn" 23 | - "backbone.cnn.cnn_1.cnn" 24 | - "backbone.cnn.cnn_2.cnn" 25 | # R2D1 26 | sequence_size: 20 27 | overlap_size: 10 28 | 29 | learner_cfg: 30 | type: "R2D1Learner" 31 | loss_type: 32 | type: "R2D1DQNLoss" 33 | backbone: 34 | type: "CNN" 35 | configs: 36 | input_sizes: [4, 32, 64] 37 | output_sizes: [32, 64, 64] 38 | kernel_sizes: [8, 4, 3] 39 | strides: [4, 2, 1] 40 | paddings: [1, 0, 0] 41 | gru: 42 | rnn_hidden_size: 512 43 | burn_in_step: 10 44 | head: 45 | type: "DuelingMLP" 46 | configs: 47 | hidden_sizes: [512] 48 | output_activation: "identity" 49 | # NoisyNet 50 | use_noisy_net: False 51 | optim_cfg: 52 | lr_dqn: 0.0001 53 | weight_decay: 0.0 54 | adam_eps: 0.00000001 55 | -------------------------------------------------------------------------------- /configs/lunarlander_continuous_v2/sacfd.yaml: -------------------------------------------------------------------------------- 1 | type: "SACfDAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.001 5 | buffer_size: 100000 6 | batch_size: 64 7 | initial_random_action: 5000 8 | multiple_update: 2 # multiple learning updates 9 | policy_update_freq: 2 10 | w_entropy: 0.001 11 | w_mean_reg: 0.001 12 | w_std_reg: 0.001 13 | w_pre_activation_reg: 0.0 14 | auto_entropy_tuning: True 15 | # fD 16 | per_alpha: 0.6 17 | per_beta: 0.4 18 | per_eps: 0.000001 19 | per_eps_demo: 1.0 20 | n_step: 3 21 | pretrain_step: 100 22 | lambda1: 1.0 # N-step return weight 23 | # lambda2 = weight_decay 24 | lambda3: 1.0 # actor loss contribution of prior weight 25 | demo_path: "data/lunarlander_continuous_demo.pkl" 26 | 27 | learner_cfg: 28 | type: "SACfDLearner" 29 | backbone: 30 | actor: 31 | critic_vf: 32 | critic_qf: 33 | shared_actor_critic: 34 | head: 35 | actor: 36 | type: "TanhGaussianDistParams" 37 | configs: 38 | hidden_sizes: [256, 256] 39 | output_activation: "identity" 40 | fixed_logstd: False 41 | critic_vf: 42 | type: "MLP" 43 | configs: 44 | hidden_sizes: [256, 256] 45 | output_size: 1 46 | output_activation: "identity" 47 | critic_qf: 48 | type: "MLP" 49 | configs: 50 | hidden_sizes: [256, 256] 51 | output_size: 1 52 | output_activation: "identity" 53 | optim_cfg: 54 | lr_actor: 0.0003 55 | lr_vf: 0.0003 56 | lr_qf1: 0.0003 57 | lr_qf2: 0.0003 58 | lr_entropy: 0.0003 59 | weight_decay: 0.00001 60 | -------------------------------------------------------------------------------- /configs/pong_no_frameskip_v4/dqn.yaml: -------------------------------------------------------------------------------- 1 | type: "DQNAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 10000 # openai baselines: 10000 6 | batch_size: 32 # openai 
baselines: 32 7 | update_starts_from: 10000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 4 # in openai baselines, train_freq = 4 10 | gradient_clip: 10.0 # dueling: 10.0 11 | n_step: 3 12 | w_n_step: 1.0 13 | w_q_reg: 0.0 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.000001 17 | max_epsilon: 0.0 18 | min_epsilon: 0.0 # openai baselines: 0.01 19 | epsilon_decay: 0.000001 # openai baselines: 1e-7 / 1e-1 20 | # Grad_cam 21 | grad_cam_layer_list: 22 | - "backbone.cnn.cnn_0.cnn" 23 | - "backbone.cnn.cnn_1.cnn" 24 | - "backbone.cnn.cnn_2.cnn" 25 | 26 | learner_cfg: 27 | type: "DQNLearner" 28 | loss_type: 29 | type: "IQNLoss" 30 | backbone: 31 | type: "CNN" 32 | configs: 33 | input_sizes: [4, 32, 64] 34 | output_sizes: [32, 64, 64] 35 | kernel_sizes: [8, 4, 3] 36 | strides: [4, 2, 1] 37 | paddings: [1, 0, 0] 38 | head: 39 | type: "IQNMLP" 40 | configs: 41 | hidden_sizes: [512] 42 | n_tau_samples: 64 43 | n_tau_prime_samples: 64 44 | n_quantile_samples: 32 45 | quantile_embedding_dim: 64 46 | kappa: 1.0 47 | output_activation: "identity" 48 | # NoisyNet 49 | use_noisy_net: True 50 | std_init: 0.5 51 | optim_cfg: 52 | lr_dqn: 0.0001 53 | weight_decay: 0.0 54 | adam_eps: 0.00000001 55 | -------------------------------------------------------------------------------- /tests/buffer/test_uniform_buffer.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | from scipy.stats import chisquare 5 | 6 | from rl_algorithms.common.buffer.replay_buffer import ReplayBuffer 7 | 8 | 9 | def generate_transition(idx: int) -> Tuple[np.ndarray, ...]: 10 | """Make dummy transition for testing buffer.""" 11 | obs = np.array([0]) 12 | act = np.array([0]) 13 | reward = idx 14 | next_obs = np.array([0]) 15 | done = False 16 | return (obs, act, reward, next_obs, done) 17 | 18 | 19 | def generate_sample_idx(buffer: ReplayBuffer) -> int: 20 | """Generate indices to test whether sampled uniformly or not.""" 21 | for i in range(buffer.max_len): 22 | buffer.add(generate_transition(i)) 23 | _, _, idx, _, _ = buffer.sample() 24 | return idx 25 | 26 | 27 | def check_uniform(lst: List) -> bool: 28 | """Check the distribution is Uniform Distribution.""" 29 | res = chisquare(lst) 30 | return res[1] >= 0.05 31 | 32 | 33 | def test_uniform_sample(buffer_length=32, batch_size=8): 34 | """Test whether transitions are uniformly sampled from replay buffer.""" 35 | 36 | n_repeat = 10000 37 | 38 | buffer = ReplayBuffer(max_len=buffer_length, batch_size=batch_size) 39 | 40 | sampled_lst = [0] * buffer.max_len 41 | # sampling index for the n_repeat times 42 | for _ in range(n_repeat): 43 | indices = generate_sample_idx(buffer) 44 | for idx in indices: 45 | sampled_lst[int(idx)] += 1 / n_repeat 46 | 47 | assert check_uniform(sampled_lst), "This distribution is not uniform." 48 | 49 | 50 | if __name__ == "__main__": 51 | test_uniform_sample() 52 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Utility functions for PPO. 3 | 4 | This module has PPO util functions. 
5 | 6 | - Author: Curt Park 7 | - Contact: curt.park@medipixel.io 8 | - Paper: https://arxiv.org/abs/1707.06347 9 | """ 10 | 11 | from collections import deque 12 | from typing import List 13 | 14 | import numpy as np 15 | import torch 16 | 17 | 18 | def compute_gae( 19 | next_value: list, 20 | rewards: list, 21 | masks: list, 22 | values: list, 23 | gamma: float = 0.99, 24 | tau: float = 0.95, 25 | ) -> List: 26 | """Compute gae.""" 27 | values = values + [next_value] 28 | gae = 0 29 | returns: deque = deque() 30 | 31 | for step in reversed(range(len(rewards))): 32 | delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step] 33 | gae = delta + gamma * tau * masks[step] * gae 34 | returns.appendleft(gae + values[step]) 35 | 36 | return list(returns) 37 | 38 | 39 | def ppo_iter( 40 | epoch: int, 41 | mini_batch_size: int, 42 | states: torch.Tensor, 43 | actions: torch.Tensor, 44 | values: torch.Tensor, 45 | log_probs: torch.Tensor, 46 | returns: torch.Tensor, 47 | advantages: torch.Tensor, 48 | ): 49 | """Yield mini-batches.""" 50 | batch_size = states.size(0) 51 | for ep in range(epoch): 52 | for _ in range(batch_size // mini_batch_size): 53 | rand_ids = np.random.choice(batch_size, mini_batch_size) 54 | yield states[rand_ids, :], actions[rand_ids, :], values[ 55 | rand_ids, : 56 | ], log_probs[rand_ids, :], returns[rand_ids, :], advantages[rand_ids, :], ep 57 | -------------------------------------------------------------------------------- /rl_algorithms/common/env/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Util functions for env. 3 | 4 | - Author: Curt Park 5 | - Contact: curt.park@medipixel.io 6 | """ 7 | 8 | from typing import Callable, List, Tuple 9 | 10 | import gym 11 | from gym.spaces import Discrete 12 | 13 | from rl_algorithms.common.env.multiprocessing_env import SubprocVecEnv 14 | from rl_algorithms.common.env.normalizers import ActionNormalizer 15 | 16 | 17 | def set_env( 18 | env: gym.Env, max_episode_steps: int, env_wrappers: List[gym.Wrapper] = None 19 | ) -> Tuple[gym.Env, int]: 20 | """Set environment according to user's config.""" 21 | if max_episode_steps > 0: 22 | env._max_episode_steps = max_episode_steps 23 | else: 24 | max_episode_steps = env._max_episode_steps 25 | 26 | if not isinstance(env.action_space, Discrete): 27 | env = ActionNormalizer(env) 28 | 29 | if env_wrappers: 30 | for env_wrapper in env_wrappers: 31 | env = env_wrapper(env) 32 | 33 | return env, max_episode_steps 34 | 35 | 36 | def env_generator( 37 | env_name: str, max_episode_steps: int, env_wrappers: List[gym.Wrapper] = None 38 | ) -> Callable: 39 | """Return env creating function (with normalizers).""" 40 | 41 | def _thunk(rank: int): 42 | env = gym.make(env_name) 43 | env.seed(777 + rank + 1) 44 | env, _ = set_env(env, max_episode_steps, env_wrappers) 45 | return env 46 | 47 | return _thunk 48 | 49 | 50 | def make_envs(env_gen: Callable, n_envs: int = 8) -> SubprocVecEnv: 51 | """Make multiple environments running on multiprocssors.""" 52 | envs = [env_gen(i) for i in range(n_envs)] 53 | subproc_env = SubprocVecEnv(envs) 54 | return subproc_env 55 | -------------------------------------------------------------------------------- /configs/pong_no_frameskip_v4/dqn_resnet.yaml: -------------------------------------------------------------------------------- 1 | type: "DQNAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 10000 # openai baselines: 10000 6 | batch_size: 16 # 
openai baselines: 32 7 | update_starts_from: 10000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 4 # in openai baselines, train_freq = 4 10 | gradient_clip: 10.0 # dueling: 10.0 11 | n_step: 3 12 | w_n_step: 1.0 13 | w_q_reg: 0.0 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.000001 17 | max_epsilon: 0.0 18 | min_epsilon: 0.0 # openai baselines: 0.01 19 | epsilon_decay: 0.000001 # openai baselines: 1e-7 / 1e-1 20 | # Grad_cam 21 | grad_cam_layer_list: 22 | - "backbone.layer1.0.conv2" 23 | - "backbone.layer2.0.shortcut.0" 24 | - "backbone.layer3.0.shortcut.0" 25 | - "backbone.layer4.0.shortcut.0" 26 | - "backbone.conv_out" 27 | 28 | learner_cfg: 29 | type: "DQNLearner" 30 | loss_type: 31 | type: "IQNLoss" 32 | backbone: 33 | type: "ResNet" 34 | configs: 35 | use_bottleneck: False 36 | num_blocks: [1, 1, 1, 1] 37 | block_output_sizes: [32, 32, 64, 64] 38 | block_strides: [1, 2, 2, 2] 39 | first_input_size: 4 40 | first_output_size: 32 41 | expansion: 1 42 | channel_compression: 4 # compression ratio 43 | head: 44 | type: "IQNMLP" 45 | configs: 46 | hidden_sizes: [512] 47 | n_tau_samples: 64 48 | n_tau_prime_samples: 64 49 | n_quantile_samples: 32 50 | quantile_embedding_dim: 64 51 | kappa: 1.0 52 | output_activation: "identity" 53 | # NoisyNet 54 | use_noisy_net: True 55 | std_init: 0.5 56 | optim_cfg: 57 | lr_dqn: 0.0001 58 | weight_decay: 0.0 59 | adam_eps: 0.00000001 60 | -------------------------------------------------------------------------------- /tests/test_helper_funcion.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import random 3 | from typing import Deque 4 | 5 | import numpy as np 6 | 7 | from rl_algorithms.common.helper_functions import get_n_step_info 8 | 9 | 10 | def generate_dummy_buffer(maxlen: int, index: int) -> Deque: 11 | """Generate dummy n_step buffer.""" 12 | assert index <= maxlen 13 | n_step_buffer = deque(maxlen=maxlen) 14 | for i in range(maxlen): 15 | done = i == index 16 | transition = (np.array([i]), np.array([0]), i, np.array([i + 1]), done) 17 | n_step_buffer.append(transition) 18 | return n_step_buffer 19 | 20 | 21 | def check_case1(maxlen: int): 22 | """Test when the transition is terminal state.""" 23 | done_index = 0 24 | n_step_buffer = generate_dummy_buffer(maxlen, done_index) 25 | reward, next_state, _ = get_n_step_info(n_step_buffer, gamma=1) 26 | assert reward == done_index 27 | assert next_state == done_index + 1 28 | 29 | 30 | def check_case2(maxlen: int): 31 | """Test when there are no terminal within n_step.""" 32 | done_index = maxlen 33 | n_step_buffer = generate_dummy_buffer(maxlen, done_index) 34 | reward, next_state, _ = get_n_step_info(n_step_buffer, gamma=1) 35 | assert reward * 2 == maxlen * (maxlen - 1) 36 | assert next_state == maxlen 37 | 38 | 39 | def check_case3(maxlen: int): 40 | """Test when the terminal states exist within n_step.""" 41 | done_index = random.randint(1, maxlen - 1) 42 | n_step_buffer = generate_dummy_buffer(maxlen, done_index) 43 | reward, next_state, _ = get_n_step_info(n_step_buffer, gamma=1) 44 | assert reward * 2 == done_index * (done_index + 1) 45 | assert next_state == done_index + 1 46 | 47 | 48 | def test_get_n_step_info(maxlen=10): 49 | check_case1(maxlen) 50 | check_case2(maxlen) 51 | check_case3(maxlen) 52 | 53 | 54 | if __name__ == "__main__": 55 | test_get_n_step_info(maxlen=10) 56 | 
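The three test cases above pin down the behavior of `get_n_step_info`: starting from the oldest transition, rewards are accumulated with discount `gamma` until the first terminal transition, and that transition's `next_state`/`done` are returned. The actual helper lives in `rl_algorithms/common/helper_functions.py`, which is not shown here, so the following is only an illustrative re-implementation consistent with the assertions above, not the library source:

```python
from typing import Deque, Tuple

import numpy as np


def n_step_info_sketch(n_step_buffer: Deque, gamma: float) -> Tuple[float, np.ndarray, bool]:
    """Accumulate discounted rewards until the first terminal transition and
    return that transition's (n_step_reward, next_state, done)."""
    reward, next_state, done = 0.0, None, False
    for i, (_, _, rew, n_state, dn) in enumerate(n_step_buffer):
        reward += (gamma ** i) * rew
        next_state, done = n_state, dn
        if done:
            break
    return reward, next_state, done
```

With `gamma=1` this reproduces the arithmetic-series sums asserted in `check_case1`–`check_case3` above.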
-------------------------------------------------------------------------------- /rl_algorithms/common/buffer/gail_buffer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Demo buffer for GAIL algorithm.""" 3 | 4 | import pickle 5 | from typing import List, Tuple 6 | 7 | import numpy as np 8 | import torch 9 | 10 | from rl_algorithms.common.abstract.buffer import BaseBuffer 11 | 12 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 13 | 14 | 15 | class GAILBuffer(BaseBuffer): 16 | """Buffer to store expert states and actions. 17 | 18 | Attributes: 19 | obs_buf (np.ndarray): observations 20 | acts_buf (np.ndarray): actions 21 | """ 22 | 23 | def __init__(self, dataset_path: str): 24 | """Initialize a Buffer. 25 | 26 | Args: 27 | dataset_path (str): path of the demo dataset 28 | """ 29 | 30 | self.obs_buf: np.ndarray = None 31 | self.acts_buf: np.ndarray = None 32 | 33 | self.load_demo(dataset_path) 34 | 35 | def load_demo(self, dataset_path: str): 36 | """load demo data.""" 37 | with open(dataset_path, "rb") as f: 38 | demo = list(pickle.load(f)) 39 | demo = np.array(demo) 40 | self.obs_buf = np.array(list(map(np.array, demo[:, 0]))) 41 | self.acts_buf = np.array(list(map(np.array, demo[:, 1]))) 42 | 43 | def add(self): 44 | pass 45 | 46 | def sample(self, batch_size, indices: List[int] = None) -> Tuple[np.ndarray, ...]: 47 | """Randomly sample a batch of experiences from memory.""" 48 | assert 0 < batch_size < len(self) 49 | 50 | if indices is None: 51 | indices = np.random.choice(len(self), size=batch_size) 52 | 53 | states = self.obs_buf[indices] 54 | actions = self.acts_buf[indices] 55 | 56 | return torch.Tensor(states).to(device), torch.Tensor(actions).to(device) 57 | 58 | def __len__(self) -> int: 59 | """Return the current size of internal memory.""" 60 | return len(self.obs_buf) 61 | -------------------------------------------------------------------------------- /tests/integration/test_run_apex.py: -------------------------------------------------------------------------------- 1 | """Test only one step of run file for training.""" 2 | 3 | import os 4 | import os.path as osp 5 | import re 6 | import shutil 7 | import subprocess 8 | 9 | 10 | def check_run_apex(config_root: str, run_file: str): 11 | """Test that 1 episode of run file works well.""" 12 | test_dir = osp.dirname(osp.abspath(__file__)) 13 | pkg_root_dir = osp.dirname(osp.dirname(test_dir)) 14 | os.chdir(pkg_root_dir) 15 | 16 | # loop of configs 17 | configs = os.listdir(config_root) 18 | for cfg in configs: 19 | # except such as __init__, __pycache__ 20 | if "__" in cfg or "apex" not in cfg: 21 | continue 22 | 23 | cmd = ( 24 | f"python {run_file} --cfg-path {config_root}{cfg} --integration-test " 25 | + f"--off-render --seed 12345 --interim-test-num 1" 26 | ) 27 | 28 | p = subprocess.Popen( 29 | cmd, 30 | stdout=subprocess.PIPE, 31 | stderr=subprocess.STDOUT, 32 | universal_newlines=True, 33 | shell=True, 34 | ) 35 | output, _ = p.communicate() 36 | print(str(output)) 37 | assert p.returncode == 0 38 | 39 | # Find saved checkpoint path 40 | pattern = r"./checkpoint/.+/" 41 | save_path = re.findall(pattern, str(output))[0] 42 | print(save_path) 43 | 44 | check_save_path(save_path) 45 | 46 | 47 | def check_save_path(save_path: str): 48 | """Check checkpoint that tested run file makes and remove the checkpoint.""" 49 | assert os.path.exists(save_path) 50 | 51 | # Remove checkpoint dir 52 | shutil.rmtree(save_path) 53 | 54 | 55 | 
def test_run_pong_no_frame_skip(): 56 | """Test all agents that train PongNoFrameskip-v4 env.""" 57 | check_run_apex("configs/pong_no_frameskip_v4/", "run_pong_no_frameskip_v4.py") 58 | 59 | 60 | if __name__ == "__main__": 61 | test_run_pong_no_frame_skip() 62 | -------------------------------------------------------------------------------- /configs/pong_no_frameskip_v4/apex_dqn.yaml: -------------------------------------------------------------------------------- 1 | type: "ApeX" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 100000 # openai baselines: 10000 6 | batch_size: 512 # openai baselines: 32 7 | update_starts_from: 30000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 1 # in openai baselines, train_freq = 4 10 | gradient_clip: 10.0 # dueling: 10.0 11 | n_step: 5 12 | w_n_step: 1.0 13 | w_q_reg: 0.0 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.000001 17 | max_epsilon: 1.0 18 | min_epsilon: 0.1 # openai baselines: 0.01 19 | epsilon_decay: 0.0000005 # openai baselines: 1e-7 / 1e-1 20 | # Grad_cam 21 | grad_cam_layer_list: 22 | - "backbone.cnn.cnn_0.cnn" 23 | - "backbone.cnn.cnn_1.cnn" 24 | - "backbone.cnn.cnn_2.cnn" 25 | # ApeX 26 | num_workers: 2 27 | local_buffer_max_size: 1000 28 | worker_update_interval: 50 29 | logger_interval: 1000 30 | max_update_step: 100000 31 | is_worker_log: True 32 | is_worker_render: False 33 | 34 | learner_cfg: 35 | type: "DQNLearner" 36 | loss_type: 37 | type: "DQNLoss" 38 | backbone: 39 | type: "CNN" 40 | configs: 41 | input_sizes: [4, 32, 64] 42 | output_sizes: [32, 64, 64] 43 | kernel_sizes: [8, 4, 3] 44 | strides: [4, 2, 1] 45 | paddings: [1, 0, 0] 46 | head: 47 | type: "DuelingMLP" 48 | configs: 49 | hidden_sizes: [512] 50 | output_activation: "identity" 51 | # NoisyNet 52 | use_noisy_net: False 53 | optim_cfg: 54 | lr_dqn: 0.0003 55 | weight_decay: 0.0 56 | adam_eps: 0.00000001 57 | 58 | worker_cfg: 59 | type: "DQNWorker" 60 | device: "cpu" 61 | 62 | logger_cfg: 63 | type: "DQNLogger" 64 | 65 | comm_cfg: 66 | learner_buffer_port: 6554 67 | learner_worker_port: 6555 68 | worker_buffer_port: 6556 69 | learner_logger_port: 6557 70 | send_batch_port: 6558 71 | priorities_port: 6559 72 | -------------------------------------------------------------------------------- /configs/pong_no_frameskip_v4/distillation_dqn.yaml: -------------------------------------------------------------------------------- 1 | type: "DistillationDQNAgent" 2 | hyper_params: 3 | gamma: 0.99 4 | tau: 0.005 5 | buffer_size: 10000 # openai baselines: 10000 6 | batch_size: 32 # openai baselines: 32 7 | update_starts_from: 10000 # openai baselines: 10000 8 | multiple_update: 1 # multiple learning updates 9 | train_freq: 4 # in openai baselines, train_freq = 4 10 | gradient_clip: 10.0 # dueling: 10.0 11 | n_step: 3 12 | w_n_step: 1.0 13 | w_q_reg: 0.0 14 | per_alpha: 0.6 # openai baselines: 0.6 15 | per_beta: 0.4 16 | per_eps: 0.000001 17 | max_epsilon: 0.0 18 | min_epsilon: 0.0 # openai baselines: 0.01 19 | epsilon_decay: 0.000001 # openai baselines: 1e-7 / 1e-1 20 | # Grad_cam 21 | grad_cam_layer_list: 22 | - "backbone.cnn.cnn_0.cnn" 23 | - "backbone.cnn.cnn_1.cnn" 24 | - "backbone.cnn.cnn_2.cnn" 25 | # Distillation 26 | dataset_path: 27 | - "data/distillation_buffer/PongNoFrameskip-v4/20200821134905" 28 | - "data/distillation_buffer/PongNoFrameskip-v4/20200821142921" 29 | - "data/distillation_buffer/PongNoFrameskip-v4/20200821145228" 30 | save_dir: "data/" 31 | epochs: 20 32 | 
n_frame_from_last: 50000 33 | is_student: False 34 | 35 | learner_cfg: 36 | type: "DQNLearner" 37 | loss_type: 38 | type: "IQNLoss" 39 | backbone: 40 | type: "CNN" 41 | configs: 42 | input_sizes: [4, 32, 64] 43 | output_sizes: [32, 64, 64] 44 | kernel_sizes: [8, 4, 3] 45 | strides: [4, 2, 1] 46 | paddings: [1, 0, 0] 47 | head: 48 | type: "IQNMLP" 49 | configs: 50 | hidden_sizes: [512] 51 | n_tau_samples: 64 52 | n_tau_prime_samples: 64 53 | n_quantile_samples: 32 54 | quantile_embedding_dim: 64 55 | kappa: 1.0 56 | output_activation: "identity" 57 | # NoisyNet 58 | use_noisy_net: True 59 | std_init: 0.5 60 | optim_cfg: 61 | lr_dqn: 0.0001 62 | weight_decay: 0.0 63 | adam_eps: 0.00000001 64 | -------------------------------------------------------------------------------- /rl_algorithms/registry.py: -------------------------------------------------------------------------------- 1 | from rl_algorithms.utils import Registry, build_from_cfg, build_ray_obj_from_cfg 2 | from rl_algorithms.utils.config import ConfigDict 3 | 4 | AGENTS = Registry("agents") 5 | LEARNERS = Registry("learners") 6 | BACKBONES = Registry("backbones") 7 | HEADS = Registry("heads") 8 | LOSSES = Registry("losses") 9 | HERS = Registry("hers") 10 | WORKERS = Registry("workers") 11 | LOGGERS = Registry("loggers") 12 | 13 | 14 | def build_agent(cfg: ConfigDict, build_args: dict = None): 15 | """Build agent using config and additional arguments.""" 16 | return build_from_cfg(cfg, AGENTS, build_args) 17 | 18 | 19 | def build_learner(cfg: ConfigDict, build_args: dict = None): 20 | """Build learner using config and additional arguments.""" 21 | return build_from_cfg(cfg, LEARNERS, build_args) 22 | 23 | 24 | def build_backbone(cfg: ConfigDict, build_args: dict = None): 25 | """Build backbone using config and additional arguments.""" 26 | return build_from_cfg(cfg, BACKBONES, build_args) 27 | 28 | 29 | def build_head(cfg: ConfigDict, build_args: dict = None): 30 | """Build head using config and additional arguments.""" 31 | return build_from_cfg(cfg, HEADS, build_args) 32 | 33 | 34 | def build_loss(cfg: ConfigDict, build_args: dict = None): 35 | """Build loss using config and additional arguments.""" 36 | return build_from_cfg(cfg, LOSSES, build_args) 37 | 38 | 39 | def build_her(cfg: ConfigDict, build_args: dict = None): 40 | """Build her using config and additional arguments.""" 41 | return build_from_cfg(cfg, HERS, build_args) 42 | 43 | 44 | def build_worker(cfg: ConfigDict, build_args: dict = None): 45 | """Build ray worker using config and additional arguments.""" 46 | # return build_ray_obj_from_cfg(cfg, WORKERS, build_args) 47 | return build_from_cfg(cfg, WORKERS, build_args) 48 | 49 | 50 | def build_logger(cfg: ConfigDict, build_args: dict = None): 51 | """Build ray worker using config and additional arguments.""" 52 | return build_ray_obj_from_cfg(cfg, LOGGERS, build_args) 53 | -------------------------------------------------------------------------------- /rl_algorithms/dqn/distributed_logger.py: -------------------------------------------------------------------------------- 1 | """DQN Logger for distributed training. 
2 | 3 | - Author: Chris Yoon 4 | - Contact: chris.yoon@medipixel.io 5 | """ 6 | 7 | import numpy as np 8 | import torch 9 | import wandb 10 | 11 | from rl_algorithms.common.abstract.distributed_logger import DistributedLogger 12 | from rl_algorithms.registry import LOGGERS 13 | 14 | 15 | @LOGGERS.register_module 16 | class DQNLogger(DistributedLogger): 17 | """DQN Logger for distributed training.""" 18 | 19 | def load_params(self, path: str): 20 | """Load model and optimizer parameters.""" 21 | # Logger only runs on cpu 22 | DistributedLogger.load_params(self, path) 23 | 24 | params = torch.load(path, map_location="cpu") 25 | self.brain.load_state_dict(params["dqn_state_dict"]) 26 | print("[INFO] loaded the model and optimizer from", path) 27 | 28 | def select_action(self, state: np.ndarray): 29 | """Select action to be executed at given state.""" 30 | with torch.no_grad(): 31 | state = self._preprocess_state(state, self.device) 32 | selected_action = self.brain(state).argmax() 33 | selected_action = selected_action.cpu().numpy() 34 | 35 | return selected_action 36 | 37 | def write_log(self, log_value: dict): 38 | """Write log about loss and score.""" 39 | print( 40 | "[INFO] update_step %d, average score: %f, " 41 | "loss: %f, avg q-value: %f" 42 | % ( 43 | log_value["update_step"], 44 | log_value["avg_score"], 45 | log_value["step_info"][0], 46 | log_value["step_info"][1], 47 | ) 48 | ) 49 | 50 | if self.is_log: 51 | wandb.log( 52 | { 53 | "test score": log_value["avg_score"], 54 | "dqn loss": log_value["step_info"][0], 55 | "avg q values": log_value["step_info"][1], 56 | }, 57 | step=log_value["update_step"], 58 | ) 59 | -------------------------------------------------------------------------------- /rl_algorithms/common/noise.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Noise classes for algorithms.""" 3 | 4 | import copy 5 | import random 6 | 7 | import numpy as np 8 | 9 | 10 | class GaussianNoise: 11 | """Gaussian Noise. 12 | 13 | Taken from https://github.com/vitchyr/rlkit 14 | """ 15 | 16 | def __init__( 17 | self, 18 | action_dim: int, 19 | min_sigma: float = 1.0, 20 | max_sigma: float = 1.0, 21 | decay_period: int = 1000000, 22 | ): 23 | """Initialize.""" 24 | self.action_dim = action_dim 25 | self.max_sigma = max_sigma 26 | self.min_sigma = min_sigma 27 | self.decay_period = decay_period 28 | 29 | def sample(self, t: int = 0) -> float: 30 | """Get an action with gaussian noise.""" 31 | sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min( 32 | 1.0, t / self.decay_period 33 | ) 34 | return np.random.normal(0, sigma, size=self.action_dim) 35 | 36 | 37 | class OUNoise: 38 | """Ornstein-Uhlenbeck process. 
39 | 40 | Taken from Udacity deep-reinforcement-learning github repository: 41 | https://github.com/udacity/deep-reinforcement-learning/blob/master/ 42 | ddpg-pendulum/ddpg_agent.py 43 | """ 44 | 45 | def __init__( 46 | self, size: int, mu: float = 0.0, theta: float = 0.15, sigma: float = 0.2 47 | ): 48 | """Initialize parameters and noise process.""" 49 | self.state = np.float64(0.0) 50 | self.mu = mu * np.ones(size) 51 | self.theta = theta 52 | self.sigma = sigma 53 | self.reset() 54 | 55 | def reset(self): 56 | """Reset the internal state (= noise) to mean (mu).""" 57 | self.state = copy.copy(self.mu) 58 | 59 | def sample(self) -> float: 60 | """Update internal state and return it as a noise sample.""" 61 | x = self.state 62 | dx = self.theta * (self.mu - x) + self.sigma * np.array( 63 | [random.random() for _ in range(len(x))] 64 | ) 65 | self.state = x + dx 66 | return self.state 67 | -------------------------------------------------------------------------------- /rl_algorithms/distillation/README.md: -------------------------------------------------------------------------------- 1 | # Using policy distillation 2 | 3 | 4 | We implemented three features for training policy distillation. 5 | 6 | ## 1. Student training using trained agent's data (expert data) 7 | 8 | You can generate a trained agent's data (expert data) by iterating test episodes. 9 | 10 | ``` 11 | python run_env_name.py --cfg-path --load-from --test 12 | ``` 13 | The collected states will be stored in the directory `data/distillation_buffer/`. 14 | 15 | 16 | Once the expert data is generated, put its path in the `dataset_path` list of the distillation config file and change `is_student` to `True` in the config file. Then execute the training as shown below: 17 | 18 | ``` 19 | python run_env_name.py --cfg-path 20 | ``` 21 | 22 | You can set the epoch count and batch size of student learning through the `epochs` and `batch_size` variables in the distillation config file. 23 | 24 | ## 2. Student training using training-phase states and trained agent 25 | 26 | This method trains the student using states generated while training the agent (which we call the train-phase data). 27 | 28 | Training with the distillation config file will automatically generate the train-phase data. 29 | ``` 30 | python run_env_name.py --cfg-path 31 | ``` 32 | 33 | The generated data will be stored in the directory `data/distillation_buffer/`. 34 | 35 | 36 | Since train-phase data doesn't contain Q-values, you should load a trained agent to generate Q-values for it. After putting the path of the train-phase data in the `dataset_path` list and changing `is_student` to `True` in the distillation config file, you can execute the training as shown below: 37 | ``` 38 | python run_env_name.py --cfg-path --load-from 39 | ``` 40 | 41 | ## 3. Test student agent 42 | If you only want to check the performance of the student agent, use the original agent config file instead of the distillation config file. In the Pong environment, for instance, you can use the `dqn.yaml` config file instead of `distillation_dqn.yaml`. Using the distillation config will also work, but it will generate expert data while you run the test.
43 | ``` 44 | python run_env_name.py --test --load-from --cfg-path 45 | ``` 46 | -------------------------------------------------------------------------------- /rl_algorithms/common/networks/backbones/cnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """CNN modules for RL algorithms. 3 | 4 | - Authors: Kyunghwan Kim, Curt Park 5 | - Contacts: kh.kim@medipixel.io 6 | curt.park@medipixel.io 7 | """ 8 | 9 | from typing import Callable 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | from rl_algorithms.common.helper_functions import identity 16 | from rl_algorithms.registry import BACKBONES 17 | from rl_algorithms.utils.config import ConfigDict 18 | 19 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 20 | 21 | 22 | # TODO: Remove it when upgrade torch>=1.7 23 | # pylint: disable=abstract-method 24 | class CNNLayer(nn.Module): 25 | def __init__( 26 | self, 27 | input_size: int, 28 | output_size: int, 29 | kernel_size: int, 30 | stride: int = 1, 31 | padding: int = 0, 32 | pre_activation_fn: Callable = identity, 33 | activation_fn: Callable = F.relu, 34 | post_activation_fn: Callable = identity, 35 | ): 36 | super(CNNLayer, self).__init__() 37 | 38 | self.cnn = nn.Conv2d( 39 | input_size, 40 | output_size, 41 | kernel_size=kernel_size, 42 | stride=stride, 43 | padding=padding, 44 | ) 45 | 46 | self.pre_activation_fn = pre_activation_fn 47 | self.activation_fn = activation_fn 48 | self.post_activation_fn = post_activation_fn 49 | 50 | def forward(self, x): 51 | x = self.cnn(x) 52 | x = self.pre_activation_fn(x) 53 | x = self.activation_fn(x) 54 | x = self.post_activation_fn(x) 55 | 56 | return x 57 | 58 | 59 | # TODO: Remove it when upgrade torch>=1.7 60 | # pylint: disable=abstract-method 61 | @BACKBONES.register_module 62 | class CNN(nn.Module): 63 | """Baseline of Convolution neural network.""" 64 | 65 | def __init__(self, configs: ConfigDict): 66 | super(CNN, self).__init__() 67 | 68 | cnn_layers = list(map(CNNLayer, *configs.values())) 69 | self.cnn = nn.Sequential() 70 | for i, cnn_layer in enumerate(cnn_layers): 71 | self.cnn.add_module("cnn_{}".format(i), cnn_layer) 72 | 73 | def forward(self, x: torch.Tensor) -> torch.Tensor: 74 | """Forward method implementation.""" 75 | if len(x.size()) == 3: 76 | x = x.unsqueeze(0) 77 | x = self.cnn(x) 78 | x = x.view(x.size(0), -1) 79 | return x 80 | -------------------------------------------------------------------------------- /tests/buffer/test_distillation_buffer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import random 4 | import shutil 5 | 6 | import numpy as np 7 | import pytest 8 | 9 | from rl_algorithms.common.buffer.distillation_buffer import DistillationBuffer 10 | 11 | FOLDER_PATH_LIST = [ 12 | "data/distillation_buffer/test/expert_data/", 13 | "data/distillation_buffer/test/expert_data2/", 14 | "data/distillation_buffer/test/trainphase_data/", 15 | ] 16 | 17 | 18 | def gen_test_data(num_files: int): 19 | """Generate dummy data.""" 20 | for _dir in FOLDER_PATH_LIST: 21 | os.makedirs(_dir, exist_ok=True) 22 | 23 | for i, _dir in enumerate(FOLDER_PATH_LIST): 24 | for j in range(num_files): 25 | state = np.random.randint(0, 255, size=(3, 3, 2), dtype=np.uint8) 26 | action = np.zeros(3) 27 | action[random.randint(0, len(action) - 1)] = 1 28 | action = action.astype(np.int) 29 | if "trainphase" in _dir: 30 | with 
open(f"{FOLDER_PATH_LIST[i]}{j:07}.pkl", "wb") as f: 31 | pickle.dump([state], f) 32 | else: 33 | with open(f"{FOLDER_PATH_LIST[i]}{j:07}.pkl", "wb") as f: 34 | pickle.dump([state, action], f) 35 | 36 | 37 | def check_multiple_data_load(num_files: int): 38 | """Check if DistillationBuffer can load data from multiple path.""" 39 | batch_size = num_files * len(FOLDER_PATH_LIST[:-1]) 40 | memory = DistillationBuffer(batch_size, FOLDER_PATH_LIST[:-1]) 41 | memory.reset_dataloader() 42 | state, _ = memory.sample_for_diltillation() 43 | assert state.shape[0] == batch_size 44 | 45 | 46 | def check_mixture_data_assert(num_files: int): 47 | """Check if DistillationBuffer can check whether trainphase & expert data is mixed.""" 48 | memory = DistillationBuffer(num_files, FOLDER_PATH_LIST) 49 | with pytest.raises(AssertionError, match=r"mixture"): 50 | memory.reset_dataloader() 51 | 52 | 53 | def delete_path(path: str): 54 | """Delete directory.""" 55 | shutil.rmtree(path) 56 | 57 | 58 | def test_distillation_buffer(): 59 | """Test DistillationBuffer.""" 60 | try: 61 | num_file = 2 62 | gen_test_data(num_file) 63 | check_multiple_data_load(num_file) 64 | check_mixture_data_assert(num_file) 65 | 66 | except AssertionError as e: 67 | raise e 68 | 69 | finally: 70 | delete_path("data/distillation_buffer/test") 71 | 72 | 73 | if __name__ == "__main__": 74 | test_distillation_buffer() 75 | -------------------------------------------------------------------------------- /tests/integration/test_run_agent.py: -------------------------------------------------------------------------------- 1 | """Test only one step of run file for training.""" 2 | 3 | import os 4 | import os.path as osp 5 | import re 6 | import shutil 7 | import subprocess 8 | 9 | 10 | def check_run_env(config_root: str, run_file: str): 11 | """Test that 1 episode of run file works well.""" 12 | test_dir = osp.dirname(osp.abspath(__file__)) 13 | pkg_root_dir = osp.dirname(osp.dirname(test_dir)) 14 | os.chdir(pkg_root_dir) 15 | 16 | # loop of configs 17 | configs = os.listdir(config_root) 18 | for cfg in configs: 19 | # except such as __init__, __pycache__ 20 | if "__" in cfg or "apex" in cfg or "distillation" in cfg: 21 | continue 22 | 23 | cmd = ( 24 | f"python {run_file} --cfg-path {config_root}{cfg} --integration-test " 25 | + f"--off-render --episode-num 1 --max-episode-step 1 --seed 12345 " 26 | + f"--interim-test-num 1" 27 | ) 28 | 29 | p = subprocess.Popen( 30 | cmd, 31 | stdout=subprocess.PIPE, 32 | stderr=subprocess.STDOUT, 33 | universal_newlines=True, 34 | shell=True, 35 | ) 36 | output, _ = p.communicate() 37 | print(str(output)) 38 | assert p.returncode == 0, "Subprocess doesn't finished successfully." 
39 | 40 | # Find saved checkpoint path 41 | pattern = r"./checkpoint/.+/" 42 | save_path = re.findall(pattern, str(output))[0] 43 | print(save_path) 44 | 45 | check_save_path(save_path) 46 | 47 | 48 | def check_save_path(save_path: str): 49 | """Check checkpoint that tested run file makes and remove the checkpoint.""" 50 | assert os.path.exists(save_path) 51 | 52 | # Remove checkpoint dir 53 | shutil.rmtree(save_path) 54 | 55 | 56 | def test_run_lunarlander_continuous(): 57 | """Test all agents that train LunarLanderContinuous-v2 env.""" 58 | check_run_env( 59 | "configs/lunarlander_continuous_v2/", "run_lunarlander_continuous_v2.py" 60 | ) 61 | 62 | 63 | def test_run_lunarlander(): 64 | """Test all agents that train LunarLander-v2 env.""" 65 | check_run_env("configs/lunarlander_v2/", "run_lunarlander_v2.py") 66 | 67 | 68 | def test_run_pong_no_frame_skip(): 69 | """Test all agents that train PongNoFrameskip-v4 env.""" 70 | check_run_env("configs/pong_no_frameskip_v4/", "run_pong_no_frameskip_v4.py") 71 | 72 | 73 | if __name__ == "__main__": 74 | test_run_lunarlander_continuous() 75 | test_run_lunarlander() 76 | test_run_pong_no_frame_skip() 77 | -------------------------------------------------------------------------------- /rl_algorithms/utils/config.py: -------------------------------------------------------------------------------- 1 | import collections.abc as collections_abc 2 | import os.path as osp 3 | 4 | from addict import Dict 5 | import yaml 6 | 7 | 8 | class ConfigDict(Dict): 9 | def __missing__(self, name): 10 | raise KeyError(name) 11 | 12 | def __getattr__(self, name): 13 | try: 14 | value = super(ConfigDict, self).__getattr__(name) 15 | except KeyError: 16 | ex = AttributeError( 17 | "'{}' object has no attribute '{}'".format( 18 | self.__class__.__name__, name 19 | ) 20 | ) 21 | except Exception as e: 22 | ex = e 23 | else: 24 | return value 25 | raise ex 26 | 27 | def __setitem__(self, name, value): 28 | if isinstance(value, dict): 29 | value = ConfigDict(value) 30 | 31 | super(ConfigDict, self).__setitem__(name, value) 32 | 33 | 34 | def add_args(parser, cfg, prefix=""): 35 | for k, v in cfg.items(): 36 | if isinstance(v, str): 37 | parser.add_argument("--" + prefix + k) 38 | elif isinstance(v, int): 39 | parser.add_argument("--" + prefix + k, type=int) 40 | elif isinstance(v, float): 41 | parser.add_argument("--" + prefix + k, type=float) 42 | elif isinstance(v, bool): 43 | parser.add_argument("--" + prefix + k, action="store_true") 44 | elif isinstance(v, dict): 45 | add_args(parser, v, k + ".") 46 | elif isinstance(v, collections_abc.Iterable): 47 | parser.add_argument("--" + prefix + k, type=type(v[0]), nargs="+") 48 | else: 49 | print("connot parse key {} of type {}".format(prefix + k, type(v))) 50 | return parser 51 | 52 | 53 | class YamlConfig: 54 | """Manager of ConfigDict from yaml.""" 55 | 56 | def __init__(self, config_paths: dict): 57 | """Make ConfigDict from yaml path.""" 58 | self.cfg = ConfigDict() 59 | for key, path in config_paths.items(): 60 | self.cfg[key] = self._yaml_to_config_dict(path) 61 | 62 | @staticmethod 63 | def _yaml_to_config_dict(path: str) -> ConfigDict: 64 | """Return ConfigDict from yaml.""" 65 | try: 66 | with open(path) as f: 67 | data = yaml.load(f, Loader=yaml.FullLoader) 68 | except FileNotFoundError: 69 | with open(osp.expanduser(path)) as f: 70 | data = yaml.load(f, Loader=yaml.FullLoader) 71 | return ConfigDict(data) 72 | 73 | def get_config_dict(self): 74 | return self.cfg 75 | 
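`ConfigDict` above is an `addict.Dict` subclass that converts nested dicts on assignment and raises on missing keys instead of silently creating empty entries, which is what makes the dotted access used throughout the configs and tests safe. A small self-contained sketch of that behavior (the values are made up for illustration only):

```python
from rl_algorithms.utils.config import ConfigDict

cfg = ConfigDict(dict(hyper_params=dict(gamma=0.99, batch_size=32)))

assert cfg.hyper_params.gamma == 0.99  # nested dicts become ConfigDicts
cfg.hyper_params.batch_size = 64       # attribute-style assignment works

try:
    _ = cfg.hyper_params.lr            # a missing key raises AttributeError
except AttributeError:                 # instead of returning an empty Dict
    pass
```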
-------------------------------------------------------------------------------- /rl_algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | from .a2c.agent import A2CAgent 2 | from .a2c.learner import A2CLearner 3 | from .acer.agent import ACERAgent 4 | from .acer.learner import ACERLearner 5 | from .bc.ddpg_agent import BCDDPGAgent 6 | from .bc.ddpg_learner import BCDDPGLearner 7 | from .bc.her import LunarLanderContinuousHER, ReacherHER 8 | from .bc.sac_agent import BCSACAgent 9 | from .bc.sac_learner import BCSACLearner 10 | from .common.apex.architecture import ApeX 11 | from .common.networks.backbones import CNN, ResNet 12 | from .ddpg.agent import DDPGAgent 13 | from .ddpg.learner import DDPGLearner 14 | from .distillation.dqn_agent import DistillationDQNAgent 15 | from .dqn.agent import DQNAgent 16 | from .dqn.distributed_logger import DQNLogger 17 | from .dqn.distributed_worker import DQNWorker 18 | from .dqn.learner import DQNLearner 19 | from .dqn.losses import C51Loss, DQNLoss, IQNLoss 20 | from .fd.ddpg_agent import DDPGfDAgent 21 | from .fd.ddpg_learner import DDPGfDLearner 22 | from .fd.dqn_agent import DQfDAgent 23 | from .fd.dqn_learner import DQfDLearner 24 | from .fd.sac_agent import SACfDAgent 25 | from .fd.sac_learner import SACfDLearner 26 | from .gail.agent import GAILPPOAgent 27 | from .gail.learner import GAILPPOLearner 28 | from .ppo.agent import PPOAgent 29 | from .ppo.learner import PPOLearner 30 | from .recurrent.dqn_agent import R2D1Agent 31 | from .recurrent.learner import R2D1Learner 32 | from .recurrent.losses import R2D1C51Loss, R2D1DQNLoss, R2D1IQNLoss 33 | from .registry import build_agent, build_her 34 | from .sac.agent import SACAgent 35 | from .sac.learner import SACLearner 36 | from .td3.agent import TD3Agent 37 | from .td3.learner import TD3Learner 38 | 39 | __all__ = [ 40 | "A2CAgent", 41 | "BCDDPGAgent", 42 | "BCSACAgent", 43 | "DDPGAgent", 44 | "DQNAgent", 45 | "DDPGfDAgent", 46 | "DQfDAgent", 47 | "R2D1Agent", 48 | "SACfDAgent", 49 | "PPOAgent", 50 | "SACAgent", 51 | "TD3Agent", 52 | "GAILPPOAgent", 53 | "A2CLearner", 54 | "BCDDPGLearner", 55 | "BCSACLearner", 56 | "DDPGLearner", 57 | "DQNLearner", 58 | "DDPGfDLearner", 59 | "DQfDLearner", 60 | "SACfDLearner", 61 | "PPOLearner", 62 | "SACLearner", 63 | "TD3Learner", 64 | "GAILPPOLearner", 65 | "R2D1Learner", 66 | "LunarLanderContinuousHER", 67 | "ReacherHER", 68 | "build_agent", 69 | "build_her", 70 | "CNN", 71 | "ResNet", 72 | "IQNLoss", 73 | "C51Loss", 74 | "DQNLoss", 75 | "R2D1IQNLoss", 76 | "R2D1C51Loss", 77 | "R2D1DQNLoss", 78 | "ApeX", 79 | "DQNWorker", 80 | "DQNLogger", 81 | "ACERLearner", 82 | "ACERAgent", 83 | "DistillationDQNAgent", 84 | ] 85 | -------------------------------------------------------------------------------- /rl_algorithms/gail/networks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Tuple, Union 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | from rl_algorithms.common.helper_functions import identity 9 | from rl_algorithms.registry import build_backbone, build_head 10 | from rl_algorithms.utils.config import ConfigDict 11 | 12 | 13 | # TODO: Remove it when upgrade torch>=1.7 14 | # pylint: disable=abstract-method 15 | class Discriminator(nn.Module): 16 | """Discriminator to classify experience data and expert data""" 17 | 18 | def __init__( 19 | self, 20 | backbone_cfg: ConfigDict, 21 | head_cfg: ConfigDict, 22 | 
action_embedder_cfg: ConfigDict, 23 | shared_backbone: nn.Module = None, 24 | ): 25 | nn.Module.__init__(self) 26 | if shared_backbone is not None: 27 | self.backbone = shared_backbone 28 | head_cfg.configs.input_size = self.calculate_fc_input_size( 29 | head_cfg.configs.state_size 30 | ) 31 | elif not backbone_cfg: 32 | self.backbone = identity 33 | head_cfg.configs.input_size = head_cfg.configs.state_size[0] 34 | else: 35 | self.backbone = build_backbone(backbone_cfg) 36 | head_cfg.configs.input_size = self.calculate_fc_input_size( 37 | head_cfg.configs.state_size 38 | ) 39 | 40 | self.action_embedder = None 41 | if action_embedder_cfg: 42 | action_embedder_cfg.configs.input_size = head_cfg.configs.action_size 43 | self.action_embedder = build_head(action_embedder_cfg) 44 | head_cfg.configs.input_size += action_embedder_cfg.configs.output_size 45 | else: 46 | head_cfg.configs.input_size += head_cfg.configs.action_size 47 | 48 | self.head = build_head(head_cfg) 49 | 50 | def forward( 51 | self, state_action: Tuple[torch.Tensor, torch.Tensor] 52 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]: 53 | """Forward method implementation. Use in get_action method in agent.""" 54 | state_feature = self.backbone(state_action[0]) 55 | action_feature = state_action[1] 56 | if self.action_embedder: 57 | action_feature = self.forward_action_embedder(action_feature) 58 | return self.head(torch.cat([state_feature, action_feature], dim=-1)) 59 | 60 | def forward_action_embedder( 61 | self, x: torch.Tensor 62 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]: 63 | """Forward method of action embedder.""" 64 | return self.action_embedder(x) 65 | 66 | def calculate_fc_input_size(self, state_dim: tuple): 67 | """Calculate fc input size according to the shape of cnn.""" 68 | x = torch.zeros(state_dim).unsqueeze(0) 69 | output = self.backbone(x).detach().view(-1) 70 | return output.shape[0] 71 | -------------------------------------------------------------------------------- /tests/buffer/test_prioritized_buffer.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | from scipy.stats import ks_2samp 5 | 6 | from rl_algorithms.common.buffer.replay_buffer import ReplayBuffer 7 | from rl_algorithms.common.buffer.wrapper import PrioritizedBufferWrapper 8 | 9 | 10 | def generate_prioritized_buffer( 11 | buffer_length: int, batch_size: int, idx_lst=None, prior_lst=None 12 | ) -> Tuple[PrioritizedBufferWrapper, List]: 13 | """Generate Prioritized Replay Buffer with random Prior.""" 14 | buffer = ReplayBuffer(max_len=buffer_length, batch_size=batch_size) 15 | prioritized_buffer = PrioritizedBufferWrapper(buffer) 16 | priority = np.random.randint(10, size=buffer_length) 17 | for i, j in enumerate(priority): 18 | prioritized_buffer.sum_tree[i] = j 19 | if idx_lst: 20 | for i, j in list(zip(idx_lst, prior_lst)): 21 | priority[i] = j 22 | prioritized_buffer.sum_tree[i] = j 23 | 24 | prop_lst = [i / sum(priority) for i in priority] 25 | prioritized_buffer.buffer.length = buffer_length 26 | 27 | return prioritized_buffer, prop_lst 28 | 29 | 30 | def sample_dummy(prioritized_buffer: PrioritizedBufferWrapper, times: int) -> List: 31 | """Sample from prioritized buffer and Return indices.""" 32 | assert isinstance(prioritized_buffer, PrioritizedBufferWrapper) 33 | 34 | sampled_lst = [0] * prioritized_buffer.buffer.max_len 35 | for _ in range(times): 36 | indices = prioritized_buffer._sample_proportional( 37 | 
prioritized_buffer.buffer.batch_size 38 | ) 39 | for idx in indices: 40 | sampled_lst[idx] += 1 / (times * prioritized_buffer.buffer.batch_size) 41 | return sampled_lst 42 | 43 | 44 | def check_prioritized(prop_lst: List, sampled_lst: List) -> bool: 45 | """Check two input lists have same distribution by kstest. 46 | 47 | Reference: 48 | https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test 49 | """ 50 | res = ks_2samp(prop_lst, sampled_lst) 51 | return res[1] >= 0.05 52 | 53 | 54 | def test_prioritized(buffer_length=32, batch_size=4): 55 | """Test whether transitions are prioritized sampled from replay buffer.""" 56 | 57 | n_repeat = 1000 58 | idx_lst = [0, 1, 2, 3] 59 | prior_lst = [100, 10, 1, 1] 60 | 61 | # generate prioitized buffer, return buffer and its proportion 62 | buffer, prop = generate_prioritized_buffer( 63 | buffer_length, batch_size, idx_lst, prior_lst 64 | ) 65 | assert isinstance(buffer, PrioritizedBufferWrapper) 66 | sampled_lst = [0] * buffer.buffer.max_len 67 | # sample index from buffer 68 | for _ in range(n_repeat): 69 | indices = buffer._sample_proportional(buffer.buffer.batch_size) 70 | for idx in indices: 71 | sampled_lst[idx] += 1 / (n_repeat * buffer.buffer.batch_size) 72 | 73 | assert check_prioritized(prop, sampled_lst), "Two distributions are different." 74 | 75 | 76 | if __name__ == "__main__": 77 | test_prioritized() 78 | -------------------------------------------------------------------------------- /tests/integration/test_run_distillation_agent.py: -------------------------------------------------------------------------------- 1 | """Test only one step of distillation file for training.""" 2 | 3 | import os 4 | import pickle 5 | import re 6 | import shutil 7 | import subprocess 8 | 9 | 10 | def check_distillation_agent(config: str, run_file: str): 11 | """Test that 1 episode of run file works well.""" 12 | cmd = ( 13 | f"python {run_file} --cfg-path {config} --integration-test " 14 | + f"--episode-num 1 --interim-test 1 --off-render" 15 | ) 16 | 17 | p = subprocess.Popen( 18 | cmd, 19 | stdout=subprocess.PIPE, 20 | stderr=subprocess.STDOUT, 21 | universal_newlines=True, 22 | shell=True, 23 | ) 24 | output, _ = p.communicate() 25 | print(str(output)) 26 | assert p.returncode == 0 27 | 28 | # Find saved checkpoint path and data path. 29 | pattern = r"./checkpoint/.+/" 30 | data_pattern = r"data/.+/" 31 | checkpoint_path = re.findall(pattern, str(output))[0] 32 | full_data_path, n_frame_from_last_path = re.findall(data_pattern, str(output)) 33 | 34 | try: 35 | num_episode_step = re.findall(r"episode step: \d+", str(output))[0] 36 | num_episode_step = int(re.findall(r"\d+", num_episode_step)[0]) 37 | 38 | # Check if the number of data is same with iterated episode step. 39 | saved_data_list = os.listdir(full_data_path) 40 | assert ( 41 | len(saved_data_list) == num_episode_step 42 | ), "The number of data does not match the number of iterated episode steps." 43 | 44 | # Check if n_frame_from_last works well. 45 | n_frame_from_last_data_list = os.listdir(n_frame_from_last_path) 46 | assert 3 == len( 47 | n_frame_from_last_data_list 48 | ), f"n_frame_from_last doesn't work properly(expected num of data: 3, num of data: {len(n_frame_from_last_data_list)})." 49 | 50 | # Check if train-phase data only contaions state, not state & q value. 51 | with open(full_data_path + saved_data_list[0], "rb") as f: 52 | datum = pickle.load(f) 53 | assert ( 54 | len(datum) == 1 55 | ), "The length of the data is not appropriate(length must be 1, state only)." 
56 | 57 | except Exception as e: 58 | raise e 59 | 60 | finally: 61 | """Delete generated directories.""" 62 | delete_path(checkpoint_path) 63 | delete_path(full_data_path) 64 | delete_path(n_frame_from_last_path) 65 | 66 | 67 | def delete_path(path: str): 68 | """Delete directory.""" 69 | shutil.rmtree(path) 70 | 71 | 72 | # TODO: Add student training test code. 73 | def test_distillation(): 74 | """Test distillation agent.""" 75 | check_distillation_agent( 76 | "configs/pong_no_frameskip_v4/distillation_dqn.yaml", 77 | "run_pong_no_frameskip_v4.py", 78 | ) 79 | check_distillation_agent( 80 | "configs/lunarlander_v2/distillation_dqn.yaml", "run_lunarlander_v2.py" 81 | ) 82 | 83 | 84 | if __name__ == "__main__": 85 | test_distillation() 86 | -------------------------------------------------------------------------------- /tests/test_config_registry.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | import gym 5 | 6 | from rl_algorithms import build_agent 7 | from rl_algorithms.common.abstract.agent import Agent 8 | from rl_algorithms.utils import YamlConfig 9 | 10 | 11 | def parse_args(args: list): 12 | parser = argparse.ArgumentParser(description="Pytorch RL rl_algorithms") 13 | parser.add_argument( 14 | "--cfg-path", 15 | type=str, 16 | default="./configs/lunarlander_continuous_v2/ddpg.yaml", 17 | help="config path", 18 | ) 19 | parser.add_argument( 20 | "--test", dest="test", action="store_true", help="test mode (no training)" 21 | ) 22 | parser.add_argument( 23 | "--load-from", 24 | type=str, 25 | default=None, 26 | help="load the saved model and optimizer at the beginning", 27 | ) 28 | parser.add_argument( 29 | "--off-render", dest="render", action="store_false", help="turn off rendering" 30 | ) 31 | parser.add_argument( 32 | "--render-after", 33 | type=int, 34 | default=0, 35 | help="start rendering after the input number of episode", 36 | ) 37 | parser.add_argument( 38 | "--log", dest="log", action="store_true", help="turn on logging" 39 | ) 40 | parser.add_argument( 41 | "--save-period", type=int, default=100, help="save model period" 42 | ) 43 | parser.add_argument( 44 | "--episode-num", type=int, default=1500, help="total episode num" 45 | ) 46 | parser.add_argument( 47 | "--max-episode-steps", type=int, default=300, help="max episode step" 48 | ) 49 | parser.add_argument( 50 | "--interim-test-num", 51 | type=int, 52 | default=10, 53 | help="number of test during training", 54 | ) 55 | return parser.parse_args(args) 56 | 57 | 58 | def test_config_registry(): 59 | # configurations 60 | args = parse_args(["--test"]) 61 | 62 | # set env 63 | env = gym.make("LunarLanderContinuous-v2") 64 | 65 | # check start time 66 | NOWTIMES = datetime.datetime.now() 67 | curr_time = NOWTIMES.strftime("%y%m%d_%H%M%S") 68 | 69 | cfg = YamlConfig(dict(agent=args.cfg_path)).get_config_dict() 70 | env_info = dict( 71 | name=env.spec.id, 72 | observation_space=env.observation_space, 73 | action_space=env.action_space, 74 | is_atari=False, 75 | ) 76 | log_cfg = dict(agent=cfg.agent.type, curr_time=curr_time, cfg_path=args.cfg_path) 77 | build_args = dict( 78 | env=env, 79 | env_info=env_info, 80 | log_cfg=log_cfg, 81 | is_test=args.test, 82 | load_from=args.load_from, 83 | is_render=args.render, 84 | render_after=args.render_after, 85 | is_log=args.log, 86 | save_period=args.save_period, 87 | episode_num=args.episode_num, 88 | max_episode_steps=args.max_episode_steps, 89 | interim_test_num=args.interim_test_num, 90 | ) 91 | 
agent = build_agent(cfg.agent, build_args) 92 | assert isinstance(agent, Agent) 93 | 94 | 95 | if __name__ == "__main__": 96 | test_config_registry() 97 | -------------------------------------------------------------------------------- /.all-contributorsrc: -------------------------------------------------------------------------------- 1 | { 2 | "files": [ 3 | "README.md" 4 | ], 5 | "imageSize": 100, 6 | "commit": false, 7 | "contributors": [ 8 | { 9 | "login": "Curt-Park", 10 | "name": "Jinwoo Park (Curt)", 11 | "avatar_url": "https://avatars3.githubusercontent.com/u/14961526?v=4", 12 | "profile": "https://github.com/Curt-Park", 13 | "contributions": [ 14 | "code" 15 | ] 16 | }, 17 | { 18 | "login": "MrSyee", 19 | "name": "Kyunghwan Kim", 20 | "avatar_url": "https://avatars3.githubusercontent.com/u/17582508?v=4", 21 | "profile": "https://github.com/MrSyee", 22 | "contributions": [ 23 | "code" 24 | ] 25 | }, 26 | { 27 | "login": "darthegg", 28 | "name": "darthegg", 29 | "avatar_url": "https://avatars3.githubusercontent.com/u/16010242?v=4", 30 | "profile": "https://github.com/darthegg", 31 | "contributions": [ 32 | "code" 33 | ] 34 | }, 35 | { 36 | "login": "mclearning2", 37 | "name": "Mincheol Kim", 38 | "avatar_url": "https://avatars3.githubusercontent.com/u/43226417?v=4", 39 | "profile": "https://github.com/mclearning2", 40 | "contributions": [ 41 | "code" 42 | ] 43 | }, 44 | { 45 | "login": "minseop4898", 46 | "name": "김민섭", 47 | "avatar_url": "https://avatars1.githubusercontent.com/u/34338299?v=4", 48 | "profile": "https://github.com/minseop4898", 49 | "contributions": [ 50 | "code" 51 | ] 52 | }, 53 | { 54 | "login": "jinPrelude", 55 | "name": "Leejin Jung", 56 | "avatar_url": "https://avatars1.githubusercontent.com/u/16518993?v=4", 57 | "profile": "https://github.com/jinPrelude", 58 | "contributions": [ 59 | "code" 60 | ] 61 | }, 62 | { 63 | "login": "cyoon1729", 64 | "name": "Chris Yoon", 65 | "avatar_url": "https://avatars2.githubusercontent.com/u/33583101?v=4", 66 | "profile": "https://github.com/cyoon1729", 67 | "contributions": [ 68 | "code" 69 | ] 70 | }, 71 | { 72 | "login": "jiseongHAN", 73 | "name": "Jiseong Han", 74 | "avatar_url": "https://avatars2.githubusercontent.com/u/48741026?v=4", 75 | "profile": "https://jiseonghan.github.io/", 76 | "contributions": [ 77 | "code" 78 | ] 79 | }, 80 | { 81 | "login": "sehyun-hwang", 82 | "name": "Sehyun Hwang", 83 | "avatar_url": "https://avatars3.githubusercontent.com/u/23437715?v=4", 84 | "profile": "https://github.com/sehyun-hwang", 85 | "contributions": [ 86 | "maintenance" 87 | ] 88 | }, 89 | { 90 | "login": "isk03276", 91 | "name": "eunjin", 92 | "avatar_url": "https://avatars.githubusercontent.com/u/23740495?v=4", 93 | "profile": "https://github.com/isk03276", 94 | "contributions": [ 95 | "code" 96 | ] 97 | } 98 | ], 99 | "contributorsPerLine": 7, 100 | "projectName": "rl_algorithms", 101 | "projectOwner": "medipixel", 102 | "repoType": "github", 103 | "repoHost": "https://github.com", 104 | "skipCi": true 105 | } 106 | -------------------------------------------------------------------------------- /tests/test_cnn_cfg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from rl_algorithms.common.networks.backbones import CNN, ResNet 5 | from rl_algorithms.common.networks.brain import Brain 6 | from rl_algorithms.utils.config import ConfigDict 7 | 8 | cnn_cfg = ConfigDict( 9 | type="CNN", 10 | configs=dict( 11 | input_sizes=[3, 32, 32], 12 | 
output_sizes=[32, 32, 64], 13 | kernel_sizes=[5, 3, 3], 14 | strides=[4, 3, 2], 15 | paddings=[2, 0, 1], 16 | ), 17 | ) 18 | 19 | resnet_cfg = ConfigDict( 20 | type="ResNet", 21 | configs=dict( 22 | use_bottleneck=False, 23 | num_blocks=[1, 1, 1, 1], 24 | block_output_sizes=[32, 32, 64, 64], 25 | block_strides=[1, 2, 2, 2], 26 | first_input_size=3, 27 | first_output_size=32, 28 | expansion=4, 29 | channel_compression=4, 30 | ), 31 | ) 32 | 33 | head_cfg = ConfigDict( 34 | type="IQNMLP", 35 | configs=dict( 36 | hidden_sizes=[512], 37 | n_tau_samples=64, 38 | n_tau_prime_samples=64, 39 | n_quantile_samples=32, 40 | quantile_embedding_dim=64, 41 | kappa=1.0, 42 | output_activation="identity", 43 | # NoisyNet 44 | use_noisy_net=True, 45 | std_init=0.5, 46 | ), 47 | ) 48 | 49 | test_state_dim = (3, 256, 256) 50 | 51 | 52 | def test_brain(): 53 | """Test wheter brain make fc layer based on backbone's output size.""" 54 | 55 | head_cfg.configs.state_size = test_state_dim 56 | head_cfg.configs.output_size = 8 57 | 58 | model = Brain(resnet_cfg, head_cfg) 59 | assert model.head.input_size == 16384 60 | 61 | 62 | def test_cnn_with_config(): 63 | """Test whether CNN module can make proper model according to the configs given.""" 64 | conv_layer_size = [[1, 32, 64, 64], [1, 32, 21, 21], [1, 64, 11, 11]] 65 | test_cnn_model = CNN(configs=cnn_cfg.configs) 66 | conv_layers = [ 67 | module for module in test_cnn_model.modules() if isinstance(module, nn.Conv2d) 68 | ] 69 | x = torch.zeros(test_state_dim).unsqueeze(0) 70 | for i, layer in enumerate(conv_layers): 71 | layer_output = layer(x) 72 | x = layer_output 73 | assert list(x.shape) == conv_layer_size[i] 74 | 75 | 76 | def test_resnet_with_config(): 77 | """Test whether ResNet module can make proper model according to the configs given.""" 78 | conv_layer_size = [ 79 | [1, 32, 256, 256], 80 | [1, 32, 256, 256], 81 | [1, 128, 256, 256], 82 | [1, 128, 256, 256], 83 | [1, 32, 128, 128], 84 | [1, 128, 128, 128], 85 | [1, 128, 128, 128], 86 | [1, 64, 64, 64], 87 | [1, 256, 64, 64], 88 | [1, 256, 64, 64], 89 | [1, 64, 32, 32], 90 | [1, 256, 32, 32], 91 | [1, 256, 32, 32], 92 | [1, 16, 32, 32], 93 | ] 94 | test_resnet_model = ResNet(configs=resnet_cfg.configs) 95 | conv_layers = [ 96 | module 97 | for module in test_resnet_model.modules() 98 | if isinstance(module, nn.Conv2d) 99 | ] 100 | x = torch.zeros(test_state_dim).unsqueeze(0) 101 | skip_x = x 102 | for i, layer in enumerate(conv_layers): 103 | if i % 3 == 0: 104 | layer_output = layer(skip_x) 105 | skip_x = layer_output 106 | x = layer_output 107 | else: 108 | layer_output = layer(x) 109 | x = layer_output 110 | assert list(x.shape) == conv_layer_size[i] 111 | 112 | 113 | if __name__ == "__main__": 114 | test_brain() 115 | test_cnn_with_config() 116 | -------------------------------------------------------------------------------- /rl_algorithms/common/abstract/her.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Abstract class used for Hindsight Experience Replay. 3 | 4 | - Author: Kyunghwan Kim 5 | - Contact: kh.kim@medipixel.io 6 | - Paper: https://arxiv.org/pdf/1707.01495.pdf 7 | """ 8 | 9 | from abc import ABC, abstractmethod 10 | from typing import Callable, Tuple 11 | 12 | import numpy as np 13 | 14 | 15 | class HER(ABC): 16 | """Abstract class for HER (final strategy). 
17 | 18 | Attributes: 19 | reward_fn (Callable): returns reward from state, action, next_state 20 | 21 | """ 22 | 23 | def __init__(self, reward_fn: Callable[[tuple, np.ndarray], np.float64]): 24 | """Initialize. 25 | 26 | Args: 27 | reward_fn (Callable): returns reward from state, action, next_state 28 | 29 | """ 30 | self.reward_fn = reward_fn 31 | 32 | @abstractmethod 33 | def fetch_desired_states_from_demo(self, demo: list): 34 | pass 35 | 36 | @abstractmethod 37 | def get_desired_state(self, *args) -> np.ndarray: 38 | pass 39 | 40 | @abstractmethod 41 | def generate_demo_transitions(self, demo: list) -> list: 42 | pass 43 | 44 | @abstractmethod 45 | def _get_final_state(self, transition: tuple) -> np.ndarray: 46 | pass 47 | 48 | def _append_origin_transitions( 49 | self, origin_transitions: list, transition: tuple, desired_state: np.ndarray 50 | ): 51 | """Append original transitions adding goal state for training.""" 52 | origin_transitions.append(self._get_transition(transition, desired_state)) 53 | 54 | def _append_new_transitions( 55 | self, new_transitions: list, transition: tuple, final_state: np.ndarray 56 | ): 57 | """Append new transitions made by HER strategy (final) for training.""" 58 | new_transitions.append(self._get_transition(transition, final_state)) 59 | 60 | def _get_transition( 61 | self, transition: tuple, goal_state: np.ndarray 62 | ) -> Tuple[np.ndarray, np.ndarray, np.float64, np.ndarray, bool]: 63 | """Get a single transition concatenated with a goal state.""" 64 | state, action, _, next_state, done = transition 65 | 66 | done = np.array_equal(next_state, goal_state) 67 | reward = self.reward_fn(transition, goal_state) 68 | state = np.concatenate((state, goal_state), axis=-1) 69 | next_state = np.concatenate((next_state, goal_state), axis=-1) 70 | 71 | return state, action, reward, next_state, done 72 | 73 | def generate_transitions( 74 | self, 75 | transitions: list, 76 | desired_state: np.ndarray, 77 | success_score: float, 78 | is_demo: bool = False, 79 | ) -> list: 80 | """Generate new transitions concatenated with desired states.""" 81 | origin_transitions: list = list() 82 | new_transitions: list = list() 83 | final_state = self._get_final_state(transitions[-1]) 84 | score = np.sum(np.array(transitions), axis=0)[2] 85 | 86 | for transition in transitions: 87 | # process transitions with the initial goal state 88 | self._append_origin_transitions( 89 | origin_transitions, transition, desired_state 90 | ) 91 | 92 | # do not need to append new transitions if sum of reward is big enough 93 | if not is_demo and score <= success_score: 94 | self._append_new_transitions(new_transitions, transition, final_state) 95 | 96 | return origin_transitions + new_transitions 97 | 98 | def __str__(self): 99 | return self.__class__.__name__ 100 | -------------------------------------------------------------------------------- /rl_algorithms/common/saliency_map.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Functions for Saliency map. 
3 | 4 | - Author: Euijin Jeong 5 | - Contact: euijin.jeong@medipixel.io 6 | """ 7 | 8 | import os 9 | import pickle 10 | import shutil 11 | 12 | from PIL import Image 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | import torch 16 | 17 | plt.rcParams["figure.figsize"] = (10.0, 8.0) # set default size of plots 18 | plt.rcParams["image.interpolation"] = "nearest" 19 | plt.rcParams["image.cmap"] = "gray" 20 | 21 | 22 | def make_saliency_dir(date_time: str) -> str: 23 | """Make directories for saving saliency map result.""" 24 | save_dir = f"./data/saliency_map/{date_time}" 25 | if os.path.exists(save_dir): 26 | shutil.rmtree(save_dir) 27 | os.makedirs(save_dir) 28 | os.makedirs(f"./data/saliency_map/{date_time}/input_image") 29 | os.makedirs(f"./data/saliency_map/{date_time}/state") 30 | os.makedirs(f"./data/saliency_map/{date_time}/saliency") 31 | os.makedirs(f"./data/saliency_map/{date_time}/overlay") 32 | saliency_map_dir = f"./data/saliency_map/{date_time}/" 33 | return saliency_map_dir 34 | 35 | 36 | def compute_saliency_maps(X, y, model, device): 37 | """Compute a class saliency map using the model for images X and labels y.""" 38 | 39 | # Make input tensor require gradient 40 | if isinstance(X, list): # For R2D1 41 | input_list = [] 42 | for x in X: 43 | input_list.append(x.requires_grad_()) 44 | 45 | saliency = None 46 | X = input_list 47 | scores, _ = model(X[0], X[1], X[2], X[3]) 48 | scores = (scores.gather(1, y.unsqueeze(0))).squeeze(0) 49 | scores.backward(torch.FloatTensor([1.0]).to(device)) 50 | saliency, _ = torch.max(X[0].grad.data.abs(), dim=1) 51 | else: 52 | X.requires_grad_() 53 | 54 | saliency = None 55 | scores = model(X) 56 | scores = (scores.gather(1, y.unsqueeze(0))).squeeze(0) 57 | scores.backward(torch.FloatTensor([1.0]).to(device)) 58 | saliency, _ = torch.max(X.grad.data.abs(), dim=1) 59 | 60 | return saliency 61 | 62 | 63 | def save_saliency_maps(i, X, y, model, device, saliency_map_dir): 64 | """Make and save saliency maps in directory.""" 65 | 66 | # Convert X and y from numpy arrays to Torch Tensors 67 | if isinstance(X, tuple): # For R2D1 68 | input_image = X[0][-1] 69 | X_tensor = [] 70 | for x in X: 71 | if not isinstance(x, torch.Tensor): 72 | X_tensor.append(torch.Tensor(x).float().to(device).unsqueeze(0)) 73 | else: 74 | X_tensor.append(x) 75 | else: 76 | input_image = X[-1] 77 | X_tensor = torch.Tensor(X).float().to(device).unsqueeze(0) 78 | y = int(y) 79 | y_tensor = torch.LongTensor([y]).to(device) 80 | 81 | # Compute saliency maps for images in X 82 | saliency = compute_saliency_maps(X_tensor, y_tensor, model, device) 83 | 84 | # image 85 | saliency = saliency.cpu().numpy() 86 | saliency = np.flip(saliency, axis=1) 87 | input_image = np.rot90(input_image, 3) 88 | input_image = Image.fromarray(np.uint8(input_image * 255.0)) 89 | input_image.save(saliency_map_dir + "/input_image/{}.png".format(i)) 90 | 91 | # numpy array 92 | with open(saliency_map_dir + "/state/{}.pkl".format(i), "wb") as f: 93 | pickle.dump(X, f) 94 | 95 | cmap = plt.cm.hot 96 | norm = plt.Normalize(saliency.min(), saliency.max()) 97 | saliency = cmap(norm(saliency[0])) 98 | saliency = np.rot90(saliency, 3) 99 | saliency = Image.fromarray(np.uint8(saliency * 255.0)) 100 | saliency.save(saliency_map_dir + "/saliency/{}.png".format(i)) 101 | 102 | overlay = Image.blend(input_image.convert("RGBA"), saliency, alpha=0.5) 103 | overlay.save(saliency_map_dir + "/overlay/{}.png".format(i)) 104 | return saliency 105 | 
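A minimal usage sketch for the saliency helpers above, assuming a randomly initialized stand-in Q-network and a random 4x84x84 frame stack; the network architecture, state shape, and action count below are illustrative assumptions, not code from this repository:

import datetime

import numpy as np
import torch
import torch.nn as nn

from rl_algorithms.common.saliency_map import make_saliency_dir, save_saliency_maps

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Stand-in Q-network: 4 stacked 84x84 frames -> 6 action values (assumed shapes).
model = nn.Sequential(
    nn.Conv2d(4, 8, kernel_size=8, stride=4),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(8 * 20 * 20, 6),
).to(device)

state = np.random.rand(4, 84, 84).astype(np.float32)  # fake preprocessed state

# Pick the greedy action; its score is what the saliency map is computed against.
with torch.no_grad():
    q_values = model(torch.from_numpy(state).unsqueeze(0).to(device))
action = int(q_values.argmax(dim=-1).item())

# Writes input_image/, state/, saliency/ and overlay/ outputs for step 0.
saliency_map_dir = make_saliency_dir(datetime.datetime.now().strftime("%y%m%d_%H%M%S"))
save_saliency_maps(0, state, action, model, device, saliency_map_dir)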
-------------------------------------------------------------------------------- /rl_algorithms/acer/buffer.py: -------------------------------------------------------------------------------- 1 | # TODO : Move to common buffer 2 | import random 3 | from typing import Tuple 4 | 5 | import numpy as np 6 | import torch 7 | 8 | from rl_algorithms.common.abstract.buffer import BaseBuffer 9 | 10 | 11 | class ReplayMemory(BaseBuffer): 12 | """ReplayMemory for ACER. 13 | 14 | Attributes: 15 | obs_buf (np.ndarray): observations 16 | acts_buf (np.ndarray): actions 17 | rews_buf (np.ndarray): rewards 18 | probs_buf (np.ndarray): probability of actions 19 | done_buf (np.ndarray): dones 20 | buffer_size (int): size of buffers 21 | n_rollout (int): number of steps per rollout 22 | num_in_buffer (int): amount of memory filled 23 | idx (int): memory index to add the next incoming transition 24 | """ 25 | 26 | def __init__(self, buffer_size: int, n_rollout: int): 27 | """Initialize a ReplayMemory object.""" 28 | self.obs_buf = None 29 | self.acts_buf = None 30 | self.rews_buf = None 31 | self.probs_buf = None 32 | self.done_buf = None 33 | self.buffer_size = buffer_size 34 | self.idx = 0 35 | self.num_in_buffer = 0 36 | self.n_rollout = n_rollout 37 | 38 | def add(self, seq_data: list): 39 | """Add a new experience to memory. 40 | If the buffer is empty, it is first initialized to match the shapes of the given arguments. 41 | """ 42 | if self.num_in_buffer == 0: 43 | state, action, reward, prob, done_mask = seq_data[0] 44 | self._initialize_buffers(state, prob) 45 | 46 | self.idx = (self.idx + 1) % (self.buffer_size - 1) 47 | 48 | for i, transition in enumerate(seq_data): 49 | state, action, reward, prob, done_mask = transition 50 | self.obs_buf[self.idx][i] = state 51 | self.acts_buf[self.idx][i] = action 52 | self.rews_buf[self.idx][i] = reward 53 | self.probs_buf[self.idx][i] = prob 54 | self.done_buf[self.idx][i] = done_mask 55 | 56 | self.num_in_buffer += 1 57 | self.num_in_buffer = min(self.buffer_size - 1, self.num_in_buffer) 58 | 59 | def _initialize_buffers(self, state: np.ndarray, probs: np.ndarray): 60 | """Initialize buffers for state, action, reward, prob, done.""" 61 | self.obs_buf = np.zeros( 62 | [self.buffer_size, self.n_rollout] + list(state.shape), dtype=state.dtype 63 | ) 64 | self.acts_buf = np.zeros([self.buffer_size, self.n_rollout, 1], dtype=np.uint8) 65 | self.rews_buf = np.zeros( 66 | [self.buffer_size, self.n_rollout, 1], dtype=np.float64 67 | ) 68 | self.probs_buf = np.zeros( 69 | [self.buffer_size, self.n_rollout] + list(probs.shape), dtype=probs.dtype 70 | ) 71 | self.done_buf = np.zeros([self.buffer_size, self.n_rollout, 1]) 72 | 73 | def sample(self, on_policy=False) -> Tuple[torch.Tensor, ...]: 74 | """Randomly sample a batch of experiences from memory.
75 | If on_policy, using last experience.""" 76 | 77 | if on_policy: 78 | state = self.obs_buf[self.idx] 79 | action = self.acts_buf[self.idx] 80 | reward = self.rews_buf[self.idx] 81 | prob = self.probs_buf[self.idx] 82 | done = self.done_buf[self.idx] 83 | 84 | else: 85 | idx = random.randint(1, self.num_in_buffer) 86 | state = self.obs_buf[idx] 87 | action = self.acts_buf[idx] 88 | reward = self.rews_buf[idx] 89 | prob = self.probs_buf[idx] 90 | done = self.done_buf[idx] 91 | 92 | state = torch.FloatTensor(state) 93 | action = torch.LongTensor(action) 94 | reward = torch.FloatTensor(reward) 95 | prob = torch.FloatTensor(prob) 96 | done = torch.FloatTensor(done) 97 | 98 | return state, action, reward, prob, done 99 | 100 | def __len__(self) -> int: 101 | """Return the current size of internal memory.""" 102 | return self.num_in_buffer 103 | -------------------------------------------------------------------------------- /run_lunarlander_v2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Train or test algorithms on LunarLander-v2. 3 | 4 | - Author: Curt Park 5 | - Contact: curt.park@medipixel.io 6 | """ 7 | 8 | import argparse 9 | import datetime 10 | 11 | import gym 12 | 13 | from rl_algorithms import build_agent 14 | import rl_algorithms.common.env.utils as env_utils 15 | import rl_algorithms.common.helper_functions as common_utils 16 | from rl_algorithms.utils import YamlConfig 17 | 18 | 19 | def parse_args() -> argparse.Namespace: 20 | # configurations 21 | parser = argparse.ArgumentParser(description="Pytorch RL algorithms") 22 | parser.add_argument( 23 | "--seed", type=int, default=777, help="random seed for reproducibility" 24 | ) 25 | parser.add_argument( 26 | "--integration-test", 27 | dest="integration_test", 28 | action="store_true", 29 | help="for integration test", 30 | ) 31 | parser.add_argument( 32 | "--cfg-path", 33 | type=str, 34 | default="./configs/lunarlander_v2/dqn.yaml", 35 | help="config path", 36 | ) 37 | parser.add_argument( 38 | "--test", dest="test", action="store_true", help="test mode (no training)" 39 | ) 40 | parser.add_argument( 41 | "--load-from", 42 | type=str, 43 | default=None, 44 | help="load the saved model and optimizer at the beginning", 45 | ) 46 | parser.add_argument( 47 | "--off-render", dest="render", action="store_false", help="turn off rendering" 48 | ) 49 | parser.add_argument( 50 | "--render-after", 51 | type=int, 52 | default=0, 53 | help="start rendering after the input number of episode", 54 | ) 55 | parser.add_argument( 56 | "--log", dest="log", action="store_true", help="turn on logging" 57 | ) 58 | parser.add_argument( 59 | "--save-period", type=int, default=100, help="save model period" 60 | ) 61 | parser.add_argument( 62 | "--episode-num", type=int, default=1500, help="total episode num" 63 | ) 64 | parser.add_argument( 65 | "--max-episode-steps", type=int, default=300, help="max episode step" 66 | ) 67 | parser.add_argument( 68 | "--interim-test-num", 69 | type=int, 70 | default=10, 71 | help="number of test during training", 72 | ) 73 | 74 | return parser.parse_args() 75 | 76 | 77 | def main(): 78 | """Main.""" 79 | args = parse_args() 80 | 81 | # env initialization 82 | env_name = "LunarLander-v2" 83 | env = gym.make(env_name) 84 | env, max_episode_steps = env_utils.set_env(env, args.max_episode_steps) 85 | 86 | # set a random seed 87 | common_utils.set_random_seed(args.seed, env) 88 | 89 | # run 90 | NOWTIMES = datetime.datetime.now() 91 | curr_time = 
NOWTIMES.strftime("%y%m%d_%H%M%S") 92 | 93 | cfg = YamlConfig(dict(agent=args.cfg_path)).get_config_dict() 94 | 95 | # If running integration test, simplify experiment 96 | if args.integration_test: 97 | cfg = common_utils.set_cfg_for_intergration_test(cfg) 98 | 99 | env_info = dict( 100 | name=env.spec.id, 101 | observation_space=env.observation_space, 102 | action_space=env.action_space, 103 | is_atari=False, 104 | ) 105 | log_cfg = dict(agent=cfg.agent.type, curr_time=curr_time, cfg_path=args.cfg_path) 106 | build_args = dict( 107 | env=env, 108 | env_info=env_info, 109 | log_cfg=log_cfg, 110 | is_test=args.test, 111 | load_from=args.load_from, 112 | is_render=args.render, 113 | render_after=args.render_after, 114 | is_log=args.log, 115 | save_period=args.save_period, 116 | episode_num=args.episode_num, 117 | max_episode_steps=max_episode_steps, 118 | interim_test_num=args.interim_test_num, 119 | ) 120 | agent = build_agent(cfg.agent, build_args) 121 | 122 | if not args.test: 123 | agent.train() 124 | else: 125 | agent.test() 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /rl_algorithms/fd/dqn_learner.py: -------------------------------------------------------------------------------- 1 | """Learner for DQfD Agent. 2 | 3 | - Author: Kyunghwan Kim 4 | - Contact: kh.kim@medipixel.io 5 | """ 6 | 7 | from typing import Tuple, Union 8 | 9 | import numpy as np 10 | import torch 11 | from torch.nn.utils import clip_grad_norm_ 12 | 13 | from rl_algorithms.common.abstract.learner import TensorTuple 14 | import rl_algorithms.common.helper_functions as common_utils 15 | from rl_algorithms.dqn.learner import DQNLearner 16 | from rl_algorithms.registry import LEARNERS 17 | 18 | 19 | @LEARNERS.register_module 20 | class DQfDLearner(DQNLearner): 21 | """Learner for DQfD Agent.""" 22 | 23 | def update_model( 24 | self, experience: Union[TensorTuple, Tuple[TensorTuple]] 25 | ) -> TensorTuple: # type: ignore 26 | """Train the model after each episode.""" 27 | if self.use_n_step: 28 | experience_1, experience_n = experience 29 | else: 30 | experience_1 = experience 31 | 32 | weights, indices, eps_d = experience_1[-3:] 33 | actions = experience_1[1] 34 | 35 | # 1 step loss 36 | gamma = self.hyper_params.gamma 37 | dq_loss_element_wise, q_values = self.loss_fn( 38 | self.dqn, self.dqn_target, experience_1, gamma, self.head_cfg 39 | ) 40 | dq_loss = torch.mean(dq_loss_element_wise * weights) 41 | 42 | # n step loss 43 | if self.use_n_step: 44 | gamma = self.hyper_params.gamma ** self.hyper_params.n_step 45 | dq_loss_n_element_wise, q_values_n = self.loss_fn( 46 | self.dqn, self.dqn_target, experience_n, gamma, self.head_cfg 47 | ) 48 | 49 | # to update loss and priorities 50 | q_values = 0.5 * (q_values + q_values_n) 51 | dq_loss_element_wise += dq_loss_n_element_wise * self.hyper_params.lambda1 52 | dq_loss = torch.mean(dq_loss_element_wise * weights) 53 | 54 | # supervised (large-margin) loss, applied only to demo transitions 55 | demo_idxs = np.where(eps_d != 0.0) 56 | n_demo = demo_idxs[0].size 57 | if n_demo != 0: # if 1 or more demos are sampled 58 | # get margin for each demo transition 59 | action_idxs = actions[demo_idxs].long() 60 | margin = torch.ones(q_values.size()) * self.hyper_params.margin 61 | margin[demo_idxs, action_idxs] = 0.0 # demo actions have 0 margins 62 | margin = margin.to(self.device) 63 | 64 | # calculate supervised loss 65 | demo_q_values = q_values[demo_idxs, action_idxs].squeeze() 66 |
supervised_loss = torch.max(q_values + margin, dim=-1)[0] 67 | supervised_loss = supervised_loss[demo_idxs] - demo_q_values 68 | supervised_loss = torch.mean(supervised_loss) * self.hyper_params.lambda2 69 | else: # no demo sampled 70 | supervised_loss = torch.zeros(1, device=self.device) 71 | 72 | # q_value regularization 73 | q_regular = torch.norm(q_values, 2).mean() * self.hyper_params.w_q_reg 74 | 75 | # total loss 76 | loss = dq_loss + supervised_loss + q_regular 77 | 78 | # train dqn 79 | self.dqn_optim.zero_grad() 80 | loss.backward() 81 | clip_grad_norm_(self.dqn.parameters(), self.hyper_params.gradient_clip) 82 | self.dqn_optim.step() 83 | 84 | # update target networks 85 | common_utils.soft_update(self.dqn, self.dqn_target, self.hyper_params.tau) 86 | 87 | # update priorities in PER 88 | loss_for_prior = dq_loss_element_wise.detach().cpu().numpy().squeeze() 89 | new_priorities = loss_for_prior + self.hyper_params.per_eps 90 | new_priorities += eps_d 91 | 92 | if self.head_cfg.configs.use_noisy_net: 93 | self.dqn.head.reset_noise() 94 | self.dqn_target.head.reset_noise() 95 | 96 | return ( 97 | loss.item(), 98 | dq_loss.item(), 99 | supervised_loss.item(), 100 | q_values.mean().item(), 101 | n_demo, 102 | indices, 103 | new_priorities, 104 | ) 105 | -------------------------------------------------------------------------------- /run_lunarlander_continuous_v2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Train or test algorithms on LunarLanderContinuous-v2. 3 | 4 | - Author: Curt Park 5 | - Contact: curt.park@medipixel.io 6 | """ 7 | 8 | import argparse 9 | import datetime 10 | 11 | import gym 12 | 13 | from rl_algorithms import build_agent 14 | import rl_algorithms.common.env.utils as env_utils 15 | import rl_algorithms.common.helper_functions as common_utils 16 | from rl_algorithms.utils import YamlConfig 17 | 18 | 19 | def parse_args() -> argparse.Namespace: 20 | # configurations 21 | parser = argparse.ArgumentParser(description="Pytorch RL algorithms") 22 | parser.add_argument( 23 | "--seed", type=int, default=777, help="random seed for reproducibility" 24 | ) 25 | parser.add_argument( 26 | "--integration-test", 27 | dest="integration_test", 28 | action="store_true", 29 | help="for integration test", 30 | ) 31 | parser.add_argument( 32 | "--cfg-path", 33 | type=str, 34 | default="./configs/lunarlander_continuous_v2/ddpg.yaml", 35 | help="config path", 36 | ) 37 | parser.add_argument( 38 | "--test", dest="test", action="store_true", help="test mode (no training)" 39 | ) 40 | parser.add_argument( 41 | "--load-from", 42 | type=str, 43 | default=None, 44 | help="load the saved model and optimizer at the beginning", 45 | ) 46 | parser.add_argument( 47 | "--off-render", dest="render", action="store_false", help="turn off rendering" 48 | ) 49 | parser.add_argument( 50 | "--render-after", 51 | type=int, 52 | default=0, 53 | help="start rendering after the input number of episode", 54 | ) 55 | parser.add_argument( 56 | "--log", dest="log", action="store_true", help="turn on logging" 57 | ) 58 | parser.add_argument( 59 | "--save-period", type=int, default=100, help="save model period" 60 | ) 61 | parser.add_argument( 62 | "--episode-num", type=int, default=1500, help="total episode num" 63 | ) 64 | parser.add_argument( 65 | "--max-episode-steps", type=int, default=300, help="max episode step" 66 | ) 67 | parser.add_argument( 68 | "--interim-test-num", 69 | type=int, 70 | default=10, 71 | help="number of test 
during training", 72 | ) 73 | 74 | return parser.parse_args() 75 | 76 | 77 | def main(): 78 | """Main.""" 79 | args = parse_args() 80 | 81 | # env initialization 82 | env_name = "LunarLanderContinuous-v2" 83 | env = gym.make(env_name) 84 | env, max_episode_steps = env_utils.set_env(env, args.max_episode_steps) 85 | 86 | # set a random seed 87 | common_utils.set_random_seed(args.seed, env) 88 | 89 | # run 90 | NOWTIMES = datetime.datetime.now() 91 | curr_time = NOWTIMES.strftime("%y%m%d_%H%M%S") 92 | 93 | cfg = YamlConfig(dict(agent=args.cfg_path)).get_config_dict() 94 | 95 | # If running integration test, simplify experiment 96 | if args.integration_test: 97 | cfg = common_utils.set_cfg_for_intergration_test(cfg) 98 | 99 | env_info = dict( 100 | name=env.spec.id, 101 | observation_space=env.observation_space, 102 | action_space=env.action_space, 103 | is_atari=False, 104 | ) 105 | log_cfg = dict(agent=cfg.agent.type, curr_time=curr_time, cfg_path=args.cfg_path) 106 | build_args = dict( 107 | env=env, 108 | env_info=env_info, 109 | log_cfg=log_cfg, 110 | is_test=args.test, 111 | load_from=args.load_from, 112 | is_render=args.render, 113 | render_after=args.render_after, 114 | is_log=args.log, 115 | save_period=args.save_period, 116 | episode_num=args.episode_num, 117 | max_episode_steps=max_episode_steps, 118 | interim_test_num=args.interim_test_num, 119 | ) 120 | agent = build_agent(cfg.agent, build_args) 121 | 122 | if not args.test: 123 | agent.train() 124 | else: 125 | agent.test() 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /rl_algorithms/utils/registry.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | import ray 4 | 5 | from rl_algorithms.utils.config import ConfigDict 6 | 7 | 8 | class Registry: 9 | def __init__(self, name): 10 | self._name = name 11 | self._module_dict = dict() 12 | 13 | def __repr__(self): 14 | format_str = self.__class__.__name__ + "(name={}, items={})".format( 15 | self._name, list(self._module_dict.keys()) 16 | ) 17 | return format_str 18 | 19 | @property 20 | def name(self): 21 | return self._name 22 | 23 | @property 24 | def module_dict(self): 25 | return self._module_dict 26 | 27 | def get(self, key): 28 | return self._module_dict.get(key, None) 29 | 30 | def _register_module(self, module_class): 31 | """Register a module. 32 | Args: 33 | module (:obj:`nn.Module`): Module to be registered. 34 | """ 35 | if not inspect.isclass(module_class): 36 | raise TypeError( 37 | "module must be a class, but got {}".format(type(module_class)) 38 | ) 39 | module_name = module_class.__name__ 40 | if module_name in self._module_dict: 41 | raise KeyError( 42 | "{} is already registered in {}".format(module_name, self.name) 43 | ) 44 | self._module_dict[module_name] = module_class 45 | 46 | def register_module(self, cls): 47 | self._register_module(cls) 48 | return cls 49 | 50 | 51 | def build_from_cfg(cfg: ConfigDict, registry: Registry, default_args: dict = None): 52 | """Build a module from config dict. 53 | Args: 54 | cfg (:obj: `ConfigDict`): Config dict. It should at least contain the key "type". 55 | registry (:obj:`Registry`): The registry to search the type from. 56 | default_args (dict, optional): Default initialization arguments. 57 | Returns: 58 | obj: The constructed object. 
59 | """ 60 | assert isinstance(cfg, dict) and "type" in cfg 61 | assert isinstance(default_args, dict) or default_args is None 62 | args = cfg.copy() 63 | obj_type = args.pop("type") 64 | if isinstance(obj_type, str): 65 | obj_cls = registry.get(obj_type) 66 | if obj_cls is None: 67 | raise KeyError( 68 | "{} is not in the {} registry".format(obj_type, registry.name) 69 | ) 70 | elif inspect.isclass(obj_type): 71 | obj_cls = obj_type 72 | else: 73 | raise TypeError( 74 | "type must be a str or valid type, but got {}".format(type(obj_type)) 75 | ) 76 | 77 | if default_args is not None: 78 | for name, value in default_args.items(): 79 | args.setdefault(name, value) 80 | return obj_cls(**args) 81 | 82 | 83 | def build_ray_obj_from_cfg( 84 | cfg: ConfigDict, registry: Registry, default_args: dict = None 85 | ): 86 | """Build a module from config dict. 87 | Args: 88 | cfg (:obj: `ConfigDict`): Config dict. It should at least contain the key "type". 89 | registry (:obj:`Registry`): The registry to search the type from. 90 | default_args (dict, optional): Default initialization arguments. 91 | Returns: 92 | obj: The constructed object. 93 | """ 94 | assert isinstance(cfg, dict) and "type" in cfg 95 | assert isinstance(default_args, dict) or default_args is None 96 | args = cfg.copy() 97 | obj_type = args.pop("type") 98 | if isinstance(obj_type, str): 99 | obj_cls = registry.get(obj_type) 100 | if obj_cls is None: 101 | raise KeyError( 102 | "{} is not in the {} registry".format(obj_type, registry.name) 103 | ) 104 | elif inspect.isclass(obj_type): 105 | obj_cls = obj_type 106 | else: 107 | raise TypeError( 108 | "type must be a str or valid type, but got {}".format(type(obj_type)) 109 | ) 110 | 111 | if default_args is not None: 112 | for name, value in default_args.items(): 113 | args.setdefault(name, value) 114 | return ray.remote(num_cpus=1)(obj_cls).remote(**args) 115 | -------------------------------------------------------------------------------- /rl_algorithms/bc/ddpg_learner.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.nn.utils import clip_grad_norm_ 4 | 5 | from rl_algorithms.common.abstract.learner import TensorTuple 6 | import rl_algorithms.common.helper_functions as common_utils 7 | from rl_algorithms.ddpg.learner import DDPGLearner 8 | from rl_algorithms.registry import LEARNERS 9 | 10 | 11 | @LEARNERS.register_module 12 | class BCDDPGLearner(DDPGLearner): 13 | """Learner for BCDDPG Agent. 
14 | 15 | Attributes: 16 | hyper_params (ConfigDict): hyper-parameters 17 | optim_cfg (ConfigDict): config of optimizer 18 | log_cfg (ConfigDict): configuration for saving log and checkpoint 19 | actor (nn.Module): actor model to select actions 20 | actor_target (nn.Module): target actor model to select actions 21 | critic (nn.Module): critic model to predict state values 22 | critic_target (nn.Module): target critic model to predict state values 23 | actor_optim (Optimizer): optimizer for training actor 24 | critic_optim (Optimizer): optimizer for training critic 25 | 26 | """ 27 | 28 | def update_model( 29 | self, experience: TensorTuple, demos: TensorTuple 30 | ) -> TensorTuple: # type: ignore 31 | """Update actor and critic networks.""" 32 | exp_states, exp_actions, exp_rewards, exp_next_states, exp_dones = experience 33 | demo_states, demo_actions, demo_rewards, demo_next_states, demo_dones = demos 34 | 35 | states = torch.cat((exp_states, demo_states), dim=0) 36 | actions = torch.cat((exp_actions, demo_actions), dim=0) 37 | rewards = torch.cat((exp_rewards, demo_rewards), dim=0) 38 | next_states = torch.cat((exp_next_states, demo_next_states), dim=0) 39 | dones = torch.cat((exp_dones, demo_dones), dim=0) 40 | 41 | # G_t = r + gamma * v(s_{t+1}) if state != Terminal 42 | # = r otherwise 43 | masks = 1 - dones 44 | next_actions = self.actor_target(next_states) 45 | next_values = self.critic_target(torch.cat((next_states, next_actions), dim=-1)) 46 | curr_returns = rewards + (self.hyper_params.gamma * next_values * masks) 47 | curr_returns = curr_returns.to(self.device) 48 | 49 | # critic loss 50 | gradient_clip_ac = self.hyper_params.gradient_clip_ac 51 | gradient_clip_cr = self.hyper_params.gradient_clip_cr 52 | 53 | values = self.critic(torch.cat((states, actions), dim=-1)) 54 | critic_loss = F.mse_loss(values, curr_returns) 55 | 56 | # train critic 57 | self.critic_optim.zero_grad() 58 | critic_loss.backward() 59 | clip_grad_norm_(self.critic.parameters(), gradient_clip_cr) 60 | self.critic_optim.step() 61 | 62 | # policy loss 63 | actions = self.actor(states) 64 | policy_loss = -self.critic(torch.cat((states, actions), dim=-1)).mean() 65 | 66 | # bc loss 67 | pred_actions = self.actor(demo_states) 68 | qf_mask = torch.gt( 69 | self.critic(torch.cat((demo_states, demo_actions), dim=-1)), 70 | self.critic(torch.cat((demo_states, pred_actions), dim=-1)), 71 | ).to(self.device) 72 | qf_mask = qf_mask.float() 73 | n_qf_mask = int(qf_mask.sum().item()) 74 | 75 | if n_qf_mask == 0: 76 | bc_loss = torch.zeros(1, device=self.device) 77 | else: 78 | bc_loss = ( 79 | torch.mul(pred_actions, qf_mask) - torch.mul(demo_actions, qf_mask) 80 | ).pow(2).sum() / n_qf_mask 81 | 82 | # train actor: pg loss + BC loss 83 | actor_loss = ( 84 | self.hyper_params.lambda1 * policy_loss 85 | + self.hyper_params.lambda2 * bc_loss 86 | ) 87 | self.actor_optim.zero_grad() 88 | actor_loss.backward() 89 | clip_grad_norm_(self.actor.parameters(), gradient_clip_ac) 90 | self.actor_optim.step() 91 | 92 | # update target networks 93 | common_utils.soft_update(self.actor, self.actor_target, self.hyper_params.tau) 94 | common_utils.soft_update(self.critic, self.critic_target, self.hyper_params.tau) 95 | 96 | return actor_loss.item(), critic_loss.item(), n_qf_mask 97 | -------------------------------------------------------------------------------- /run_reacher_v2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Train or test algorithms on Reacher-v2 
of Mujoco. 3 | 4 | - Author: Kyunghwan Kim 5 | - Contact: kh.kim@medipixel.io 6 | """ 7 | 8 | import argparse 9 | import datetime 10 | 11 | import gym 12 | 13 | from rl_algorithms import build_agent 14 | import rl_algorithms.common.env.utils as env_utils 15 | import rl_algorithms.common.helper_functions as common_utils 16 | from rl_algorithms.utils import YamlConfig 17 | 18 | 19 | def parse_args() -> argparse.Namespace: 20 | # configurations 21 | parser = argparse.ArgumentParser(description="Pytorch RL rl_algorithms") 22 | parser.add_argument( 23 | "--seed", type=int, default=777, help="random seed for reproducibility" 24 | ) 25 | parser.add_argument("--algo", type=str, default="ddpg", help="choose an algorithm") 26 | parser.add_argument( 27 | "--cfg-path", 28 | type=str, 29 | default="./configs/reacher_v2/ddpg.yaml", 30 | help="config path", 31 | ) 32 | parser.add_argument( 33 | "--integration-test", 34 | dest="integration_test", 35 | action="store_true", 36 | help="for integration test", 37 | ) 38 | parser.add_argument( 39 | "--test", dest="test", action="store_true", help="test mode (no training)" 40 | ) 41 | parser.add_argument( 42 | "--load-from", 43 | type=str, 44 | default=None, 45 | help="load the saved model and optimizer at the beginning", 46 | ) 47 | parser.add_argument( 48 | "--off-render", dest="render", action="store_false", help="turn off rendering" 49 | ) 50 | parser.add_argument( 51 | "--render-after", 52 | type=int, 53 | default=0, 54 | help="start rendering after the input number of episode", 55 | ) 56 | parser.add_argument( 57 | "--log", dest="log", action="store_true", help="turn on logging" 58 | ) 59 | parser.add_argument( 60 | "--save-period", type=int, default=200, help="save model period" 61 | ) 62 | parser.add_argument( 63 | "--episode-num", type=int, default=20000, help="total episode num" 64 | ) 65 | parser.add_argument( 66 | "--max-episode-steps", type=int, default=-1, help="max episode step" 67 | ) 68 | parser.add_argument( 69 | "--interim-test-num", 70 | type=int, 71 | default=10, 72 | help="number of test during training", 73 | ) 74 | 75 | return parser.parse_args() 76 | 77 | 78 | def main(): 79 | """Main.""" 80 | args = parse_args() 81 | 82 | # env initialization 83 | env_name = "Reacher-v2" 84 | env = gym.make(env_name) 85 | env, max_episode_steps = env_utils.set_env(env, args.max_episode_steps) 86 | 87 | # set a random seed 88 | common_utils.set_random_seed(args.seed, env) 89 | 90 | # run 91 | NOWTIMES = datetime.datetime.now() 92 | curr_time = NOWTIMES.strftime("%y%m%d_%H%M%S") 93 | 94 | cfg = YamlConfig(dict(agent=args.cfg_path)).get_config_dict() 95 | 96 | # If running integration test, simplify experiment 97 | if args.integration_test: 98 | cfg = common_utils.set_cfg_for_intergration_test(cfg) 99 | 100 | env_info = dict( 101 | name=env.spec.id, 102 | observation_space=env.observation_space, 103 | action_space=env.action_space, 104 | is_atari=False, 105 | ) 106 | log_cfg = dict(agent=cfg.agent.type, curr_time=curr_time, cfg_path=args.cfg_path) 107 | build_args = dict( 108 | env=env, 109 | env_info=env_info, 110 | log_cfg=log_cfg, 111 | is_test=args.test, 112 | load_from=args.load_from, 113 | is_render=args.render, 114 | render_after=args.render_after, 115 | is_log=args.log, 116 | save_period=args.save_period, 117 | episode_num=args.episode_num, 118 | max_episode_steps=max_episode_steps, 119 | interim_test_num=args.interim_test_num, 120 | ) 121 | agent = build_agent(cfg.agent, build_args) 122 | 123 | if not args.test: 124 | agent.train() 125 | else: 126 
| agent.test() 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /rl_algorithms/common/buffer/distillation_buffer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Distillation buffer.""" 3 | 4 | import os 5 | import pickle 6 | from typing import List 7 | 8 | import torch 9 | from torch.utils.data import DataLoader, Dataset 10 | 11 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 12 | 13 | 14 | class DistillationBuffer: 15 | """Class for managing reading and writing of distillation data. 16 | Distillation data is stored on disk at the dataset_path locations. 17 | The data collected by the teacher is stored as individual pickle files. 18 | It is read back in batches through the PyTorch DataLoader class. 19 | 20 | Attributes: 21 | batch_size (int): batch size sampled from the distillation buffer for training 22 | dataset_path (list): list of distillation buffer path 23 | curr_time (str): program's start time to distinguish between teacher agents 24 | idx (int): index of data 25 | buffer_size (int): distillation buffer size 26 | dataloader (DataLoader): pytorch library for random batch data sampling 27 | 28 | """ 29 | 30 | def __init__( 31 | self, 32 | batch_size: int, 33 | dataset_path: List[str], 34 | ): 35 | """Initialize a DistillationBuffer object. 36 | 37 | Args: 38 | batch_size (int): size of a batch sampled from the distillation buffer for training 39 | dataset_path (list): list of distillation buffer path 40 | curr_time (str): program's start time to distinguish between teacher agents 41 | 42 | """ 43 | self.batch_size = batch_size 44 | self.dataset_path = dataset_path 45 | self.idx = 0 46 | self.buffer_size = 0 47 | self.dataloader = None 48 | self.is_contain_q = False 49 | 50 | def reset_dataloader(self): 51 | """Initialize and reset DataLoader class. 52 | DataLoader class must be reset for every epoch. 53 | """ 54 | dataset = DistillationDataset(self.dataset_path) 55 | self.is_contain_q = dataset.is_contain_q 56 | self.buffer_size = len(dataset) 57 | self.dataloader = iter( 58 | DataLoader(dataset, batch_size=self.batch_size, shuffle=True, num_workers=4) 59 | ) 60 | 61 | def sample_for_diltillation(self): 62 | """Sample a batch of state and Q-value for student's learning.""" 63 | assert ( 64 | self.buffer_size >= self.batch_size 65 | ), f"buffer size({self.buffer_size}) < ({self.batch_size})" 66 | 67 | return next(self.dataloader) 68 | 69 | 70 | class DistillationDataset(Dataset): 71 | """Pytorch Dataset class for random batch data sampling. 72 | 73 | Attributes: 74 | dataset_path (str): distillation buffer path 75 | 76 | """ 77 | 78 | def __init__(self, dataset_path: List[str]): 79 | """Initialize a DistillationDataset object.
80 | 81 | Args: 82 | dataset_path (str): distillation buffer path 83 | file_name_list (list): transition's file name list in distillation buffer path 84 | 85 | """ 86 | super().__init__() 87 | self.dataset_path = dataset_path 88 | self.file_name_list = [] 89 | 90 | sum_data_len = 0 91 | for _dir in self.dataset_path: 92 | tmp = os.listdir(_dir) 93 | self.file_name_list += [os.path.join(_dir, x) for x in tmp] 94 | with open(self.file_name_list[-1], "rb") as f: 95 | data = pickle.load(f) 96 | sum_data_len += int(len(data) == 2) 97 | 98 | if sum_data_len == len(self.dataset_path): 99 | self.is_contain_q = True 100 | elif sum_data_len == 0: 101 | self.is_contain_q = False 102 | else: 103 | raise AssertionError( 104 | "There is a mixture of data with q present and non-existent ones" 105 | + "in buffer-path." 106 | ) 107 | 108 | def __len__(self): 109 | """Denotes the total number of samples.""" 110 | return len(self.file_name_list) 111 | 112 | def __getitem__(self, index): 113 | """Generates one sample of data.""" 114 | with open(self.file_name_list[index], "rb") as f: 115 | transition = pickle.load(f) 116 | return transition 117 | -------------------------------------------------------------------------------- /rl_algorithms/dqn/linear.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Linear module for dqn algorithms 3 | 4 | - Author: Kyunghwan Kim 5 | - Contact: kh.kim@medipixel.io 6 | """ 7 | 8 | import math 9 | 10 | import numpy as np 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | from rl_algorithms.common.helper_functions import numpy2floattensor 16 | 17 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 18 | 19 | 20 | # TODO: Remove it when upgrade torch>=1.7 21 | # pylint: disable=abstract-method 22 | class NoisyLinear(nn.Module): 23 | """Noisy linear module for NoisyNet. 
24 | 25 | References: 26 | https://github.com/higgsfield/RL-Adventure/blob/master/5.noisy%20dqn.ipynb 27 | https://github.com/Kaixhin/Rainbow/blob/master/model.py 28 | 29 | Attributes: 30 | in_features (int): input size of linear module 31 | out_features (int): output size of linear module 32 | std_init (float): initial std value 33 | weight_mu (nn.Parameter): mean value weight parameter 34 | weight_sigma (nn.Parameter): std value weight parameter 35 | bias_mu (nn.Parameter): mean value bias parameter 36 | bias_sigma (nn.Parameter): std value bias parameter 37 | 38 | """ 39 | 40 | def __init__(self, in_features: int, out_features: int, std_init: float = 0.5): 41 | """Initialize.""" 42 | super(NoisyLinear, self).__init__() 43 | self.in_features = in_features 44 | self.out_features = out_features 45 | self.std_init = std_init 46 | 47 | self.weight_mu = nn.Parameter(torch.Tensor(out_features, in_features)) 48 | self.weight_sigma = nn.Parameter(torch.Tensor(out_features, in_features)) 49 | self.register_buffer("weight_epsilon", torch.Tensor(out_features, in_features)) 50 | 51 | self.bias_mu = nn.Parameter(torch.Tensor(out_features)) 52 | self.bias_sigma = nn.Parameter(torch.Tensor(out_features)) 53 | self.register_buffer("bias_epsilon", torch.Tensor(out_features)) 54 | 55 | self.reset_parameters() 56 | self.reset_noise() 57 | 58 | def reset_parameters(self): 59 | """Reset trainable network parameters (factorized gaussian noise).""" 60 | mu_range = 1 / math.sqrt(self.in_features) 61 | self.weight_mu.data.uniform_(-mu_range, mu_range) 62 | self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.in_features)) 63 | self.bias_mu.data.uniform_(-mu_range, mu_range) 64 | self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.out_features)) 65 | 66 | @staticmethod 67 | def scale_noise(size: int) -> torch.Tensor: 68 | """Set scale to make noise (factorized gaussian noise).""" 69 | x = numpy2floattensor(np.random.normal(loc=0.0, scale=1.0, size=size), device) 70 | 71 | return x.sign().mul(x.abs().sqrt()) 72 | 73 | def reset_noise(self): 74 | """Make new noise.""" 75 | epsilon_in = self.scale_noise(self.in_features) 76 | epsilon_out = self.scale_noise(self.out_features) 77 | 78 | # outer product 79 | self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in)) 80 | self.bias_epsilon.copy_(epsilon_out) 81 | 82 | def forward(self, x: torch.Tensor) -> torch.Tensor: 83 | """Forward method implementation. 84 | 85 | We don't use separate statements on train / eval mode. 86 | It doesn't show remarkable difference of performance. 87 | """ 88 | return F.linear( 89 | x, 90 | self.weight_mu + self.weight_sigma * self.weight_epsilon, 91 | self.bias_mu + self.bias_sigma * self.bias_epsilon, 92 | ) 93 | 94 | 95 | class NoisyLinearConstructor: 96 | """Constructor class for changing hyper parameters of NoisyLinear. 
97 | 98 | Attributes: 99 | std_init (float): initial std value 100 | 101 | """ 102 | 103 | def __init__(self, std_init: float = 0.5): 104 | """Initialize.""" 105 | self.std_init = std_init 106 | 107 | def __call__(self, in_features: int, out_features: int) -> NoisyLinear: 108 | """Return NoisyLinear instance set hyper parameters""" 109 | return NoisyLinear(in_features, out_features, self.std_init) 110 | 111 | 112 | class NoisyMLPHandler: 113 | """Includes methods to handle noisy linear.""" 114 | 115 | def reset_noise(self): 116 | """Re-sample noise""" 117 | for _, module in self.named_children(): 118 | module.reset_noise() 119 | -------------------------------------------------------------------------------- /rl_algorithms/fd/ddpg_learner.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Union 2 | 3 | import torch 4 | from torch.nn.utils import clip_grad_norm_ 5 | 6 | from rl_algorithms.common.abstract.learner import TensorTuple 7 | import rl_algorithms.common.helper_functions as common_utils 8 | from rl_algorithms.ddpg.learner import DDPGLearner 9 | from rl_algorithms.registry import LEARNERS 10 | 11 | 12 | @LEARNERS.register_module 13 | class DDPGfDLearner(DDPGLearner): 14 | """Learner for DDPGfD Agent. 15 | 16 | Attributes: 17 | hyper_params (ConfigDict): hyper-parameters 18 | optim_cfg (ConfigDict): config of optimizer 19 | log_cfg (ConfigDict): configuration for saving log and checkpoint 20 | actor (nn.Module): actor model to select actions 21 | actor_target (nn.Module): target actor model to select actions 22 | critic (nn.Module): critic model to predict state values 23 | critic_target (nn.Module): target critic model to predict state values 24 | actor_optim (Optimizer): optimizer for training actor 25 | critic_optim (Optimizer): optimizer for training critic 26 | 27 | """ 28 | 29 | def _get_critic_loss( 30 | self, experiences: Tuple[TensorTuple, ...], gamma: float 31 | ) -> torch.Tensor: 32 | """Return element-wise critic loss.""" 33 | states, actions, rewards, next_states, dones = experiences[:5] 34 | 35 | # G_t = r + gamma * v(s_{t+1}) if state != Terminal 36 | # = r otherwise 37 | masks = 1 - dones 38 | next_actions = self.actor_target(next_states) 39 | next_states_actions = torch.cat((next_states, next_actions), dim=-1) 40 | next_values = self.critic_target(next_states_actions) 41 | curr_returns = rewards + gamma * next_values * masks 42 | curr_returns = curr_returns.to(self.device).detach() 43 | 44 | # train critic 45 | values = self.critic(torch.cat((states, actions), dim=-1)) 46 | critic_loss_element_wise = (values - curr_returns).pow(2) 47 | 48 | return critic_loss_element_wise 49 | 50 | def update_model( 51 | self, experience: Union[TensorTuple, Tuple[TensorTuple]] 52 | ) -> TensorTuple: # type: ignore 53 | """Train the model after each episode.""" 54 | use_n_step = self.hyper_params.n_step > 1 55 | if use_n_step: 56 | experience_1, experience_n = experience 57 | else: 58 | experience_1 = experience 59 | 60 | states, actions = experience_1[:2] 61 | weights, indices, eps_d = experience_1[-3:] 62 | gamma = self.hyper_params.gamma 63 | 64 | # train critic 65 | gradient_clip_ac = self.hyper_params.gradient_clip_ac 66 | gradient_clip_cr = self.hyper_params.gradient_clip_cr 67 | 68 | critic_loss_element_wise = self._get_critic_loss(experience_1, gamma) 69 | critic_loss = torch.mean(critic_loss_element_wise * weights) 70 | 71 | if use_n_step: 72 | gamma = gamma ** self.hyper_params.n_step 73 | 74 | 
critic_loss_n_element_wise = self._get_critic_loss(experience_n, gamma) 75 | # to update loss and priorities 76 | critic_loss_element_wise += ( 77 | critic_loss_n_element_wise * self.hyper_params.lambda1 78 | ) 79 | critic_loss = torch.mean(critic_loss_element_wise * weights) 80 | 81 | self.critic_optim.zero_grad() 82 | critic_loss.backward() 83 | clip_grad_norm_(self.critic.parameters(), gradient_clip_cr) 84 | self.critic_optim.step() 85 | 86 | # train actor 87 | actions = self.actor(states) 88 | actor_loss_element_wise = -self.critic(torch.cat((states, actions), dim=-1)) 89 | actor_loss = torch.mean(actor_loss_element_wise * weights) 90 | self.actor_optim.zero_grad() 91 | actor_loss.backward() 92 | clip_grad_norm_(self.actor.parameters(), gradient_clip_ac) 93 | self.actor_optim.step() 94 | 95 | # update target networks 96 | common_utils.soft_update(self.actor, self.actor_target, self.hyper_params.tau) 97 | common_utils.soft_update(self.critic, self.critic_target, self.hyper_params.tau) 98 | 99 | # update priorities 100 | new_priorities = critic_loss_element_wise 101 | new_priorities += self.hyper_params.lambda3 * actor_loss_element_wise.pow(2) 102 | new_priorities += self.hyper_params.per_eps 103 | new_priorities = new_priorities.data.cpu().numpy().squeeze() 104 | new_priorities += eps_d 105 | 106 | return ( 107 | actor_loss.item(), 108 | critic_loss.item(), 109 | indices, 110 | new_priorities, 111 | ) 112 | -------------------------------------------------------------------------------- /rl_algorithms/common/buffer/segment_tree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Segment tree for Prioritized Replay Buffer.""" 3 | 4 | import operator 5 | from typing import Callable 6 | 7 | 8 | class SegmentTree: 9 | """Create SegmentTree. 10 | 11 | Taken from OpenAI baselines github repository: 12 | https://github.com/openai/baselines/blob/master/baselines/common/segment_tree.py 13 | 14 | Attributes: 15 | capacity (int) 16 | tree (list) 17 | operation (function) 18 | 19 | """ 20 | 21 | def __init__(self, capacity: int, operation: Callable, init_value: float): 22 | """Initialize. 23 | 24 | Args: 25 | capacity (int) 26 | operation (function) 27 | init_value (float) 28 | 29 | """ 30 | assert ( 31 | capacity > 0 and capacity & (capacity - 1) == 0 32 | ), "capacity must be positive and a power of 2."
33 | self.capacity = capacity 34 | self.tree = [init_value for _ in range(2 * capacity)] 35 | self.operation = operation 36 | 37 | def _operate_helper( 38 | self, start: int, end: int, node: int, node_start: int, node_end: int 39 | ) -> float: 40 | """Returns result of operation in segment.""" 41 | if start == node_start and end == node_end: 42 | return self.tree[node] 43 | mid = (node_start + node_end) // 2 44 | if end <= mid: 45 | return self._operate_helper(start, end, 2 * node, node_start, mid) 46 | else: 47 | if mid + 1 <= start: 48 | return self._operate_helper(start, end, 2 * node + 1, mid + 1, node_end) 49 | else: 50 | return self.operation( 51 | self._operate_helper(start, mid, 2 * node, node_start, mid), 52 | self._operate_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end), 53 | ) 54 | 55 | def operate(self, start: int = 0, end: int = 0) -> float: 56 | """Returns result of applying `self.operation`.""" 57 | if end <= 0: 58 | end += self.capacity 59 | end -= 1 60 | 61 | return self._operate_helper(start, end, 1, 0, self.capacity - 1) 62 | 63 | def __setitem__(self, idx: int, val: float): 64 | """Set value in tree.""" 65 | idx += self.capacity 66 | self.tree[idx] = val 67 | 68 | idx //= 2 69 | while idx >= 1: 70 | self.tree[idx] = self.operation(self.tree[2 * idx], self.tree[2 * idx + 1]) 71 | idx //= 2 72 | 73 | def __getitem__(self, idx: int) -> float: 74 | """Get real value in leaf node of tree.""" 75 | assert 0 <= idx < self.capacity 76 | 77 | return self.tree[self.capacity + idx] 78 | 79 | 80 | class SumSegmentTree(SegmentTree): 81 | """Create SumSegmentTree. 82 | 83 | Taken from OpenAI baselines github repository: 84 | https://github.com/openai/baselines/blob/master/baselines/common/segment_tree.py 85 | 86 | """ 87 | 88 | def __init__(self, capacity: int): 89 | """Initialize. 90 | 91 | Args: 92 | capacity (int) 93 | 94 | """ 95 | super(SumSegmentTree, self).__init__( 96 | capacity=capacity, operation=operator.add, init_value=0.0 97 | ) 98 | 99 | def sum(self, start: int = 0, end: int = 0) -> float: 100 | """Returns arr[start] + ... + arr[end].""" 101 | return super(SumSegmentTree, self).operate(start, end) 102 | 103 | def retrieve(self, upperbound: float) -> int: 104 | """Find the highest index `i` about upper bound in the tree.""" 105 | # TODO: Check assert case and fix bug 106 | assert 0 <= upperbound <= self.sum() + 1e-5, "upperbound: {}".format(upperbound) 107 | 108 | idx = 1 109 | 110 | while idx < self.capacity: # while non-leaf 111 | left = 2 * idx 112 | right = left + 1 113 | if self.tree[left] > upperbound: 114 | idx = 2 * idx 115 | else: 116 | upperbound -= self.tree[left] 117 | idx = right 118 | return idx - self.capacity 119 | 120 | 121 | class MinSegmentTree(SegmentTree): 122 | """Create SegmentTree. 123 | 124 | Taken from OpenAI baselines github repository: 125 | https://github.com/openai/baselines/blob/master/baselines/common/segment_tree.py 126 | 127 | """ 128 | 129 | def __init__(self, capacity: int): 130 | """Initialize. 
131 | 132 | Args: 133 | capacity (int) 134 | 135 | """ 136 | super(MinSegmentTree, self).__init__( 137 | capacity=capacity, operation=min, init_value=float("inf") 138 | ) 139 | 140 | def min(self, start: int = 0, end: int = 0) -> float: 141 | """Returns min(arr[start], ..., arr[end]).""" 142 | return super(MinSegmentTree, self).operate(start, end) 143 | -------------------------------------------------------------------------------- /rl_algorithms/bc/sac_learner.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | from rl_algorithms.common.abstract.learner import TensorTuple 5 | import rl_algorithms.common.helper_functions as common_utils 6 | from rl_algorithms.registry import LEARNERS 7 | from rl_algorithms.sac.learner import SACLearner 8 | 9 | 10 | @LEARNERS.register_module 11 | class BCSACLearner(SACLearner): 12 | """Learner for BCSAC Agent. 13 | 14 | Attributes: 15 | hyper_params (ConfigDict): hyper-parameters 16 | log_cfg (ConfigDict): configuration for saving log and checkpoint 17 | """ 18 | 19 | def update_model( 20 | self, experience: TensorTuple, demos: TensorTuple 21 | ) -> TensorTuple: # type: ignore 22 | """Train the model after each episode.""" 23 | self.update_step += 1 24 | 25 | states, actions, rewards, next_states, dones = experience 26 | demo_states, demo_actions, _, _, _ = demos 27 | new_actions, log_prob, pre_tanh_value, mu, std = self.actor(states) 28 | pred_actions, _, _, _, _ = self.actor(demo_states) 29 | 30 | # train alpha 31 | if self.hyper_params.auto_entropy_tuning: 32 | alpha_loss = ( 33 | -self.log_alpha * (log_prob + self.target_entropy).detach() 34 | ).mean() 35 | 36 | self.alpha_optim.zero_grad() 37 | alpha_loss.backward() 38 | self.alpha_optim.step() 39 | 40 | alpha = self.log_alpha.exp() 41 | else: 42 | alpha_loss = torch.zeros(1) 43 | alpha = self.hyper_params.w_entropy 44 | 45 | # Q function loss 46 | masks = 1 - dones 47 | states_actions = torch.cat((states, actions), dim=-1) 48 | q_1_pred = self.qf_1(states_actions) 49 | q_2_pred = self.qf_2(states_actions) 50 | v_target = self.vf_target(next_states) 51 | q_target = rewards + self.hyper_params.gamma * v_target * masks 52 | qf_1_loss = F.mse_loss(q_1_pred, q_target.detach()) 53 | qf_2_loss = F.mse_loss(q_2_pred, q_target.detach()) 54 | 55 | # V function loss 56 | states_actions = torch.cat((states, new_actions), dim=-1) 57 | v_pred = self.vf(states) 58 | q_pred = torch.min(self.qf_1(states_actions), self.qf_2(states_actions)) 59 | v_target = q_pred - alpha * log_prob 60 | vf_loss = F.mse_loss(v_pred, v_target.detach()) 61 | 62 | # update actor 63 | actor_loss = torch.zeros(1) 64 | n_qf_mask = 0 65 | if self.update_step % self.hyper_params.policy_update_freq == 0: 66 | # bc loss 67 | qf_mask = torch.gt( 68 | self.qf_1(torch.cat((demo_states, demo_actions), dim=-1)), 69 | self.qf_1(torch.cat((demo_states, pred_actions), dim=-1)), 70 | ).to(self.device) 71 | qf_mask = qf_mask.float() 72 | n_qf_mask = int(qf_mask.sum().item()) 73 | 74 | if n_qf_mask == 0: 75 | bc_loss = torch.zeros(1, device=self.device) 76 | else: 77 | bc_loss = ( 78 | torch.mul(pred_actions, qf_mask) - torch.mul(demo_actions, qf_mask) 79 | ).pow(2).sum() / n_qf_mask 80 | 81 | # actor loss 82 | advantage = q_pred - v_pred.detach() 83 | actor_loss = (alpha * log_prob - advantage).mean() 84 | actor_loss = ( 85 | self.hyper_params.lambda1 * actor_loss 86 | + self.hyper_params.lambda2 * bc_loss 87 | ) 88 | 89 | # regularization 90 | mean_reg, std_reg 
= ( 91 | self.hyper_params.w_mean_reg * mu.pow(2).mean(), 92 | self.hyper_params.w_std_reg * std.pow(2).mean(), 93 | ) 94 | pre_activation_reg = self.hyper_params.w_pre_activation_reg * ( 95 | pre_tanh_value.pow(2).sum(dim=-1).mean() 96 | ) 97 | actor_reg = mean_reg + std_reg + pre_activation_reg 98 | 99 | # actor loss + regularization 100 | actor_loss += actor_reg 101 | 102 | # train actor 103 | self.actor_optim.zero_grad() 104 | actor_loss.backward() 105 | self.actor_optim.step() 106 | 107 | # update target networks 108 | common_utils.soft_update(self.vf, self.vf_target, self.hyper_params.tau) 109 | 110 | # train Q functions 111 | self.qf_1_optim.zero_grad() 112 | qf_1_loss.backward() 113 | self.qf_1_optim.step() 114 | 115 | self.qf_2_optim.zero_grad() 116 | qf_2_loss.backward() 117 | self.qf_2_optim.step() 118 | 119 | # train V function 120 | self.vf_optim.zero_grad() 121 | vf_loss.backward() 122 | self.vf_optim.step() 123 | 124 | return ( 125 | actor_loss.item(), 126 | qf_1_loss.item(), 127 | qf_2_loss.item(), 128 | vf_loss.item(), 129 | alpha_loss.item(), 130 | n_qf_mask, 131 | ) 132 | -------------------------------------------------------------------------------- /run_pong_no_frameskip_v4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Train or test algorithms on PongNoFrameskip-v4. 3 | 4 | - Author: Curt Park 5 | - Contact: curt.park@medipixel.io 6 | """ 7 | 8 | import argparse 9 | import datetime 10 | 11 | from rl_algorithms import build_agent 12 | from rl_algorithms.common.env.atari_wrappers import atari_env_generator 13 | import rl_algorithms.common.helper_functions as common_utils 14 | from rl_algorithms.utils import YamlConfig 15 | 16 | 17 | def parse_args() -> argparse.Namespace: 18 | # configurations 19 | parser = argparse.ArgumentParser(description="Pytorch RL algorithms") 20 | parser.add_argument( 21 | "--seed", type=int, default=161, help="random seed for reproducibility" 22 | ) 23 | parser.add_argument( 24 | "--cfg-path", 25 | type=str, 26 | default="./configs/pong_no_frameskip_v4/dqn.yaml", 27 | help="config path", 28 | ) 29 | parser.add_argument( 30 | "--integration-test", 31 | dest="integration_test", 32 | action="store_true", 33 | help="for integration test", 34 | ) 35 | parser.add_argument( 36 | "--grad-cam", 37 | dest="grad_cam", 38 | action="store_true", 39 | help="test mode with viewing Grad-CAM", 40 | ) 41 | parser.add_argument( 42 | "--test", dest="test", action="store_true", help="test mode (no training)" 43 | ) 44 | parser.add_argument( 45 | "--load-from", 46 | type=str, 47 | default=None, 48 | help="load the saved model and optimizer at the beginning", 49 | ) 50 | parser.add_argument( 51 | "--off-render", dest="render", action="store_false", help="turn off rendering" 52 | ) 53 | parser.add_argument( 54 | "--render-after", 55 | type=int, 56 | default=0, 57 | help="start rendering after the input number of episode", 58 | ) 59 | parser.add_argument( 60 | "--log", dest="log", action="store_true", help="turn on logging" 61 | ) 62 | parser.add_argument("--save-period", type=int, default=20, help="save model period") 63 | parser.add_argument( 64 | "--episode-num", type=int, default=500, help="total episode num" 65 | ) 66 | parser.add_argument( 67 | "--max-episode-steps", type=int, default=None, help="max episode step" 68 | ) 69 | parser.add_argument( 70 | "--interim-test-num", type=int, default=5, help="interim test number" 71 | ) 72 | parser.add_argument( 73 | "--off-framestack", 74 | 
dest="framestack", 75 | action="store_false", 76 | help="turn off framestack", 77 | ) 78 | parser.add_argument( 79 | "--saliency-map", 80 | action="store_true", 81 | help="save saliency map", 82 | ) 83 | 84 | return parser.parse_args() 85 | 86 | 87 | def env_generator(env_name, max_episode_steps, frame_stack): 88 | def _thunk(rank: int): 89 | env = atari_env_generator(env_name, max_episode_steps, frame_stack=frame_stack) 90 | env.seed(777 + rank + 1) 91 | return env 92 | 93 | return _thunk 94 | 95 | 96 | def main(): 97 | """Main.""" 98 | args = parse_args() 99 | 100 | # env initialization 101 | env_name = "PongNoFrameskip-v4" 102 | env_gen = env_generator( 103 | env_name, args.max_episode_steps, frame_stack=args.framestack 104 | ) 105 | env = env_gen(0) 106 | 107 | # set a random seed 108 | common_utils.set_random_seed(args.seed, env) 109 | 110 | # run 111 | NOWTIMES = datetime.datetime.now() 112 | curr_time = NOWTIMES.strftime("%y%m%d_%H%M%S") 113 | 114 | cfg = YamlConfig(dict(agent=args.cfg_path)).get_config_dict() 115 | 116 | # If running integration test, simplify experiment 117 | if args.integration_test: 118 | cfg = common_utils.set_cfg_for_intergration_test(cfg) 119 | 120 | env_info = dict( 121 | name=env.spec.id, 122 | observation_space=env.observation_space, 123 | action_space=env.action_space, 124 | is_atari=True, 125 | env_generator=env_gen, 126 | ) 127 | log_cfg = dict(agent=cfg.agent.type, curr_time=curr_time, cfg_path=args.cfg_path) 128 | build_args = dict( 129 | env=env, 130 | env_info=env_info, 131 | log_cfg=log_cfg, 132 | is_test=args.test, 133 | load_from=args.load_from, 134 | is_render=args.render, 135 | render_after=args.render_after, 136 | is_log=args.log, 137 | save_period=args.save_period, 138 | episode_num=args.episode_num, 139 | max_episode_steps=env.spec.max_episode_steps, 140 | interim_test_num=args.interim_test_num, 141 | ) 142 | agent = build_agent(cfg.agent, build_args) 143 | 144 | if not args.test: 145 | agent.train() 146 | elif args.test and args.grad_cam: 147 | agent.test_with_gradcam() 148 | elif args.test and args.saliency_map: 149 | agent.test_with_saliency_map() 150 | else: 151 | agent.test() 152 | 153 | 154 | if __name__ == "__main__": 155 | main() 156 | -------------------------------------------------------------------------------- /rl_algorithms/fd/sac_learner.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torch 4 | 5 | from rl_algorithms.common.abstract.learner import TensorTuple 6 | import rl_algorithms.common.helper_functions as common_utils 7 | from rl_algorithms.registry import LEARNERS 8 | from rl_algorithms.sac.learner import SACLearner 9 | 10 | 11 | @LEARNERS.register_module 12 | class SACfDLearner(SACLearner): 13 | """Learner for BCSAC Agent.""" 14 | 15 | # pylint: disable=too-many-statements 16 | def update_model(self, experience: Tuple[TensorTuple, ...]) -> TensorTuple: 17 | """Train the model after each episode.""" 18 | use_n_step = self.hyper_params.n_step > 1 19 | 20 | if use_n_step: 21 | experience_1, experience_n = experience 22 | else: 23 | experience_1 = experience 24 | 25 | states, actions, rewards, next_states, dones = experience_1[:-3] 26 | weights, indices, eps_d = experience_1[-3:] 27 | new_actions, log_prob, pre_tanh_value, mu, std = self.actor(states) 28 | 29 | # train alpha 30 | if self.hyper_params.auto_entropy_tuning: 31 | alpha_loss = torch.mean( 32 | (-self.log_alpha * (log_prob + self.target_entropy).detach()) * weights 33 | ) 34 | 35 | 
self.alpha_optim.zero_grad() 36 | alpha_loss.backward() 37 | self.alpha_optim.step() 38 | 39 | alpha = self.log_alpha.exp() 40 | else: 41 | alpha_loss = torch.zeros(1) 42 | alpha = self.hyper_params.w_entropy 43 | 44 | # Q function loss 45 | masks = 1 - dones 46 | gamma = self.hyper_params.gamma 47 | states_actions = torch.cat((states, actions), dim=-1) 48 | q_1_pred = self.qf_1(states_actions) 49 | q_2_pred = self.qf_2(states_actions) 50 | v_target = self.vf_target(next_states) 51 | q_target = rewards + self.hyper_params.gamma * v_target * masks 52 | qf_1_loss = torch.mean((q_1_pred - q_target.detach()).pow(2) * weights) 53 | qf_2_loss = torch.mean((q_2_pred - q_target.detach()).pow(2) * weights) 54 | 55 | if use_n_step: 56 | _, _, rewards, next_states, dones = experience_n 57 | 58 | gamma = gamma ** self.hyper_params.n_step 59 | masks = 1 - dones 60 | 61 | v_target = self.vf_target(next_states) 62 | q_target = rewards + gamma * v_target * masks 63 | qf_1_loss_n = torch.mean((q_1_pred - q_target.detach()).pow(2) * weights) 64 | qf_2_loss_n = torch.mean((q_2_pred - q_target.detach()).pow(2) * weights) 65 | 66 | # to update loss and priorities 67 | qf_1_loss = qf_1_loss + qf_1_loss_n * self.hyper_params.lambda1 68 | qf_2_loss = qf_2_loss + qf_2_loss_n * self.hyper_params.lambda1 69 | 70 | # V function loss 71 | states_actions = torch.cat((states, new_actions), dim=-1) 72 | v_pred = self.vf(states) 73 | q_pred = torch.min(self.qf_1(states_actions), self.qf_2(states_actions)) 74 | v_target = (q_pred - alpha * log_prob).detach() 75 | vf_loss_element_wise = (v_pred - v_target).pow(2) 76 | vf_loss = torch.mean(vf_loss_element_wise * weights) 77 | 78 | # actor loss 79 | advantage = q_pred - v_pred.detach() 80 | actor_loss_element_wise = alpha * log_prob - advantage 81 | actor_loss = torch.mean(actor_loss_element_wise * weights) 82 | 83 | # regularization 84 | mean_reg = self.hyper_params.w_mean_reg * mu.pow(2).mean() 85 | std_reg = self.hyper_params.w_std_reg * std.pow(2).mean() 86 | pre_activation_reg = self.hyper_params.w_pre_activation_reg * ( 87 | pre_tanh_value.pow(2).sum(dim=-1).mean() 88 | ) 89 | actor_reg = mean_reg + std_reg + pre_activation_reg 90 | 91 | # actor loss + regularization 92 | actor_loss += actor_reg 93 | 94 | # train actor 95 | self.actor_optim.zero_grad() 96 | actor_loss.backward() 97 | self.actor_optim.step() 98 | 99 | # update target networks 100 | common_utils.soft_update(self.vf, self.vf_target, self.hyper_params.tau) 101 | 102 | # update priorities 103 | new_priorities = vf_loss_element_wise 104 | new_priorities += self.hyper_params.lambda3 * actor_loss_element_wise.pow(2) 105 | new_priorities += self.hyper_params.per_eps 106 | new_priorities = new_priorities.data.cpu().numpy().squeeze() 107 | new_priorities += eps_d 108 | 109 | # train Q functions 110 | self.qf_1_optim.zero_grad() 111 | qf_1_loss.backward() 112 | self.qf_1_optim.step() 113 | 114 | self.qf_2_optim.zero_grad() 115 | qf_2_loss.backward() 116 | self.qf_2_optim.step() 117 | 118 | # train V function 119 | self.vf_optim.zero_grad() 120 | vf_loss.backward() 121 | self.vf_optim.step() 122 | 123 | return ( 124 | actor_loss.item(), 125 | qf_1_loss.item(), 126 | qf_2_loss.item(), 127 | vf_loss.item(), 128 | alpha_loss.item(), 129 | indices, 130 | new_priorities, 131 | ) 132 | -------------------------------------------------------------------------------- /rl_algorithms/common/grad_cam.py: -------------------------------------------------------------------------------- 1 | """Grad-CAM class for analyzing CNN 
network. 2 | 3 | - Author: Kyunghwan Kim 4 | - Contact: kh.kim@medipixel.io 5 | - Paper: https://arxiv.org/pdf/1610.02391v1.pdf 6 | - Reference: https://github.com/RRoundTable/XAI 7 | """ 8 | 9 | from collections import OrderedDict 10 | from typing import Callable 11 | 12 | import numpy as np 13 | import torch 14 | import torch.nn as nn 15 | from torch.nn import functional as F 16 | 17 | 18 | # pylint: disable=attribute-defined-outside-init 19 | class CAMBaseWrapper: 20 | """Base Wrapping module for CAM.""" 21 | 22 | def __init__(self, model: nn.Module): 23 | """Initialize.""" 24 | super(CAMBaseWrapper, self).__init__() 25 | self.device = next(model.parameters()).device 26 | self.model = model 27 | self.handlers = [] # a set of hook function handlers 28 | 29 | def _encode_one_hot(self, ids: torch.Tensor) -> torch.Tensor: 30 | """Convert input to one-hot.""" 31 | one_hot = torch.zeros_like(self.logits).to(self.device) 32 | one_hot[0][ids] = 1 33 | return one_hot 34 | 35 | def forward(self, image: torch.Tensor) -> torch.Tensor: 36 | """ 37 | Simple classification 38 | """ 39 | self.model.zero_grad() 40 | self.logits = self.model(image) 41 | return self.logits 42 | 43 | def backward(self, ids: torch.Tensor) -> torch.Tensor: 44 | """ 45 | Class-specific backpropagation. 46 | Either way works: 47 | 1. self.logits.backward(gradient=one_hot, retain_graph=True) 48 | 2. (self.logits * one_hot).sum().backward(retain_graph=True) 49 | """ 50 | 51 | one_hot = self._encode_one_hot(ids) 52 | self.logits.backward(gradient=one_hot, retain_graph=True) 53 | 54 | def generate(self, target_layer: str): 55 | raise NotImplementedError 56 | 57 | def remove_hook(self): 58 | """ 59 | Remove all the forward/backward hook functions 60 | """ 61 | for handle in self.handlers: 62 | handle.remove() 63 | 64 | 65 | # pylint: disable=attribute-defined-outside-init 66 | class GradCAM(CAMBaseWrapper): 67 | """ 68 | "Grad-CAM: Visual Explanations from Deep Networks via Gradient-based Localization" 69 | https://arxiv.org/pdf/1610.02391.pdf 70 | Look at Figure 2 on page 4 71 | """ 72 | 73 | def __init__(self, model: nn.Module, candidate_layers: list = None): 74 | """Initialize.""" 75 | super(GradCAM, self).__init__(model) 76 | self.fmap_pool = OrderedDict() 77 | self.grad_pool = OrderedDict() 78 | self.candidate_layers = candidate_layers # list 79 | 80 | def forward_hook(key: str) -> Callable: 81 | def forward_hook_(_, __, output: torch.Tensor): 82 | # Save featuremaps 83 | self.fmap_pool[key] = output.detach() 84 | 85 | return forward_hook_ 86 | 87 | def backward_hook(key: str) -> Callable: 88 | def backward_hook_(_, __, grad_out: tuple): 89 | # Save the gradients corresponding to the featuremaps 90 | self.grad_pool[key] = grad_out[0].detach() 91 | 92 | return backward_hook_ 93 | 94 | # If no candidate layers are specified, the hooks are registered to all the layers.
95 | for name, module in self.model.named_modules(): 96 | print(name, module) 97 | if self.candidate_layers is None or name in self.candidate_layers: 98 | self.handlers.append(module.register_forward_hook(forward_hook(name))) 99 | self.handlers.append(module.register_backward_hook(backward_hook(name))) 100 | 101 | @staticmethod 102 | def _find(pool: OrderedDict, target_layer: str) -> torch.Tensor: 103 | """Get designated layer from model.""" 104 | if target_layer in pool.keys(): 105 | return pool[target_layer] 106 | else: 107 | raise ValueError("Invalid layer name: {}".format(target_layer)) 108 | 109 | @staticmethod 110 | def _compute_grad_weights(grads: torch.Tensor) -> torch.Tensor: 111 | """Compute gradient weight with average pooling.""" 112 | return F.adaptive_avg_pool2d(grads, 1) 113 | 114 | def forward(self, image: np.ndarray) -> torch.Tensor: 115 | """Forward method implementation.""" 116 | self.image_shape = image.shape[1:] 117 | return super(GradCAM, self).forward(image) 118 | 119 | def generate(self, target_layer: str) -> torch.Tensor: 120 | """Generate feature map of target layer with Grad-CAM.""" 121 | fmaps = self._find(self.fmap_pool, target_layer) 122 | grads = self._find(self.grad_pool, target_layer) 123 | weights = self._compute_grad_weights(grads) 124 | 125 | gcam = torch.mul(fmaps, weights).sum(dim=1, keepdim=True) 126 | gcam = F.relu(gcam) 127 | 128 | gcam = F.interpolate( 129 | gcam, self.image_shape, mode="bilinear", align_corners=False 130 | ) 131 | 132 | B, C, H, W = gcam.shape 133 | gcam = gcam.view(B, -1) 134 | gcam -= gcam.min(dim=1, keepdim=True)[0] 135 | gcam /= gcam.max(dim=1, keepdim=True)[0] + 1e-7 136 | gcam = gcam.view(B, C, H, W) 137 | 138 | return gcam 139 | -------------------------------------------------------------------------------- /rl_algorithms/dqn/distributed_worker.py: -------------------------------------------------------------------------------- 1 | """DQN worker for distributed training. 2 | 3 | - Author: Chris Yoon 4 | - Contact: chris.yoon@medipixel.io 5 | """ 6 | 7 | from collections import OrderedDict 8 | from typing import Dict, Tuple 9 | 10 | import numpy as np 11 | import torch 12 | 13 | from rl_algorithms.common.abstract.distributed_worker import DistributedWorker 14 | from rl_algorithms.common.helper_functions import numpy2floattensor 15 | from rl_algorithms.common.networks.brain import Brain 16 | from rl_algorithms.registry import WORKERS, build_loss 17 | from rl_algorithms.utils.config import ConfigDict 18 | 19 | 20 | @WORKERS.register_module 21 | class DQNWorker(DistributedWorker): 22 | """DQN worker for distributed training. 
23 | 24 | Attributes: 25 | backbone (ConfigDict): backbone configs for building network 26 | head (ConfigDict): head configs for building network 27 | state_dict (OrderedDict): initial network state dict received from learner 28 | device (str): literal to indicate cpu/cuda use 29 | 30 | """ 31 | 32 | def __init__( 33 | self, 34 | rank: int, 35 | device: str, 36 | hyper_params: ConfigDict, 37 | env_name: str, 38 | is_atari: bool, 39 | max_episode_steps: int, 40 | loss_type: ConfigDict, 41 | state_dict: OrderedDict, 42 | backbone: ConfigDict, 43 | head: ConfigDict, 44 | state_size: int, 45 | output_size: int, 46 | ): 47 | DistributedWorker.__init__( 48 | self, rank, device, hyper_params, env_name, is_atari, max_episode_steps 49 | ) 50 | 51 | self.loss_fn = build_loss(loss_type) 52 | self.backbone_cfg = backbone 53 | self.head_cfg = head 54 | self.head_cfg.configs.state_size = state_size 55 | self.head_cfg.configs.output_size = output_size 56 | 57 | self.use_n_step = self.hyper_params.n_step > 1 58 | 59 | self.max_epsilon = self.hyper_params.max_epsilon 60 | self.min_epsilon = self.hyper_params.min_epsilon 61 | self.epsilon = self.hyper_params.max_epsilon 62 | 63 | self._init_networks(state_dict) 64 | 65 | # pylint: disable=attribute-defined-outside-init 66 | def _init_networks(self, state_dict: OrderedDict): 67 | """Initialize DQN policy with learner state dict.""" 68 | self.dqn = Brain(self.backbone_cfg, self.head_cfg).to(self.device) 69 | self.dqn.load_state_dict(state_dict) 70 | self.dqn.eval() 71 | 72 | def load_params(self, path: str): 73 | """Load model and optimizer parameters.""" 74 | DistributedWorker.load_params(self, path) 75 | 76 | params = torch.load(path) 77 | self.dqn.load_state_dict(params["dqn_state_dict"]) 78 | print("[INFO] loaded the model and optimizer from", path) 79 | 80 | def select_action(self, state: np.ndarray) -> np.ndarray: 81 | """Select an action from the input space.""" 82 | # epsilon greedy policy 83 | # pylint: disable=comparison-with-callable 84 | if self.epsilon > np.random.random(): 85 | selected_action = np.array(self.env.action_space.sample()) 86 | else: 87 | with torch.no_grad(): 88 | state = self._preprocess_state(state, self.device) 89 | selected_action = self.dqn(state).argmax() 90 | selected_action = selected_action.cpu().numpy() 91 | 92 | # Decay epsilon 93 | self.epsilon = max( 94 | self.epsilon 95 | - (self.max_epsilon - self.min_epsilon) * self.hyper_params.epsilon_decay, 96 | self.min_epsilon, 97 | ) 98 | 99 | return selected_action 100 | 101 | def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool, dict]: 102 | """Take an action and return the response of the env.""" 103 | next_state, reward, done, info = self.env.step(action) 104 | return next_state, reward, done, info 105 | 106 | def compute_priorities(self, memory: Dict[str, np.ndarray]) -> np.ndarray: 107 | """Compute initial priority values of experiences in local memory.""" 108 | states = numpy2floattensor(memory["states"], self.device) 109 | actions = numpy2floattensor(memory["actions"], self.device).long() 110 | rewards = numpy2floattensor(memory["rewards"].reshape(-1, 1), self.device) 111 | next_states = numpy2floattensor(memory["next_states"], self.device) 112 | dones = numpy2floattensor(memory["dones"].reshape(-1, 1), self.device) 113 | memory_tensors = (states, actions, rewards, next_states, dones) 114 | 115 | with torch.no_grad(): 116 | dq_loss_element_wise, _ = self.loss_fn( 117 | self.dqn, 118 | self.dqn, 119 | memory_tensors, 120 | self.hyper_params.gamma, 121
| self.head_cfg, 122 | ) 123 | loss_for_prior = dq_loss_element_wise.detach().cpu().numpy() 124 | new_priorities = loss_for_prior + self.hyper_params.per_eps 125 | return new_priorities 126 | 127 | def synchronize(self, new_state_dict: Dict[str, np.ndarray]): 128 | """Synchronize worker dqn with learner dqn.""" 129 | self._synchronize(self.dqn, new_state_dict) 130 | -------------------------------------------------------------------------------- /rl_algorithms/common/apex/learner.py: -------------------------------------------------------------------------------- 1 | """Learner Wrapper to enable Ape-X distributed training. 2 | 3 | - Author: Chris Yoon 4 | - Contact: chris.yoon@medipixel.io 5 | """ 6 | 7 | from typing import Dict, List 8 | 9 | import numpy as np 10 | import pyarrow as pa 11 | import ray 12 | import zmq 13 | 14 | from rl_algorithms.common.abstract.learner import DistributedLearnerWrapper, Learner 15 | from rl_algorithms.common.helper_functions import numpy2floattensor, state_dict2numpy 16 | from rl_algorithms.utils.config import ConfigDict 17 | 18 | 19 | @ray.remote(num_gpus=1) 20 | class ApeXLearnerWrapper(DistributedLearnerWrapper): 21 | """Learner Wrapper to enable Ape-X distributed training. 22 | 23 | Attributes: 24 | learner (Learner): learner 25 | comm_cfg (ConfigDict): configs for communication 26 | update_step (int): counts update steps 27 | pub_socket (zmq.Socket): publisher socket for broadcasting params 28 | rep_socket (zmq.Socket): reply socket for receiving replay data & sending new priorities 29 | push_socket (zmq.Socket): push socket for sending log info to logger 30 | max_update_step (int): maximum update steps per run 31 | worker_update_interval (int): num update steps between worker synchronization 32 | logger_interval (int): num update steps between logging 33 | 34 | """ 35 | 36 | def __init__(self, learner: Learner, comm_cfg: ConfigDict): 37 | """Initialize.""" 38 | DistributedLearnerWrapper.__init__(self, learner, comm_cfg) 39 | self.update_step = 0 40 | self.max_update_step = self.learner.hyper_params.max_update_step 41 | self.worker_update_interval = self.learner.hyper_params.worker_update_interval 42 | self.logger_interval = self.learner.hyper_params.logger_interval 43 | 44 | # NOTE: disable because learner uses preprocessed n_step experience 45 | self.learner.use_n_step = False 46 | 47 | # pylint: disable=attribute-defined-outside-init 48 | def init_communication(self): 49 | """Initialize sockets for communication.""" 50 | ctx = zmq.Context() 51 | # Socket to send updated network parameters to worker 52 | self.pub_socket = ctx.socket(zmq.PUB) 53 | self.pub_socket.setsockopt(zmq.SNDHWM, 2) 54 | self.pub_socket.bind(f"tcp://127.0.0.1:{self.comm_cfg.learner_worker_port}") 55 | 56 | # Socket to receive replay data and send new priorities to buffer 57 | self.rep_socket = ctx.socket(zmq.REP) 58 | self.rep_socket.bind(f"tcp://127.0.0.1:{self.comm_cfg.learner_buffer_port}") 59 | 60 | # Socket to send logging data to logger 61 | self.push_socket = ctx.socket(zmq.PUSH) 62 | self.push_socket.connect(f"tcp://127.0.0.1:{self.comm_cfg.learner_logger_port}") 63 | 64 | def recv_replay_data(self): 65 | """Receive replay data from global buffer.""" 66 | replay_data_id = self.rep_socket.recv() 67 | replay_data = pa.deserialize(replay_data_id) 68 | return replay_data 69 | 70 | def send_new_priorities(self, indices: np.ndarray, priorities: np.ndarray): 71 | """Send new priority values and corresponding indices to buffer.""" 72 | new_priors = [indices, priorities] 73 | new_priors_id =
pa.serialize(new_priors).to_buffer() 74 | self.rep_socket.send(new_priors_id) 75 | 76 | def publish_params(self, update_step: int, np_state_dict: Dict[str, np.ndarray]): 77 | """Broadcast updated params to all workers.""" 78 | param_info = [update_step, np_state_dict] 79 | new_params_id = pa.serialize(param_info).to_buffer() 80 | self.pub_socket.send(new_params_id) 81 | 82 | def send_info_to_logger( 83 | self, 84 | np_state_dict: List[np.ndarray], 85 | step_info: list, 86 | ): 87 | """Send new params and log info to logger.""" 88 | log_value = dict(update_step=self.update_step, step_info=step_info) 89 | log_info = dict(log_value=log_value, state_dict=np_state_dict) 90 | log_info_id = pa.serialize(log_info).to_buffer() 91 | self.push_socket.send(log_info_id) 92 | 93 | def run(self): 94 | """Run main training loop.""" 95 | self.telapsed = 0 96 | while self.update_step < self.max_update_step: 97 | replay_data = self.recv_replay_data() 98 | if replay_data is not None: 99 | replay_data = ( 100 | numpy2floattensor(replay_data[:6], self.learner.device) 101 | + replay_data[6:] 102 | ) 103 | info = self.update_model(replay_data) 104 | indices, new_priorities = info[-2:] 105 | step_info = info[:-2] 106 | self.update_step = self.update_step + 1 107 | 108 | self.send_new_priorities(indices, new_priorities) 109 | 110 | if self.update_step % self.worker_update_interval == 0: 111 | state_dict = self.get_state_dict() 112 | np_state_dict = state_dict2numpy(state_dict) 113 | self.publish_params(self.update_step, np_state_dict) 114 | 115 | if self.update_step % self.logger_interval == 0: 116 | state_dict = self.get_state_dict() 117 | np_state_dict = state_dict2numpy(state_dict) 118 | self.send_info_to_logger(np_state_dict, step_info) 119 | self.learner.save_params(self.update_step) 120 | -------------------------------------------------------------------------------- /rl_algorithms/common/abstract/learner.py: -------------------------------------------------------------------------------- 1 | """Base Learner & LearnerWrapper class. 2 | 3 | - Author: Chris Yoon 4 | - Contact: chris.yoon@medipixel.io 5 | """ 6 | 7 | from abc import ABC, abstractmethod 8 | from collections import OrderedDict 9 | import os 10 | import shutil 11 | from typing import Tuple, Union 12 | 13 | import torch 14 | import torch.nn as nn 15 | 16 | from rl_algorithms.utils.config import ConfigDict 17 | 18 | TensorTuple = Tuple[torch.Tensor, ...] 19 | 20 | 21 | class BaseLearner(ABC): 22 | """Abstract class for all learner objects.""" 23 | 24 | @abstractmethod 25 | def update_model(self, experience: Union[TensorTuple, Tuple[TensorTuple]]) -> tuple: 26 | pass 27 | 28 | @abstractmethod 29 | def save_params(self, n_episode: int): 30 | pass 31 | 32 | @abstractmethod 33 | def load_params(self, path: str): 34 | if not os.path.exists(path): 35 | raise Exception( 36 | f"[ERROR] the input path does not exist. Wrong path: {path}" 37 | ) 38 | 39 | @abstractmethod 40 | def get_state_dict(self) -> Union[OrderedDict, Tuple[OrderedDict]]: 41 | pass 42 | 43 | 44 | class Learner(BaseLearner): 45 | """Base class for all base learners. 
46 | 47 | Attributes: 48 | args (argparse.Namespace): arguments including hyperparameters and training settings 49 | hyper_params (ConfigDict): hyper-parameters 50 | log_cfg (ConfigDict): configuration for saving log 51 | sha (str): sha code of current git commit 52 | 53 | """ 54 | 55 | def __init__( 56 | self, 57 | hyper_params: ConfigDict, 58 | log_cfg: ConfigDict, 59 | env_name: str, 60 | is_test: bool, 61 | ): 62 | """Initialize.""" 63 | self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 64 | self.hyper_params = hyper_params 65 | 66 | if not is_test: 67 | self.ckpt_path = ( 68 | f"./checkpoint/{env_name}/{log_cfg.agent}/{log_cfg.curr_time}" 69 | ) 70 | os.makedirs(self.ckpt_path, exist_ok=True) 71 | 72 | # save configuration 73 | shutil.copy(log_cfg.cfg_path, os.path.join(self.ckpt_path, "config.yaml")) 74 | 75 | @abstractmethod 76 | def _init_network(self): 77 | pass 78 | 79 | @abstractmethod 80 | def update_model(self, experience: Union[TensorTuple, Tuple[TensorTuple]]) -> tuple: 81 | pass 82 | 83 | @abstractmethod 84 | def save_params(self, n_episode: int): 85 | pass 86 | 87 | def _save_params(self, params: dict, n_episode: int): 88 | """Save parameters of networks.""" 89 | os.makedirs(self.ckpt_path, exist_ok=True) 90 | 91 | path = os.path.join(self.ckpt_path, f"ep_{str(n_episode)}.pt") 92 | torch.save(params, path) 93 | 94 | print(f"[INFO] Saved the model and optimizer to {path} \n") 95 | 96 | @abstractmethod 97 | def load_params(self, path: str): 98 | if not os.path.exists(path): 99 | raise Exception( 100 | f"[ERROR] the input path does not exist. Wrong path: {path}" 101 | ) 102 | 103 | @abstractmethod 104 | def get_state_dict(self) -> Union[OrderedDict, Tuple[OrderedDict]]: 105 | pass 106 | 107 | @abstractmethod 108 | def get_policy(self) -> nn.Module: 109 | pass 110 | 111 | 112 | class LearnerWrapper(BaseLearner): 113 | """Base class for all learner wrappers.""" 114 | 115 | def __init__(self, learner: BaseLearner): 116 | """Initialize.""" 117 | self.learner = learner 118 | 119 | def update_model(self, experience: Union[TensorTuple, Tuple[TensorTuple]]) -> tuple: 120 | return self.learner.update_model(experience) 121 | 122 | def save_params(self, n_episode: int): 123 | self.learner.save_params(n_episode) 124 | 125 | def load_params(self, path: str): 126 | self.learner.load_params(path) 127 | 128 | def get_state_dict(self) -> Union[OrderedDict, Tuple[OrderedDict]]: 129 | return self.learner.get_state_dict() 130 | 131 | 132 | class DistributedLearnerWrapper(LearnerWrapper): 133 | """Base wrapper class for distributed learners. 
134 | 135 | Attributes: 136 | learner (Learner): learner 137 | comm_cfg (ConfigDict): configs for communication 138 | 139 | """ 140 | 141 | def __init__(self, learner: Learner, comm_cfg: ConfigDict): 142 | LearnerWrapper.__init__(self, learner) 143 | self.comm_cfg = comm_cfg 144 | 145 | @abstractmethod 146 | def init_communication(self): 147 | pass 148 | 149 | def update_model(self, experience: Union[TensorTuple, Tuple[TensorTuple]]) -> tuple: 150 | """Run one step of learner model update.""" 151 | return self.learner.update_model(experience) 152 | 153 | def save_params(self, n_update_step: int): 154 | """Save learner params at defined directory.""" 155 | self.learner.save_params(n_update_step) 156 | 157 | def load_params(self, path: str): 158 | """Load params at start.""" 159 | self.learner.load_params(path) 160 | 161 | def get_policy(self): 162 | """Return model (policy) used for action selection, used only in grad cam.""" 163 | return self.learner.get_policy() 164 | 165 | def get_state_dict(self): 166 | """Return state dicts.""" 167 | return self.learner.get_state_dict() 168 | 169 | @abstractmethod 170 | def run(self): 171 | pass 172 | -------------------------------------------------------------------------------- /rl_algorithms/bc/her.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """HER class and reward function for Behavior Cloning. 3 | 4 | - Author: Kyunghwan Kim 5 | - Contact: kh.kim@medipixel.io 6 | """ 7 | from typing import Callable, Tuple 8 | 9 | import numpy as np 10 | 11 | from rl_algorithms.common.abstract.her import HER 12 | from rl_algorithms.common.abstract.reward_fn import RewardFn 13 | from rl_algorithms.registry import HERS 14 | 15 | 16 | class L1DistanceRewardFn(RewardFn): 17 | def __call__(self, transition: tuple, goal_state: np.ndarray) -> np.float64: 18 | """L1 Distance reward function.""" 19 | next_state = transition[3] 20 | eps = 1e-6 21 | if np.abs(next_state - goal_state).sum() < eps: 22 | return np.float64(0.0) 23 | else: 24 | return np.float64(-1.0) 25 | 26 | 27 | l1_distance_reward_fn = L1DistanceRewardFn() 28 | 29 | 30 | @HERS.register_module 31 | class LunarLanderContinuousHER(HER): 32 | """HER for LunarLanderContinuous-v2 environment. 
33 | 34 | Attributes: 35 | demo_goal_indices (np.ndarray): indices about goal of demo list 36 | desired_states (np.ndarray): desired states from demonstration 37 | 38 | """ 39 | 40 | def __init__( 41 | self, 42 | reward_fn: Callable[[tuple, np.ndarray], np.float64] = l1_distance_reward_fn, 43 | ): 44 | """Initialize.""" 45 | HER.__init__(self, reward_fn=reward_fn) 46 | self.is_goal_in_state = False 47 | 48 | # pylint: disable=attribute-defined-outside-init 49 | def fetch_desired_states_from_demo(self, demo: list): 50 | """Return desired goal states from demonstration data.""" 51 | np_demo: np.ndarray = np.array(demo) 52 | self.demo_goal_indices: np.ndarray = np.where(np_demo[:, 4])[0] 53 | self.desired_states: np.ndarray = np_demo[self.demo_goal_indices][:, 0] 54 | 55 | def get_desired_state(self, *args) -> np.ndarray: 56 | """Sample one of the desired states.""" 57 | return np.random.choice(self.desired_states, 1).item() 58 | 59 | def _get_final_state(self, transition: tuple) -> np.ndarray: 60 | """Get final state from transitions for making HER transitions.""" 61 | return transition[0] 62 | 63 | def generate_demo_transitions(self, demo: list) -> list: 64 | """Return generated demo transitions for HER.""" 65 | new_demo: list = list() 66 | 67 | # generate demo transitions 68 | prev_idx = 0 69 | for idx in self.demo_goal_indices: 70 | demo_final_state = self._get_final_state(demo[idx]) 71 | transitions = [demo[i] for i in range(prev_idx, idx + 1)] 72 | prev_idx = idx + 1 73 | 74 | transitions = self.generate_transitions( 75 | transitions, demo_final_state, 0, is_demo=True 76 | ) 77 | 78 | new_demo.extend(transitions) 79 | 80 | return new_demo 81 | 82 | 83 | class ReacherRewardFn(RewardFn): 84 | def __call__(self, transition: tuple, _) -> np.float64: 85 | """Reward function for Reacher-v2 environment.""" 86 | state, action = transition[0:2] 87 | diff_vec = state[-3:] 88 | reward_dist = -1 * np.linalg.norm(diff_vec) 89 | reward_ctrl = -np.square(action).sum() 90 | 91 | return reward_dist + reward_ctrl 92 | 93 | 94 | reacher_reward_fn = ReacherRewardFn() 95 | 96 | 97 | @HERS.register_module 98 | class ReacherHER(HER): 99 | """HER for Reacher-v2 environment.""" 100 | 101 | def __init__( 102 | self, reward_fn: Callable[[tuple, np.ndarray], np.float64] = reacher_reward_fn 103 | ): 104 | """Initialize.""" 105 | HER.__init__(self, reward_fn=reward_fn) 106 | self.is_goal_in_state = True 107 | 108 | def fetch_desired_states_from_demo(self, _: list): 109 | """Return desired goal states from demonstration data. 110 | 111 | DO NOT use this method because demo states have a goal position. 112 | """ 113 | raise Exception("Do not use this method.") 114 | 115 | def get_desired_state(self, *args) -> np.ndarray: 116 | """Sample one of the desired states. 117 | 118 | Returns an empty array since demo states have a goal position. 119 | """ 120 | return np.array([]) 121 | 122 | def _get_final_state(self, transition_final: tuple) -> np.ndarray: 123 | """Get a finger-tip position from the final transition.""" 124 | return transition_final[0][8:10] + transition_final[0][2:4] 125 | 126 | def generate_demo_transitions(self, demo: list) -> list: 127 | """Return generated demo transitions for HER. 128 | 129 | Works as an identity function in this class. 
130 | """ 131 | return demo 132 | 133 | def _append_origin_transitions( 134 | self, origin_transitions: list, transition: tuple, _: np.ndarray 135 | ): 136 | """Append original transitions for training.""" 137 | origin_transitions.append(transition) 138 | 139 | def _get_transition( 140 | self, transition: tuple, goal_state: np.ndarray 141 | ) -> Tuple[np.ndarray, np.ndarray, np.float64, np.ndarray, bool]: 142 | """Get a single transition concatenated with a goal state.""" 143 | state, action, _, next_state, done = transition 144 | 145 | reward = self.reward_fn(transition, goal_state) 146 | state_ = state 147 | state_[4:6] = goal_state 148 | next_state_ = next_state 149 | next_state_[4:6] = goal_state 150 | 151 | return state_, action, reward, next_state_, done 152 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/learner.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from typing import Tuple 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.nn.utils import clip_grad_norm_ 8 | import torch.optim as optim 9 | 10 | from rl_algorithms.common.abstract.learner import Learner, TensorTuple 11 | from rl_algorithms.common.helper_functions import numpy2floattensor 12 | from rl_algorithms.common.networks.brain import Brain 13 | from rl_algorithms.registry import LEARNERS 14 | from rl_algorithms.utils.config import ConfigDict 15 | 16 | 17 | @LEARNERS.register_module 18 | class A2CLearner(Learner): 19 | """Learner for A2C Agent. 20 | 21 | Attributes: 22 | hyper_params (ConfigDict): hyper-parameters 23 | log_cfg (ConfigDict): configuration for saving log and checkpoint 24 | actor (nn.Module): actor model to select actions 25 | critic (nn.Module): critic model to predict state values 26 | actor_optim (Optimizer): optimizer for training actor 27 | critic_optim (Optimizer): optimizer for training critic 28 | 29 | """ 30 | 31 | def __init__( 32 | self, 33 | hyper_params: ConfigDict, 34 | log_cfg: ConfigDict, 35 | backbone: ConfigDict, 36 | head: ConfigDict, 37 | optim_cfg: ConfigDict, 38 | env_name: str, 39 | state_size: tuple, 40 | output_size: int, 41 | is_test: bool, 42 | load_from: str, 43 | ): 44 | Learner.__init__(self, hyper_params, log_cfg, env_name, is_test) 45 | 46 | self.backbone_cfg = backbone 47 | self.head_cfg = head 48 | self.head_cfg.actor.configs.state_size = ( 49 | self.head_cfg.critic.configs.state_size 50 | ) = state_size 51 | self.head_cfg.actor.configs.output_size = output_size 52 | self.optim_cfg = optim_cfg 53 | self.load_from = load_from 54 | 55 | self._init_network() 56 | 57 | def _init_network(self): 58 | """Initialize networks and optimizers.""" 59 | self.actor = Brain(self.backbone_cfg.actor, self.head_cfg.actor).to(self.device) 60 | self.critic = Brain(self.backbone_cfg.critic, self.head_cfg.critic).to( 61 | self.device 62 | ) 63 | 64 | # create optimizer 65 | self.actor_optim = optim.Adam( 66 | self.actor.parameters(), 67 | lr=self.optim_cfg.lr_actor, 68 | weight_decay=self.optim_cfg.weight_decay, 69 | ) 70 | 71 | self.critic_optim = optim.Adam( 72 | self.critic.parameters(), 73 | lr=self.optim_cfg.lr_critic, 74 | weight_decay=self.optim_cfg.weight_decay, 75 | ) 76 | 77 | if self.load_from is not None: 78 | self.load_params(self.load_from) 79 | 80 | def update_model(self, experience: TensorTuple) -> TensorTuple: 81 | """Update A2C actor and critic networks""" 82 | 83 | log_prob, pred_value, 
next_state, reward, done = experience 84 | next_state = numpy2floattensor(next_state, self.device) 85 | 86 | # Q_t = r + gamma * V(s_{t+1}) if state != Terminal 87 | # = r otherwise 88 | mask = 1 - done 89 | next_value = self.critic(next_state).detach() 90 | q_value = reward + self.hyper_params.gamma * next_value * mask 91 | q_value = q_value.to(self.device) 92 | 93 | # advantage = Q_t - V(s_t) 94 | advantage = q_value - pred_value 95 | 96 | # calculate loss at the current step 97 | policy_loss = -advantage.detach() * log_prob # adv. is not backpropagated 98 | policy_loss += self.hyper_params.w_entropy * -log_prob # entropy 99 | value_loss = F.smooth_l1_loss(pred_value, q_value.detach()) 100 | 101 | # train 102 | gradient_clip_ac = self.hyper_params.gradient_clip_ac 103 | gradient_clip_cr = self.hyper_params.gradient_clip_cr 104 | 105 | self.actor_optim.zero_grad() 106 | policy_loss.backward() 107 | clip_grad_norm_(self.actor.parameters(), gradient_clip_ac) 108 | self.actor_optim.step() 109 | 110 | self.critic_optim.zero_grad() 111 | value_loss.backward() 112 | clip_grad_norm_(self.critic.parameters(), gradient_clip_cr) 113 | self.critic_optim.step() 114 | 115 | return policy_loss.item(), value_loss.item() 116 | 117 | def save_params(self, n_episode: int): 118 | """Save model and optimizer parameters.""" 119 | params = { 120 | "actor_state_dict": self.actor.state_dict(), 121 | "critic_state_dict": self.critic.state_dict(), 122 | "actor_optim_state_dict": self.actor_optim.state_dict(), 123 | "critic_optim_state_dict": self.critic_optim.state_dict(), 124 | } 125 | 126 | Learner._save_params(self, params, n_episode) 127 | 128 | def load_params(self, path: str): 129 | """Load model and optimizer parameters.""" 130 | Learner.load_params(self, path) 131 | 132 | params = torch.load(path) 133 | self.actor.load_state_dict(params["actor_state_dict"]) 134 | self.critic.load_state_dict(params["critic_state_dict"]) 135 | self.actor_optim.load_state_dict(params["actor_optim_state_dict"]) 136 | self.critic_optim.load_state_dict(params["critic_optim_state_dict"]) 137 | print("[INFO] Loaded the model and optimizer from", path) 138 | 139 | def get_state_dict(self) -> Tuple[OrderedDict]: 140 | """Return state dicts, mainly for distributed worker.""" 141 | return (self.critic.state_dict(), self.actor.state_dict()) 142 | 143 | def get_policy(self) -> nn.Module: 144 | """Return model (policy) used for action selection.""" 145 | return self.actor 146 | -------------------------------------------------------------------------------- /rl_algorithms/common/abstract/distributed_worker.py: -------------------------------------------------------------------------------- 1 | """Worker classes for distributed training. 
2 | 3 | - Author: Chris Yoon 4 | - Contact: chris.yoon@medipixel.io 5 | """ 6 | 7 | from abc import ABC, abstractmethod 8 | import os 9 | import random 10 | from typing import Deque, Dict, Tuple 11 | 12 | import gym 13 | import numpy as np 14 | import torch 15 | 16 | from rl_algorithms.common.env.atari_wrappers import atari_env_generator 17 | import rl_algorithms.common.env.utils as env_utils 18 | from rl_algorithms.common.helper_functions import numpy2floattensor, set_random_seed 19 | from rl_algorithms.common.networks.brain import Brain 20 | from rl_algorithms.utils.config import ConfigDict 21 | 22 | 23 | class BaseDistributedWorker(ABC): 24 | """Base class for Worker classes.""" 25 | 26 | @abstractmethod 27 | def select_action(self, state: np.ndarray) -> np.ndarray: 28 | pass 29 | 30 | @abstractmethod 31 | def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool, dict]: 32 | pass 33 | 34 | @abstractmethod 35 | def synchronize(self, new_state_dict: Dict[str, np.ndarray]): 36 | pass 37 | 38 | # pylint: disable=no-self-use 39 | def _synchronize(self, network: Brain, new_state_dict: Dict[str, np.ndarray]): 40 | """Copy parameters from numpy arrays.""" 41 | param_name_list = list(new_state_dict.keys()) 42 | for worker_named_param in network.named_parameters(): 43 | worker_param_name = worker_named_param[0] 44 | if worker_param_name in param_name_list: 45 | new_param = numpy2floattensor( 46 | new_state_dict[worker_param_name], self.device 47 | ) 48 | worker_named_param[1].data.copy_(new_param) 49 | 50 | 51 | class DistributedWorker(BaseDistributedWorker): 52 | """Base class for all functioning RL workers. 53 | 54 | Attributes: 55 | rank (int): rank (ID) of worker 56 | hyper_params (ConfigDict): algorithm hyperparameters 57 | device (torch.device): device on which worker process runs 58 | env (gym.Env): gym environment 59 | """ 60 | 61 | def __init__( 62 | self, 63 | rank: int, 64 | device: str, 65 | hyper_params: ConfigDict, 66 | env_name: str, 67 | is_atari: bool, 68 | max_episode_steps: int, 69 | ): 70 | """Initialize.""" 71 | self.rank = rank 72 | self.device = torch.device(device) 73 | 74 | self.hyper_params = hyper_params 75 | self.env_name = env_name 76 | self.is_atari = is_atari 77 | self.max_episode_steps = max_episode_steps 78 | 79 | self._init_env() 80 | 81 | # pylint: disable=attribute-defined-outside-init, no-self-use 82 | def _init_env(self): 83 | """Initialize worker local environment.""" 84 | if self.is_atari: 85 | self.env = atari_env_generator( 86 | self.env_name, self.max_episode_steps, frame_stack=True 87 | ) 88 | else: 89 | self.env = gym.make(self.env_name) 90 | env_utils.set_env(self.env, self.max_episode_steps) 91 | 92 | random.seed(self.rank) 93 | env_seed = random.randint(0, 999) 94 | set_random_seed(env_seed, self.env) 95 | 96 | @abstractmethod 97 | def load_params(self, path: str): 98 | if not os.path.exists(path): 99 | raise Exception( 100 | f"[ERROR] the input path does not exist.
Wrong path: {path}" 101 | ) 102 | 103 | @abstractmethod 104 | def select_action(self, state: np.ndarray) -> np.ndarray: 105 | pass 106 | 107 | @abstractmethod 108 | def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool, dict]: 109 | pass 110 | 111 | # NOTE: No need to explicitly implement for non-PER/non-Ape-X workers 112 | @abstractmethod 113 | def compute_priorities(self, experience: Dict[str, np.ndarray]): 114 | pass 115 | 116 | @abstractmethod 117 | def synchronize(self, new_state_dict: Dict[str, np.ndarray]): 118 | pass 119 | 120 | @staticmethod 121 | def _preprocess_state(state: np.ndarray, device: torch.device) -> torch.Tensor: 122 | """Preprocess state so that actor selects an action.""" 123 | state = numpy2floattensor(state, device) 124 | return state 125 | 126 | 127 | class DistributedWorkerWrapper(BaseDistributedWorker): 128 | """Base wrapper class for distributed worker wrappers.""" 129 | 130 | def __init__(self, worker: DistributedWorker, comm_cfg: ConfigDict): 131 | self.worker = worker 132 | self.comm_cfg = comm_cfg 133 | 134 | @abstractmethod 135 | def init_communication(self): 136 | pass 137 | 138 | def select_action(self, state: np.ndarray) -> np.ndarray: 139 | """Select an action from the input space.""" 140 | return self.worker.select_action(state) 141 | 142 | def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool, dict]: 143 | """Take an action and return the response of the env.""" 144 | return self.worker.step(action) 145 | 146 | def synchronize(self, new_state_dict: Dict[str, np.ndarray]): 147 | """Synchronize worker brain with learner brain.""" 148 | self.worker.synchronize(new_state_dict) 149 | 150 | @abstractmethod 151 | def collect_data(self) -> Dict[str, np.ndarray]: 152 | pass 153 | 154 | @abstractmethod 155 | def run(self): 156 | pass 157 | 158 | def preprocess_nstep(self, nstepqueue: Deque) -> Tuple[np.ndarray, ...]: 159 | """Return n-step transition with discounted reward.""" 160 | discounted_reward = 0 161 | _, _, _, last_state, done = nstepqueue[-1] 162 | for transition in list(reversed(nstepqueue)): 163 | state, action, reward, _, _ = transition 164 | discounted_reward = reward + self.hyper_params.gamma * discounted_reward 165 | nstep_data = (state, action, discounted_reward, last_state, done) 166 | 167 | return nstep_data 168 | --------------------------------------------------------------------------------
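Note on the n-step preprocessing above: DistributedWorkerWrapper.preprocess_nstep folds a queue of 1-step transitions into a single transition whose reward is the discounted sum of the queued rewards, keyed to the first state/action and the last next-state/done flag. A minimal standalone sketch of that computation, using hypothetical toy transitions and an illustrative gamma of 0.99 (not taken from any config in this repository):

from collections import deque

gamma = 0.99  # illustrative discount factor (assumption, not a repository config value)
nstepqueue = deque(
    [
        # (state, action, reward, next_state, done)
        ("s0", 0, 1.0, "s1", False),
        ("s1", 1, 2.0, "s2", False),
        ("s2", 0, 3.0, "s3", True),
    ]
)

# Accumulate the discounted return from the newest transition backwards,
# mirroring the loop in DistributedWorkerWrapper.preprocess_nstep.
discounted_reward = 0.0
_, _, _, last_state, done = nstepqueue[-1]
for state, action, reward, _, _ in reversed(nstepqueue):
    discounted_reward = reward + gamma * discounted_reward

# After the loop, state/action come from the oldest queued transition.
nstep_data = (state, action, discounted_reward, last_state, done)
print(nstep_data)  # ('s0', 0, 1.0 + 0.99 * (2.0 + 0.99 * 3.0), 's3', True)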