├── playground ├── __init__.py ├── tests │ ├── __init__.py │ ├── test_memory.py │ └── test_utils.py ├── utils │ ├── __init__.py │ ├── wrappers.py │ ├── misc.py │ └── tf_ops.py ├── configs │ ├── __init__.py │ ├── data │ │ ├── reinforce-cartpole-v1.json │ │ ├── ddpg-bipedalwalker-v2.json │ │ ├── dqn-conv-pong-v0.json │ │ ├── dqn-dense-cartpole-v1.json │ │ ├── actor-critic-cartpole-v1.json │ │ ├── ppo-lunarlander-v2.json │ │ ├── qlearning-cartpole-v1.json │ │ └── dqn-lstm-mspacman-ram-v0.json │ └── manager.py ├── policies │ ├── __init__.py │ ├── memory.py │ ├── qlearning.py │ ├── base.py │ ├── reinforce.py │ ├── actor_critic.py │ ├── ddpg.py │ ├── ppo.py │ └── dqn.py └── learn.py ├── examples ├── cartpole.gif ├── cartpole-v1-reinforce.png └── cartpole-v1-reinforce-monitor.png ├── requirements.txt ├── .gitignore ├── setup.py └── README.md /playground/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /playground/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /playground/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /playground/configs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /playground/tests/test_memory.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/cartpole.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Todo/deep-reinforcement-learning-gym/master/examples/cartpole.gif -------------------------------------------------------------------------------- /examples/cartpole-v1-reinforce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Todo/deep-reinforcement-learning-gym/master/examples/cartpole-v1-reinforce.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==1.5.3 2 | setuptools==39.0.1 3 | numpy==1.14.0 4 | pandas==0.22.0 5 | tensorflow==1.5.0 6 | click==6.7 7 | gym==0.10.5 8 | -------------------------------------------------------------------------------- /examples/cartpole-v1-reinforce-monitor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Todo/deep-reinforcement-learning-gym/master/examples/cartpole-v1-reinforce-monitor.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea 3 | .pytest_cache 4 | __pycache__ 5 | checkpoints/* 6 | logs/* 7 | tb/* 8 | figs/* 9 | **/*.pyc 10 | **/*.egg-info 11 | *~ 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from setuptools import 
setup, find_packages 3 | 4 | setup( 5 | name='playground', 6 | version='0.1', 7 | description='Aloha!', 8 | url='http://github.com/lilianweng/playground', 9 | author='Lilian Weng', 10 | author_email='lilian.wengweng@gmail.com', 11 | packages=find_packages(exclude=['checkpoints', 'logs']), 12 | ) 13 | -------------------------------------------------------------------------------- /playground/configs/data/reinforce-cartpole-v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "env_name": "CartPole-v1", 3 | "policy_name": "ReinforcePolicy", 4 | "policy_params": { 5 | "baseline": true, 6 | "layer_sizes": [ 7 | 32, 8 | 32 9 | ] 10 | }, 11 | "train_params": { 12 | "batch_size": 32, 13 | "lr": 0.001, 14 | "lr_decay": 0.998, 15 | "log_every_episode": 10, 16 | "n_episodes": 800 17 | } 18 | } -------------------------------------------------------------------------------- /playground/configs/data/ddpg-bipedalwalker-v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "env_name": "BipedalWalker-v2", 3 | "policy_name": "DDPGPolicy", 4 | "policy_params": { 5 | "gamma": 0.99, 6 | "actor_layers": [32, 32], 7 | "critic_layers": [64, 64], 8 | "deterministic": true 9 | }, 10 | "train_params": { 11 | "n_steps": 100000, 12 | "warmup_steps": 35000, 13 | "batch_size": 64, 14 | "lr_a": 0.005, 15 | "lr_c": 0.005, 16 | "epsilon": 0.35, 17 | "epsilon_final": 0.0, 18 | "tau": 0.001 19 | } 20 | } -------------------------------------------------------------------------------- /playground/policies/__init__.py: -------------------------------------------------------------------------------- 1 | from playground.policies.actor_critic import ActorCriticPolicy 2 | from playground.policies.ddpg import DDPGPolicy 3 | from playground.policies.dqn import DqnPolicy 4 | from playground.policies.ppo import PPOPolicy 5 | from playground.policies.qlearning import QlearningPolicy 6 | from playground.policies.reinforce import ReinforcePolicy 7 | 8 | ALL_POLICIES = [ 9 | ActorCriticPolicy, 10 | DDPGPolicy, 11 | DqnPolicy, 12 | PPOPolicy, 13 | QlearningPolicy, 14 | ReinforcePolicy 15 | ] 16 | -------------------------------------------------------------------------------- /playground/configs/data/dqn-conv-pong-v0.json: -------------------------------------------------------------------------------- 1 | { 2 | "env_name": "Pong-v0", 3 | "policy_name": "DqnPolicy", 4 | "policy_params": { 5 | "batch_size": 32, 6 | "double_q": true, 7 | "dueling": true, 8 | "layer_sizes": [32, 32], 9 | "model_type": "conv", 10 | "step_size": 1 11 | }, 12 | "train_params": { 13 | "lr": 0.001, 14 | "epsilon": 1.0, 15 | "epsilon_final": 0.02, 16 | "warmup_episodes": 450, 17 | "log_every_episode": 10, 18 | "n_episodes": 500, 19 | "target_update_every_step": 10 20 | } 21 | } -------------------------------------------------------------------------------- /playground/configs/data/dqn-dense-cartpole-v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "env_name": "CartPole-v1", 3 | "policy_name": "DqnPolicy", 4 | "policy_params": { 5 | "batch_size": 32, 6 | "layer_sizes": [32, 32], 7 | "double_q": true, 8 | "dueling": true, 9 | "model_type": "dense", 10 | "step_size": 1 11 | }, 12 | "train_params": { 13 | "lr": 0.001, 14 | "epsilon": 1.0, 15 | "epsilon_final": 0.02, 16 | "warmup_episodes": 450, 17 | "log_every_episode": 10, 18 | "n_episodes": 500, 19 | "target_update_every_step": 10 20 | } 21 | } 
-------------------------------------------------------------------------------- /playground/configs/data/actor-critic-cartpole-v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "env_name": "CartPole-v1", 3 | "policy_name": "ActorCriticPolicy", 4 | "policy_params": { 5 | "layer_sizes": [32], 6 | "deterministic": true 7 | }, 8 | "train_params": { 9 | "lr_a": 0.01, 10 | "lr_a_decay": 0.999, 11 | "lr_c": 0.01, 12 | "lr_c_decay": 0.999, 13 | "epsilon": 1.0, 14 | "epsilon_final": 0.05, 15 | "batch_size": 32, 16 | "n_episodes": 800, 17 | "annealing_episodes": 720, 18 | "log_every_episode": 10, 19 | "done_rewards": -100.0 20 | } 21 | } -------------------------------------------------------------------------------- /playground/configs/data/ppo-lunarlander-v2.json: -------------------------------------------------------------------------------- 1 | { 2 | "env_name": "LunarLander-v2", 3 | "policy_name": "PPOPolicy", 4 | "policy_params": { 5 | "gamma": 0.99, 6 | "lam": 0.95, 7 | "actor_layers": [64, 64], 8 | "critic_layers": [128, 64], 9 | "clip_norm": 0.5, 10 | "deterministic": true 11 | }, 12 | "train_params": { 13 | "lr_a": 0.002, 14 | "lr_c": 0.005, 15 | "batch_size": 128, 16 | "ratio_clip_range": 0.2, 17 | "ratio_clip_decay": false, 18 | "n_iterations": 100, 19 | "n_rollout_workers": 5, 20 | "train_epoches": 4, 21 | "log_every_iteration": 5 22 | } 23 | } -------------------------------------------------------------------------------- /playground/configs/data/qlearning-cartpole-v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "env_name": "CartPole-v0", 3 | "wrappers": [ 4 | [ 5 | "DiscretizedObservationWrapper", 6 | { 7 | "n_bins": 8, 8 | "low": [-2.4, -2.0, -0.42, -3.5], 9 | "high": [2.4, 2.0, 0.42, 3.5] 10 | }] 11 | ], 12 | "policy_name": "QlearningPolicy", 13 | "policy_params": { 14 | "gamma": 0.99 15 | }, 16 | "train_params": { 17 | "alpha": 0.5, 18 | "alpha_decay": 0.998, 19 | "epsilon": 1.0, 20 | "epsilon_final": 0.05, 21 | "n_episodes": 1000, 22 | "annealing_episodes": 800, 23 | "log_every_episode": 10 24 | } 25 | } -------------------------------------------------------------------------------- /playground/learn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import click 4 | from playground.configs.manager import ConfigManager 5 | 6 | 7 | @click.command() 8 | @click.argument('config_name') 9 | @click.option('-m', '--model-name', default=None) 10 | def run(config_name, model_name=None): 11 | cfg = ConfigManager.load(config_name) 12 | 13 | if model_name is None: 14 | model_name = '-'.join([ 15 | cfg.env_name.lower(), 16 | cfg.policy_name.replace('_', '-'), 17 | os.path.splitext(os.path.basename(config_name))[0] if config_name else 'default', 18 | str(int(time.time())) 19 | ]) 20 | 21 | model_name = model_name.lower() 22 | cfg.start_training(model_name) 23 | 24 | 25 | if __name__ == '__main__': 26 | run() 27 | -------------------------------------------------------------------------------- /playground/configs/data/dqn-lstm-mspacman-ram-v0.json: -------------------------------------------------------------------------------- 1 | { 2 | "env_name": "MsPacman-ram-v0", 3 | "policy_name": "DqnPolicy", 4 | "policy_params": { 5 | "batch_size": 32, 6 | "double_q": true, 7 | "dueling": true, 8 | "layer_sizes": [ 9 | 128, 10 | 128 11 | ], 12 | "model_type": "lstm", 13 | "model_params": { 14 | "lstm_layers": 1, 15 | "lstm_size": 256 
16 | }, 17 | "step_size": 16 18 | }, 19 | "train_params": { 20 | "lr": 0.001, 21 | "epsilon": 1.0, 22 | "epsilon_final": 0.02, 23 | "warmup_episodes": 450, 24 | "log_every_episode": 10, 25 | "n_episodes": 500, 26 | "target_update_every_step": 10 27 | } 28 | } -------------------------------------------------------------------------------- /playground/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from playground.utils.wrappers import DiscretizedObservationWrapper 3 | from playground.utils.misc import Config 4 | 5 | 6 | def test_digitized_observation_wrapper(): 7 | env = gym.make('MountainCar-v0') 8 | env = DiscretizedObservationWrapper(env) 9 | obs = env.reset() 10 | print(obs) 11 | 12 | 13 | def test_config_class(): 14 | class ParentConfig(Config): 15 | a = 1 16 | b = 2 17 | 18 | class ChildConfig(ParentConfig): 19 | x = 4 20 | y = 5 21 | z = 6 22 | 23 | class GrandChildConfig(ChildConfig): 24 | red = True 25 | blue = False 26 | 27 | config = GrandChildConfig(a=100, y=200, blue=True) 28 | assert config.b == 2 29 | assert config.blue == True 30 | assert config.as_dict() == dict(a=100, b=2, x=4, y=200, z=6, red=True, blue=True) 31 | -------------------------------------------------------------------------------- /playground/utils/wrappers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | 4 | from gym.spaces import Box, Discrete 5 | 6 | 7 | class DiscretizedObservationWrapper(gym.ObservationWrapper): 8 | def __init__(self, env, n_bins=10, low=None, high=None): 9 | super().__init__(env) 10 | assert isinstance(env.observation_space, Box) 11 | 12 | low = self.observation_space.low if low is None else low 13 | high = self.observation_space.high if high is None else high 14 | 15 | low = np.array(low) 16 | high = np.array(high) 17 | 18 | self.n_bins = n_bins 19 | self.val_bins = [np.linspace(l, h, n_bins + 1) for l, h in 20 | zip(low.flatten(), high.flatten())] 21 | self.ob_shape = self.observation_space.shape 22 | 23 | print("New ob space:", Discrete((n_bins + 1) ** len(low))) 24 | self.observation_space = Discrete((n_bins + 1) ** len(low)) 25 | 26 | def _convert_to_one_number(self, digits): 27 | return sum([d * ((self.n_bins + 1) ** i) for i, d in enumerate(digits)]) 28 | 29 | def observation(self, observation): 30 | digits = [np.digitize([x], bins)[0] 31 | for x, bins in zip(observation.flatten(), self.val_bins)] 32 | return self._convert_to_one_number(digits) 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Common Deep Reinforcement Learning Models (TensorFlow + OpenAI Gym) 2 | 3 | In this repo, I implemented several classic deep reinforcement learning models in TensorFlow and the OpenAI Gym environment. Please check the corresponding blog post: ["Implementing Deep Reinforcement Learning Models"](http://lilianweng.github.io/lil-log/2018/05/05/implementing-deep-reinforcement-learning-models.html) for more information. 4 | 5 | I will add more model implementations in the future. 6 | 7 | 8 | ### Setup 9 | 10 | (1) Make sure you have [Homebrew](https://docs.brew.sh/Installation) installed: 11 | ``` 12 | /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" 13 | ``` 14 | 15 | (2) Then set up virtualenv.
16 | 17 | ``` 18 | # Install python virtualenv 19 | brew install pyenv-virtualenv 20 | 21 | # Create a virtual environment of any name you like with Python 3.6.4 support 22 | pyenv virtualenv 3.6.4 workspace 23 | 24 | # Activate the virtualenv named “workspace” 25 | pyenv activate workspace 26 | ``` 27 | 28 | (3) Inside the virtualenv, install OpenAI Gym according to the [instructions](https://github.com/openai/gym#installation). For a minimal installation, run: 29 | ``` 30 | git clone https://github.com/openai/gym.git 31 | cd gym 32 | pip install -e . 33 | ``` 34 | If you are interested in playing with Atari games or other advanced packages in the gym environment, please follow the full gym [installation instructions](https://github.com/openai/gym#installation). 35 | 36 | 37 | (4) Clone the code repo and install the requirements. 38 | ``` 39 | git clone git@github.com:lilianweng/deep-reinforcement-learning-gym.git 40 | cd deep-reinforcement-learning-gym 41 | pip install -e . # install the “playground” project. 42 | pip install -r requirements.txt # install required packages. 43 | ``` 44 | 45 | 46 | ### Train Models 47 | 48 | The model configuration can be fully specified in a JSON file. I have a couple of example config files in `playground/configs/data/`. 49 | 50 | Start training a model as follows: 51 | 52 | ```bash 53 | cd playground 54 | python learn.py configs/data/reinforce-cartpole-v1.json 55 | ``` 56 | 57 | During training, three folders will be created in the root directory: `logs`, `checkpoints` and `figs`. Because the env is wrapped by `gym.wrappers.Monitor`, the gym training log is also written into `/tmp/`. Feel free to comment that out in `playground.configs.manager.ConfigManager` if you don't want it. 58 | 59 | Meanwhile, you can start TensorBoard: 60 | ```bash 61 | tensorboard --logdir=logs 62 | ``` 63 | 64 | Once the training is complete, two figures are generated in `figs/`.
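The `learn.py` command above is a thin wrapper around `playground.configs.manager.ConfigManager`, so the same training run can also be kicked off from Python. Here is a minimal sketch (the model name below is just an illustrative placeholder; run it from the `playground/` directory so the relative config path resolves):

```python
# Programmatic equivalent of `python learn.py configs/data/reinforce-cartpole-v1.json`.
from playground.configs.manager import ConfigManager

cfg = ConfigManager.load('configs/data/reinforce-cartpole-v1.json')  # parses the JSON config and builds the gym env
cfg.start_training('cartpole-v1-reinforce-demo')  # trains, checkpoints and plots under the repo root
```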
65 | 66 | ![results](examples/cartpole-v1-reinforce.png "Rewards in time") 67 | 68 | ![video](examples/cartpole.gif "How it looks like.") 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /playground/configs/manager.py: -------------------------------------------------------------------------------- 1 | import json 2 | import importlib 3 | import gym 4 | import os 5 | from gym.wrappers import Monitor 6 | from playground.utils.misc import plot_from_monitor_results 7 | 8 | 9 | def load_policy_class(policy_name): 10 | mod = importlib.import_module("playground.policies") 11 | policy_class = getattr(mod, policy_name) 12 | return policy_class 13 | 14 | 15 | def load_wrapper_class(wrapper_name): 16 | mod = importlib.import_module("playground.utils.wrappers") 17 | wrapper_class = getattr(mod, wrapper_name) 18 | return wrapper_class 19 | 20 | 21 | def apply_wrappers(env, list_of_wrappers): 22 | for name, params in list_of_wrappers: 23 | wrapper_class = load_wrapper_class(name) 24 | env = wrapper_class(env, **params) 25 | return env 26 | 27 | 28 | class ConfigManager: 29 | def __init__(self, env_name, policy_name, policy_params=None, train_params=None, 30 | wrappers=None): 31 | self.env_name = env_name 32 | self.policy_name = policy_name 33 | self.policy_params = policy_params or {} 34 | self.train_params = train_params or {} 35 | self.wrappers = wrappers or [] 36 | 37 | self.env = gym.make(self.env_name) 38 | self.env = apply_wrappers(self.env, self.wrappers) 39 | 40 | def to_json(self): 41 | return dict( 42 | env_name=self.env_name, 43 | wrappers=self.wrappers, 44 | policy_name=self.policy_name, 45 | policy_params=self.policy_params, 46 | train_params=self.train_params, 47 | ) 48 | 49 | @classmethod 50 | def load(cls, file_path): 51 | assert os.path.exists(file_path) 52 | return cls(**json.load(open(file_path))) 53 | 54 | def save(self, file_path): 55 | with open(file_path, 'w') as fin: 56 | json.dump(self.to_json(), fin, indent=4, sort_keys=True) 57 | 58 | def start_training(self, model_name): 59 | self.env.reset() 60 | env = Monitor(self.env, '/tmp/' + model_name, force=True) 61 | policy = load_policy_class(self.policy_name)( 62 | env, model_name, training=True, **self.policy_params) 63 | 64 | print("\n==================================================") 65 | print("Loaded gym.env:", self.env_name) 66 | print("Wrappers:", self.wrappers) 67 | print("Loaded policy:", policy.__class__) 68 | print("Policy params:", self.policy_params) 69 | print("Train params:", self.train_params) 70 | print("==================================================\n") 71 | 72 | policy.build() 73 | train_config = policy.TrainConfig(**self.train_params) 74 | policy.train(train_config) 75 | 76 | env.close() 77 | plot_from_monitor_results('/tmp/' + model_name, window=50) 78 | print("Training completed:", model_name) 79 | -------------------------------------------------------------------------------- /playground/policies/memory.py: -------------------------------------------------------------------------------- 1 | from collections import deque, namedtuple 2 | import numpy as np 3 | import itertools 4 | 5 | # This is the default buffer record nametuple type. 
6 | Transition = namedtuple('Transition', ['s', 'a', 'r', 's_next', 'done']) 7 | 8 | 9 | class ReplayMemory: 10 | def __init__(self, capacity=100000, replace=False, tuple_class=Transition): 11 | self.buffer = [] 12 | self.capacity = capacity 13 | self.replace = replace 14 | self.tuple_class = tuple_class 15 | self.fields = tuple_class._fields 16 | 17 | def add(self, record): 18 | """Any named tuple item.""" 19 | if isinstance(record, self.tuple_class): 20 | self.buffer.append(record) 21 | elif isinstance(record, list): 22 | self.buffer += record 23 | 24 | while self.capacity and self.size > self.capacity: 25 | self.buffer.pop(0) 26 | 27 | def _reformat(self, indices): 28 | # Reformat a list of Transition tuples for training. 29 | # indices: list 30 | return { 31 | field_name: np.array([getattr(self.buffer[i], field_name) for i in indices]) 32 | for field_name in self.fields 33 | } 34 | 35 | def sample(self, batch_size): 36 | assert len(self.buffer) >= batch_size 37 | idxs = np.random.choice(range(len(self.buffer)), size=batch_size, replace=self.replace) 38 | return self._reformat(idxs) 39 | 40 | def pop(self, batch_size): 41 | # Pop the first `batch_size` Transition items out. 42 | i = min(self.size, batch_size) 43 | batch = self._reformat(range(i)) 44 | self.buffer = self.buffer[i:] 45 | return batch 46 | 47 | def loop(self, batch_size, epoch=None): 48 | indices = [] 49 | ep = None 50 | for i in itertools.cycle(range(len(self.buffer))): 51 | indices.append(i) 52 | if i == 0: 53 | ep = 0 if ep is None else ep + 1 54 | if epoch is not None and ep == epoch: 55 | break 56 | 57 | if len(indices) == batch_size: 58 | yield self._reformat(indices) 59 | indices = [] 60 | 61 | @property 62 | def size(self): 63 | return len(self.buffer) 64 | 65 | 66 | class ReplayTrajMemory: 67 | def __init__(self, capacity=100000, step_size=16): 68 | self.buffer = deque(maxlen=capacity) 69 | self.step_size = step_size 70 | 71 | def add(self, traj): 72 | # traj (list) 73 | if len(traj) >= self.step_size: 74 | self.buffer.append(traj) 75 | 76 | def sample(self, batch_size): 77 | traj_idxs = np.random.choice(range(len(self.buffer)), size=batch_size, replace=True) 78 | batch_data = {field_name: [] for field_name in Transition._fields} 79 | 80 | for traj_idx in traj_idxs: 81 | i = np.random.randint(0, len(self.buffer[traj_idx]) + 1 - self.step_size) 82 | transitions = self.buffer[traj_idx][i: i + self.step_size] 83 | 84 | for field_name in Transition._fields: 85 | batch_data[field_name] += [getattr(t, field_name) for t in transitions] 86 | 87 | assert all(len(v) == batch_size * self.step_size for v in batch_data.values()) 88 | return {k: np.array(v) for k, v in batch_data.items()} 89 | 90 | @property 91 | def size(self): 92 | return len(self.buffer) 93 | 94 | @property 95 | def transition_size(self): 96 | return sum(map(len, self.buffer)) 97 | -------------------------------------------------------------------------------- /playground/utils/misc.py: -------------------------------------------------------------------------------- 1 | import matplotlib; matplotlib.use('Agg') 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import os 5 | import pandas as pd 6 | 7 | from gym.wrappers.monitor import load_results 8 | from copy import deepcopy 9 | 10 | REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) 11 | 12 | 13 | class Config: 14 | def __init__(self, **kwargs): 15 | # read parameters from parents, and children can override the values. 
16 | parents = [] 17 | queue = [self.__class__] 18 | while queue: 19 | parent = queue.pop() 20 | if issubclass(parent, Config) and parent is not Config: 21 | parents.append(parent) 22 | for p in reversed(parent.__bases__): 23 | queue.append(p) 24 | 25 | params = {} 26 | for cfg in reversed(parents): 27 | params.update(cfg.__dict__) 28 | 29 | # Set all instance variables based on kwargs and default class variables. 30 | for key, value in params.items(): 31 | if key.startswith('__'): 32 | continue 33 | 34 | if key in kwargs: 35 | # override default with provided parameter 36 | value = kwargs[key] 37 | else: 38 | # Need to make copies of class variables so that they aren't changed by instances 39 | value = deepcopy(value) 40 | 41 | self.__dict__[key] = value 42 | 43 | def __setattr__(self, name, value): 44 | if name not in self.__dict__: 45 | raise AttributeError(f"{self.__class__.__name__} does not have attribute {name}") 46 | self.__dict__[name] = value 47 | 48 | def __getattr__(self, name): 49 | # Raise an error when accessing a missing attribute. 50 | if name not in self.__dict__: 51 | raise AttributeError(f"{self.__class__.__name__} does not have attribute {name}") 52 | return self.__dict__[name] 53 | 54 | def as_dict(self): 55 | return deepcopy(self.__dict__) 56 | 57 | def copy(self): 58 | return self.__class__(**self.as_dict()) 59 | 60 | def get(self, name, default): 61 | return self.as_dict().get(name, default) 62 | 63 | def __repr__(self): 64 | return super().__repr__() + "\n" + str(self.as_dict()) 65 | 66 | 67 | def plot_learning_curve(filename, value_dict, xlabel='step'): 68 | # Plot step vs the mean(last 50 episodes' rewards) 69 | fig = plt.figure(figsize=(12, 4 * len(value_dict))) 70 | 71 | for i, (key, values) in enumerate(value_dict.items()): 72 | ax = fig.add_subplot(len(value_dict), 1, i + 1) 73 | ax.plot(range(len(values)), values) 74 | ax.set_xlabel(xlabel) 75 | ax.set_ylabel(key) 76 | ax.grid('k--', alpha=0.6) 77 | 78 | plt.tight_layout() 79 | os.makedirs(os.path.join(REPO_ROOT, 'figs'), exist_ok=True) 80 | plt.savefig(os.path.join(REPO_ROOT, 'figs', filename)) 81 | 82 | 83 | def plot_from_monitor_results(monitor_dir, window=10): 84 | assert os.path.exists(monitor_dir) 85 | if monitor_dir.endswith('/'): 86 | monitor_dir = monitor_dir[:-1] 87 | 88 | data = load_results(monitor_dir) 89 | n_episodes = len(data['episode_lengths']) 90 | assert n_episodes > 0 91 | 92 | fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8), tight_layout=True, sharex=True) 93 | 94 | ax1.plot(range(n_episodes), pd.rolling_mean(np.array(data['episode_lengths']), window)) 95 | ax1.set_xlabel('episode') 96 | ax1.set_ylabel('episode length') 97 | ax1.grid('k--', alpha=0.6) 98 | 99 | ax2.plot(range(n_episodes), pd.rolling_mean(np.array(data['episode_rewards']), window)) 100 | ax2.set_xlabel('episode') 101 | ax2.set_ylabel('episode reward') 102 | ax2.grid('k--', alpha=0.6) 103 | 104 | os.makedirs(os.path.join(REPO_ROOT, 'figs'), exist_ok=True) 105 | plt.savefig(os.path.join(REPO_ROOT, 'figs', os.path.basename(monitor_dir) + '-monitor')) 106 | -------------------------------------------------------------------------------- /playground/policies/qlearning.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import numpy as np 4 | from gym.spaces import Discrete 5 | 6 | from playground.policies.base import Policy, TrainConfig 7 | from playground.policies.memory import Transition 8 | from playground.utils.misc import plot_learning_curve 9 | 10 |
11 | class QlearningPolicy(Policy): 12 | def __init__(self, env, name, training=True, gamma=0.99, Q=None): 13 | """ 14 | This Q-learning implementation only works on an environment with discrete 15 | action and observation space. We use a dict to memorize the Q-value. 16 | 17 | 1. We start from state s and 18 | 19 | 2. At state s, with action a, we observe a reward r(s, a) and get into the 20 | next state s'. Update Q function: 21 | 22 | Q(s, a) += learning_rate * (r(s, a) + gamma * max Q(s', .) - Q(s, a)) 23 | 24 | Repeat this process. 25 | """ 26 | super().__init__(env, name, gamma=gamma, training=training) 27 | assert isinstance(env.action_space, Discrete) 28 | assert isinstance(env.observation_space, Discrete) 29 | 30 | self.Q = Q 31 | self.actions = range(self.env.action_space.n) 32 | 33 | def build(self): 34 | self.Q = defaultdict(float) 35 | 36 | def act(self, state, eps=0.1): 37 | """Pick best action according to Q values ~ argmax_a Q(s, a). 38 | Exploration is forced by epsilon-greedy. 39 | """ 40 | if self.training and eps > 0. and np.random.rand() < eps: 41 | return self.env.action_space.sample() 42 | 43 | # Pick the action with highest Q value. 44 | qvals = {a: self.Q[state, a] for a in self.actions} 45 | max_q = max(qvals.values()) 46 | 47 | # In case multiple actions have the same maximum Q value. 48 | actions_with_max_q = [a for a, q in qvals.items() if q == max_q] 49 | return np.random.choice(actions_with_max_q) 50 | 51 | def _update_q_value(self, tr, alpha): 52 | """ 53 | Q(s, a) += alpha * (r(s, a) + gamma * max Q(s', .) - Q(s, a)) 54 | """ 55 | max_q_next = max([self.Q[tr.s_next, a] for a in self.actions]) 56 | # We do not include the value of the next state if terminated. 57 | self.Q[tr.s, tr.a] += alpha * ( 58 | tr.r + self.gamma * max_q_next * (1.0 - tr.done) - self.Q[tr.s, tr.a] 59 | ) 60 | 61 | class TrainConfig(TrainConfig): 62 | alpha = 0.5 63 | alpha_decay = 0.998 64 | epsilon = 1.0 65 | epsilon_final = 0.05 66 | n_episodes = 1000 67 | warmup_episodes = 800 68 | log_every_episode = 10 69 | 70 | def train(self, config: TrainConfig): 71 | reward_history = [] 72 | reward_averaged = [] 73 | step = 0 74 | alpha = config.alpha 75 | eps = config.epsilon 76 | 77 | warmup_episodes = config.warmup_episodes or config.n_episodes 78 | eps_drop = (config.epsilon - config.epsilon_final) / warmup_episodes 79 | 80 | for n_episode in range(config.n_episodes): 81 | ob = self.env.reset() 82 | done = False 83 | reward = 0. 84 | 85 | while not done: 86 | a = self.act(ob, eps) 87 | new_ob, r, done, info = self.env.step(a) 88 | if done and config.done_reward is not None: 89 | r += config.done_reward 90 | 91 | self._update_q_value(Transition(ob, a, r, new_ob, done), alpha) 92 | 93 | step += 1 94 | reward += r 95 | ob = new_ob 96 | 97 | reward_history.append(reward) 98 | reward_averaged.append(np.average(reward_history[-50:])) 99 | 100 | alpha *= config.alpha_decay 101 | if eps > config.epsilon_final: 102 | eps = max(config.epsilon_final, eps - eps_drop) 103 | 104 | if config.log_every_episode is not None and n_episode % config.log_every_episode == 0: 105 | # Report the performance every 100 steps 106 | print("[episode:{}|step:{}] best:{} avg:{:.4f} alpha:{:.4f} eps:{:.4f} Qsize:{}".format( 107 | n_episode, step, np.max(reward_history), 108 | np.mean(reward_history[-10:]), alpha, eps, len(self.Q))) 109 | 110 | print("[FINAL] Num. 
episodes: {}, Max reward: {}, Average reward: {}".format( 111 | len(reward_history), np.max(reward_history), np.mean(reward_history))) 112 | 113 | data_dict = {'reward': reward_history, 'reward_avg50': reward_averaged} 114 | plot_learning_curve(self.name, data_dict, xlabel='episode') 115 | -------------------------------------------------------------------------------- /playground/policies/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from gym.spaces import Box, Discrete 6 | from gym.utils import colorize 7 | 8 | from playground.utils.misc import Config 9 | from playground.utils.misc import REPO_ROOT 10 | 11 | 12 | class TrainConfig(Config): 13 | lr = 0.001 14 | n_steps = 10000 15 | warmup_steps = 5000 16 | batch_size = 64 17 | log_every_step = 1000 18 | 19 | # give an extra bonus if done; only needed for certain tasks. 20 | done_reward = None 21 | 22 | 23 | class Policy: 24 | def __init__(self, env, name, training=True, gamma=0.99, deterministic=False): 25 | self.env = env 26 | self.gamma = gamma 27 | self.training = training 28 | self.name = name 29 | 30 | if deterministic: 31 | np.random.seed(1) 32 | tf.set_random_seed(1) 33 | 34 | @property 35 | def act_size(self): 36 | # number of options of an action; this only makes sense for discrete actions. 37 | if isinstance(self.env.action_space, Discrete): 38 | return self.env.action_space.n 39 | else: 40 | return None 41 | 42 | @property 43 | def act_dim(self): 44 | # dimension of an action; this only makes sense for continuous actions. 45 | if isinstance(self.env.action_space, Box): 46 | return list(self.env.action_space.shape) 47 | else: 48 | return [] 49 | 50 | @property 51 | def state_dim(self): 52 | # dimension of a state. 53 | return list(self.env.observation_space.shape) 54 | 55 | def obs_to_inputs(self, ob): 56 | return ob.flatten() 57 | 58 | def act(self, state, **kwargs): 59 | pass 60 | 61 | def build(self): 62 | pass 63 | 64 | def train(self, *args, **kwargs): 65 | pass 66 | 67 | def evaluate(self, n_episodes): 68 | reward_history = [] 69 | reward = 0. 70 | 71 | for i in range(n_episodes): 72 | ob = self.env.reset() 73 | done = False 74 | while not done: 75 | a = self.act(ob) 76 | new_ob, r, done, _ = self.env.step(a) 77 | self.env.render() 78 | reward += r 79 | ob = new_ob 80 | 81 | reward_history.append(reward) 82 | reward = 0. 83 | 84 | print("Avg. reward over {} episodes: {:.4f}".format(n_episodes, np.mean(reward_history))) 85 | 86 | 87 | class BaseModelMixin: 88 | """Abstract object representing an tensorflow model that can be easily saved/loaded. 
89 | Modified based on https://github.com/devsisters/DQN-tensorflow/blob/master/dqn/base.py 90 | """ 91 | 92 | def __init__(self, model_name, tf_sess_config=None): 93 | self._saver = None 94 | self._writer = None 95 | self._model_name = model_name 96 | self._sess = None 97 | 98 | if tf_sess_config is None: 99 | tf_sess_config = { 100 | 'allow_soft_placement': True, 101 | 'intra_op_parallelism_threads': 8, 102 | 'inter_op_parallelism_threads': 4, 103 | } 104 | self.tf_sess_config = tf_sess_config 105 | 106 | def scope_vars(self, scope, only_trainable=True): 107 | collection = tf.GraphKeys.TRAINABLE_VARIABLES if only_trainable else tf.GraphKeys.VARIABLES 108 | variables = tf.get_collection(collection, scope=scope) 109 | assert len(variables) > 0 110 | print(f"Variables in scope '{scope}':") 111 | for v in variables: 112 | print("\t" + str(v)) 113 | return variables 114 | 115 | def get_variable_values(self): 116 | t_vars = tf.trainable_variables() 117 | vals = self.sess.run(t_vars) 118 | return {v.name: value for v, value in zip(t_vars, vals)} 119 | 120 | def save_checkpoint(self, step=None): 121 | print(colorize(" [*] Saving checkpoints...", "green")) 122 | ckpt_file = os.path.join(self.checkpoint_dir, self.model_name) 123 | self.saver.save(self.sess, ckpt_file, global_step=step) 124 | 125 | def load_checkpoint(self): 126 | print(colorize(" [*] Loading checkpoints...", "green")) 127 | ckpt_path = tf.train.latest_checkpoint(self.checkpoint_dir) 128 | print(self.checkpoint_dir) 129 | print("ckpt_path:", ckpt_path) 130 | 131 | if ckpt_path: 132 | # self._saver = tf.train.import_meta_graph(ckpt_path + '.meta') 133 | self.saver.restore(self.sess, ckpt_path) 134 | print(colorize(" [*] Load SUCCESS: %s" % ckpt_path, "green")) 135 | return True 136 | else: 137 | print(colorize(" [!] Load FAILED: %s" % self.checkpoint_dir, "red")) 138 | return False 139 | 140 | def _get_dir(self, dir_name): 141 | path = os.path.join(REPO_ROOT, dir_name, self.model_name) 142 | os.makedirs(path, exist_ok=True) 143 | return path 144 | 145 | @property 146 | def log_dir(self): 147 | return self._get_dir('logs') 148 | 149 | @property 150 | def checkpoint_dir(self): 151 | return self._get_dir('checkpoints') 152 | 153 | @property 154 | def model_dir(self): 155 | return self._get_dir('models') 156 | 157 | @property 158 | def tb_dir(self): 159 | # tensorboard 160 | return self._get_dir('tb') 161 | 162 | @property 163 | def model_name(self): 164 | assert self._model_name, "Not a valid model name." 
165 | return self._model_name 166 | 167 | @property 168 | def saver(self): 169 | if self._saver is None: 170 | self._saver = tf.train.Saver(max_to_keep=5) 171 | return self._saver 172 | 173 | @property 174 | def writer(self): 175 | if self._writer is None: 176 | self._writer = tf.summary.FileWriter(self.tb_dir, self.sess.graph) 177 | return self._writer 178 | 179 | @property 180 | def sess(self): 181 | if self._sess is None: 182 | config = tf.ConfigProto(**self.tf_sess_config) 183 | self._sess = tf.Session(config=config) 184 | 185 | return self._sess 186 | -------------------------------------------------------------------------------- /playground/policies/reinforce.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from playground.policies.base import BaseModelMixin, Policy, TrainConfig 4 | from playground.utils.misc import plot_learning_curve 5 | from playground.utils.tf_ops import dense_nn 6 | 7 | 8 | class ReinforcePolicy(Policy, BaseModelMixin): 9 | def __init__(self, env, name, training=True, gamma=0.99, 10 | layer_sizes=[32, 32], baseline=False): 11 | Policy.__init__(self, env, name, training=training, gamma=gamma) 12 | BaseModelMixin.__init__(self, name) 13 | 14 | self.layer_sizes = layer_sizes 15 | self.baseline = baseline 16 | 17 | def act(self, state, **kwargs): 18 | return self.sess.run(self.sampled_actions, {self.s: [state]}) 19 | 20 | def build(self): 21 | self.lr = tf.placeholder(tf.float32, shape=None, name='learning_rate') 22 | 23 | # Inputs 24 | self.s = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='state') 25 | self.a = tf.placeholder(tf.int32, shape=(None,), name='action') 26 | self.returns = tf.placeholder(tf.float32, shape=(None,), name='return') 27 | 28 | # Build network 29 | self.pi = dense_nn(self.s, self.layer_sizes + [self.act_size], name='pi_network') 30 | self.sampled_actions = tf.squeeze(tf.multinomial(self.pi, 1)) 31 | self.pi_vars = self.scope_vars('pi_network') 32 | 33 | if self.baseline: 34 | # State value estimation as the baseline 35 | self.v = dense_nn(self.s, self.layer_sizes + [1], name='v_network') 36 | self.target = self.returns - self.v # advantage 37 | 38 | with tf.variable_scope('v_optimize'): 39 | self.loss_v = tf.reduce_mean(tf.squared_difference(self.v, self.returns)) 40 | self.optim_v = tf.train.AdamOptimizer(self.lr).minimize(self.loss_v, name='adam_optim_v') 41 | else: 42 | self.target = tf.identity(self.returns) 43 | 44 | with tf.variable_scope('pi_optimize'): 45 | self.loss_pi = tf.reduce_mean( 46 | tf.stop_gradient(self.target) * tf.nn.sparse_softmax_cross_entropy_with_logits( 47 | logits=self.pi, labels=self.a), name='loss_pi') 48 | # self.optim_pi = tf.train.AdamOptimizer(self.lr) 49 | # self.grads_pi = self.optim_pi.compute_gradients(self.loss_pi, self.pi_vars) 50 | # self.train_pi_op = self.optim_pi.apply_gradients(self.grads_pi) 51 | self.optim_pi = tf.train.AdamOptimizer(self.lr).minimize(self.loss_pi, name='adam_optim_pi') 52 | 53 | with tf.variable_scope('summary'): 54 | self.loss_pi_summ = tf.summary.scalar('loss_pi', self.loss_pi) 55 | 56 | self.ep_reward = tf.placeholder(tf.float32, name='episode_reward') 57 | self.ep_reward_summ = tf.summary.scalar('episode_reward', self.ep_reward) 58 | summ_list = [self.loss_pi_summ, self.ep_reward_summ] 59 | 60 | if self.baseline: 61 | self.loss_v_summ = tf.summary.scalar('loss_v', self.loss_v) 62 | summ_list.append(self.loss_v_summ) 63 | 64 | self.merged_summary = 
tf.summary.merge(summ_list) 65 | 66 | if self.baseline: 67 | self.train_ops = [self.optim_pi, self.optim_v] 68 | else: 69 | self.train_ops = [self.optim_pi] 70 | 71 | self.sess.run(tf.global_variables_initializer()) 72 | 73 | class TrainConfig(TrainConfig): 74 | lr = 0.001 75 | lr_decay = 0.999 76 | batch_size = 32 77 | n_episodes = 800 78 | log_every_episode = 10 79 | 80 | def train(self, config: TrainConfig): 81 | step = 0 82 | episode_reward = 0. 83 | reward_history = [] 84 | reward_averaged = [] 85 | 86 | lr = config.lr 87 | 88 | for n_episode in range(config.n_episodes): 89 | ob = self.env.reset() 90 | done = False 91 | 92 | obs = [] 93 | actions = [] 94 | rewards = [] 95 | returns = [] 96 | 97 | while not done: 98 | a = self.act(ob) 99 | new_ob, r, done, info = self.env.step(a) 100 | step += 1 101 | episode_reward += r 102 | 103 | obs.append(self.obs_to_inputs(ob)) 104 | actions.append(a) 105 | rewards.append(r) 106 | ob = new_ob 107 | 108 | # One trajectory is complete! 109 | reward_history.append(episode_reward) 110 | reward_averaged.append(np.mean(reward_history[-10:])) 111 | episode_reward = 0. 112 | lr *= config.lr_decay 113 | 114 | # Estimate returns backwards. 115 | return_so_far = 0.0 116 | for r in rewards[::-1]: 117 | return_so_far = self.gamma * return_so_far + r 118 | returns.append(return_so_far) 119 | 120 | returns = returns[::-1] 121 | 122 | _, summ_str = self.sess.run( 123 | [self.train_ops, self.merged_summary], feed_dict={ 124 | self.lr: lr, 125 | self.s: np.array(obs), 126 | self.a: np.array(actions), 127 | self.returns: np.array(returns), 128 | self.ep_reward: reward_history[-1], 129 | }) 130 | self.writer.add_summary(summ_str, step) 131 | 132 | if reward_history and config.log_every_episode and n_episode % config.log_every_episode == 0: 133 | # Report the performance every `every_step` steps 134 | print("[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}".format( 135 | n_episode, step, np.max(reward_history), np.mean(reward_history[-10:]), 136 | reward_history[-5:], lr, 137 | )) 138 | # self.save_checkpoint(step=step) 139 | 140 | self.save_checkpoint(step=step) 141 | 142 | print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format( 143 | len(reward_history), np.max(reward_history), np.mean(reward_history))) 144 | 145 | data_dict = { 146 | 'reward': reward_history, 147 | 'reward_smooth10': reward_averaged, 148 | } 149 | plot_learning_curve(self.model_name, data_dict, xlabel='episode') 150 | -------------------------------------------------------------------------------- /playground/utils/tf_ops.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from gym.utils import colorize 4 | 5 | 6 | def dense_nn(inputs, layers_sizes, name="mlp", reuse=False, output_fn=None, dropout_keep_prob=None, 7 | batch_norm=False, training=True): 8 | print(colorize("Building mlp {} | sizes: {}".format( 9 | name, [inputs.shape[0]] + layers_sizes), "green")) 10 | 11 | with tf.variable_scope(name, reuse=reuse): 12 | out = inputs 13 | for i, size in enumerate(layers_sizes): 14 | print("Layer:", name + '_l' + str(i), size) 15 | if i > 0 and dropout_keep_prob is not None and training: 16 | # No dropout on the input layer. 17 | out = tf.nn.dropout(out, dropout_keep_prob) 18 | 19 | out = tf.layers.dense( 20 | out, 21 | size, 22 | # Add relu activation only for internal layers. 
23 | activation=tf.nn.relu if i < len(layers_sizes) - 1 else None, 24 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 25 | name=name + '_l' + str(i), 26 | reuse=reuse 27 | ) 28 | 29 | if batch_norm: 30 | out = tf.layers.batch_normalization(out, training=training) 31 | 32 | if output_fn: 33 | out = output_fn(out) 34 | 35 | return out 36 | 37 | 38 | def conv2d_net(inputs, layers_sizes, name="conv2d", conv_layers=2, with_pooling=True, 39 | dropout_keep_prob=None, training=True): 40 | print(colorize("Building conv net " + name, "green")) 41 | print("inputs.shape =", inputs.shape) 42 | 43 | with tf.variable_scope(name): 44 | for i in range(conv_layers): 45 | # Apply convolution computation using a kernel of size (5, 5) over the image 46 | # inputs with strides (2, 2) and 'valid' padding. 47 | # For example: 48 | # i = , k = 5, s = 2, p = k // 2 = 2 49 | # o = (i + 2p - k) // 2 + 1 = (i - 1) // 2 + 1 50 | # Read more: https://arxiv.org/pdf/1603.07285.pdf 51 | # https://github.com/vdumoulin/conv_arithmetic 52 | # The output tensor of shape ( 53 | # batch_size, 54 | # (input_image_height - 1) // 2 + 1, 55 | # (input_image_width - 1) // 2 + 1, 56 | # output_dim, 57 | # ). 58 | inputs = tf.layers.conv2d(inputs, 32, [5, 5], strides=[2, 2], name='conv' + str(i)) 59 | print('conv' + str(i) + '.shape =', inputs.shape) 60 | 61 | if with_pooling: 62 | inputs = tf.layers.max_pooling2d(inputs, [2, 2], 2, name='pool' + str(i)) 63 | print('pool' + str(i) + '.shape =', inputs.shape) 64 | 65 | flatten = tf.reshape(inputs, [-1, np.prod(inputs.shape.as_list()[1:])], name='flatten') 66 | outputs = dense_nn(flatten, layers_sizes, name='fc', dropout_keep_prob=dropout_keep_prob) 67 | 68 | print("flatten.shape =", flatten.shape) 69 | print("outputs.shape =", outputs.shape) 70 | 71 | return outputs 72 | 73 | 74 | def alexnet(inputs, output_size, training=True, name='alexnet', dropout_keep_prob=0.5): 75 | """alex net v2 76 | 77 | Described in: http://arxiv.org/pdf/1404.5997v2.pdf 78 | Parameters from: 79 | github.com/akrizhevsky/cuda-convnet2/blob/master/layers/ 80 | layers-imagenet-1gpu.cfg 81 | 82 | Refer to: https://github.com/tensorflow/models/blob/master/research/slim/nets/alexnet.py 83 | """ 84 | trunc_normal = lambda stddev: tf.truncated_normal_initializer(0.0, stddev) 85 | 86 | with tf.variable_scope(name): 87 | net = tf.layers.conv2d(inputs, 64, [11, 11], 4, padding='valid', name='conv1') 88 | net = tf.layers.max_pooling2d(net, [3, 3], 2, name='pool1') 89 | net = tf.layers.conv2d(net, 192, [5, 5], name='conv2') 90 | net = tf.layers.max_pooling2d(net, [3, 3], 2, name='pool2') 91 | net = tf.layers.conv2d(net, 384, [3, 3], name='conv3') 92 | net = tf.layers.conv2d(net, 384, [3, 3], name='conv4') 93 | net = tf.layers.conv2d(net, 256, [3, 3], name='conv5') 94 | net = tf.layers.max_pooling2d(net, [3, 3], 2, name='pool5') 95 | 96 | # Use conv2d instead of fully_connected layers. 
97 | net = tf.layers.conv2d(net, 4096, [5, 5], padding='valid', name='fc6', 98 | kernel_initializer=trunc_normal(0.005), 99 | bias_initializer=tf.constant_initializer(0.1)) 100 | 101 | net = tf.layers.dropout(net, dropout_keep_prob, training=training, name='dropout6') 102 | net = tf.layers.conv2d(net, 4096, [1, 1], name='fc7') 103 | 104 | if output_size: 105 | net = tf.layers.dropout(net, dropout_keep_prob, training=training, name='dropout7') 106 | net = tf.layers.conv2d(net, output_size, [1, 1], name='fc8') 107 | 108 | return net 109 | 110 | 111 | def lstm_net(inputs, layers_sizes, name='lstm', step_size=16, lstm_layers=1, lstm_size=256, 112 | pre_lstm_dense_layer=None, dropout_keep_prob=None, training=True): 113 | """inputs = (batch_size * step_size, *observation_size) 114 | """ 115 | print(colorize("Building lstm net " + name, "green")) 116 | print("inputs.shape =", inputs.shape) 117 | 118 | state_size = inputs.shape.as_list()[1] 119 | inputs = tf.reshape(inputs, [-1, step_size, state_size]) 120 | print("reshaped inputs.shape =", inputs.shape) 121 | 122 | def _make_cell(): 123 | cell = tf.nn.rnn_cell.LSTMCell(lstm_size, state_is_tuple=True, reuse=not training) 124 | if training and dropout_keep_prob: 125 | cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=dropout_keep_prob) 126 | return cell 127 | 128 | with tf.variable_scope(name): 129 | 130 | if pre_lstm_dense_layer: 131 | inputs = tf.nn.relu(dense_nn(inputs, [pre_lstm_dense_layer], name='pre_lstm')) 132 | 133 | with tf.variable_scope('lstm_cells'): 134 | # Before transpose, inputs.get_shape() = (batch_size, num_steps, lstm_size) 135 | # After transpose, inputs.get_shape() = (num_steps, batch_size, lstm_size) 136 | lstm_inputs = tf.transpose(inputs, [1, 0, 2]) 137 | 138 | cell = tf.contrib.rnn.MultiRNNCell([ 139 | _make_cell() for _ in range(lstm_layers)], state_is_tuple=True) 140 | lstm_outputs, lstm_states = tf.nn.dynamic_rnn(cell, lstm_inputs, dtype=tf.float32) 141 | 142 | # transpose back. 143 | lstm_outputs = tf.transpose(lstm_outputs, [1, 0, 2]) 144 | 145 | print("cell =", cell) 146 | print("lstm_states =", lstm_states) 147 | print("lstm_outputs.shape =", lstm_outputs.shape) 148 | 149 | outputs = dense_nn(lstm_outputs, layers_sizes, name="outputs") 150 | print("outputs.shape =", outputs.shape) 151 | 152 | outputs = tf.reshape(outputs, [-1, layers_sizes[-1]]) 153 | print("reshaped outputs.shape =", outputs.shape) 154 | return outputs 155 | -------------------------------------------------------------------------------- /playground/policies/actor_critic.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from gym.spaces import Discrete 6 | 7 | from playground.policies.base import BaseModelMixin, Config, Policy 8 | from playground.policies.memory import ReplayMemory, Transition 9 | from playground.utils.misc import plot_learning_curve 10 | from playground.utils.tf_ops import dense_nn 11 | 12 | 13 | class ActorCriticPolicy(Policy, BaseModelMixin): 14 | 15 | def __init__(self, env, name, training=True, gamma=0.9, layer_sizes=None, clip_norm=None, **kwargs): 16 | Policy.__init__(self, env, name, training=training, gamma=gamma, **kwargs) 17 | BaseModelMixin.__init__(self, name) 18 | 19 | assert isinstance(self.env.action_space, Discrete), \ 20 | "Current ActorCriticPolicy implementation only works for discrete action space." 
21 | 22 | self.layer_sizes = [64] if layer_sizes is None else layer_sizes 23 | self.clip_norm = clip_norm 24 | 25 | def act(self, state, eps=0.1): 26 | # Discrete actions 27 | if self.training and np.random.random() < eps: 28 | return self.env.action_space.sample() 29 | 30 | # return self.sess.run(self.sampled_actions, {self.states: [state]}) 31 | proba = self.sess.run(self.actor_proba, {self.s: [state]})[0] 32 | return max(range(self.act_size), key=lambda i: proba[i]) 33 | 34 | def _build_networks(self): 35 | # Define input placeholders 36 | self.s = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='state') 37 | self.a = tf.placeholder(tf.int32, shape=(None,), name='action') 38 | self.s_next = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='next_state') 39 | self.r = tf.placeholder(tf.float32, shape=(None,), name='reward') 40 | self.done = tf.placeholder(tf.float32, shape=(None,), name='done_flag') 41 | 42 | # Actor: action probabilities 43 | self.actor = dense_nn(self.s, self.layer_sizes + [self.act_size], name='actor') 44 | self.sampled_actions = tf.squeeze(tf.multinomial(self.actor, 1)) 45 | self.actor_proba = tf.nn.softmax(self.actor) 46 | self.actor_vars = self.scope_vars('actor') 47 | 48 | # Critic: action value (V value) 49 | self.critic = dense_nn(self.s, self.layer_sizes + [1], name='critic') 50 | self.critic_next = dense_nn(self.s_next, self.layer_sizes + [1], name='critic', reuse=True) 51 | self.critic_vars = self.scope_vars('critic') 52 | 53 | # TD target 54 | self.td_target = self.r + self.gamma * tf.squeeze(self.critic_next) * (1.0 - self.done) 55 | self.td_error = self.td_target - tf.squeeze(self.critic) 56 | 57 | def _build_train_ops(self): 58 | self.lr_c = tf.placeholder(tf.float32, shape=None, name='learning_rate_c') 59 | self.lr_a = tf.placeholder(tf.float32, shape=None, name='learning_rate_a') 60 | 61 | with tf.variable_scope('critic_train'): 62 | # self.reg_c = tf.reduce_mean([tf.nn.l2_loss(x) for x in self.critic_vars]) 63 | self.loss_c = tf.reduce_mean(tf.square(self.td_error)) # + 0.001 * self.reg_c 64 | self.optim_c = tf.train.AdamOptimizer(self.lr_c) 65 | self.grads_c = self.optim_c.compute_gradients(self.loss_c, self.critic_vars) 66 | if self.clip_norm: 67 | self.grads_c = [(tf.clip_by_norm(grad, self.clip_norm), var) for grad, var in self.grads_c] 68 | 69 | self.train_op_c = self.optim_c.apply_gradients(self.grads_c) 70 | 71 | with tf.variable_scope('actor_train'): 72 | # self.reg_a = tf.reduce_mean([tf.nn.l2_loss(x) for x in self.actor_vars]) 73 | # self.entropy_a =- tf.reduce_sum(self.actor * tf.log(self.actor)) 74 | self.loss_a = tf.reduce_mean( 75 | tf.stop_gradient(self.td_error) * tf.nn.sparse_softmax_cross_entropy_with_logits( 76 | logits=self.actor, labels=self.a), name='loss_actor') # + 0.001 * self.reg_a 77 | self.optim_a = tf.train.AdamOptimizer(self.lr_a) 78 | self.grads_a = self.optim_a.compute_gradients(self.loss_a, self.actor_vars) 79 | if self.clip_norm: 80 | self.grads_a = [(tf.clip_by_norm(grad, self.clip_norm), var) for grad, var in self.grads_a] 81 | 82 | self.train_op_a = self.optim_a.apply_gradients(self.grads_a) 83 | 84 | with tf.variable_scope('summary'): 85 | self.ep_reward = tf.placeholder(tf.float32, name='episode_reward') 86 | self.summary = [ 87 | tf.summary.scalar('loss/critic', self.loss_c), 88 | tf.summary.scalar('loss/actor', self.loss_a), 89 | tf.summary.scalar('episode_reward', self.ep_reward) 90 | ] 91 | self.summary += [tf.summary.scalar('grads/a_' + var.name, tf.norm(grad)) for 92 | grad, var 
in self.grads_a if grad is not None] 93 | self.summary += [tf.summary.scalar('grads/c_' + var.name, tf.norm(grad)) for 94 | grad, var in self.grads_c if grad is not None] 95 | self.merged_summary = tf.summary.merge_all(key=tf.GraphKeys.SUMMARIES) 96 | 97 | self.train_ops = [self.train_op_a, self.train_op_c] 98 | 99 | self.sess.run(tf.global_variables_initializer()) 100 | 101 | def build(self): 102 | self._build_networks() 103 | self._build_train_ops() 104 | 105 | class TrainConfig(Config): 106 | lr_a = 0.02 107 | lr_a_decay = 0.995 108 | lr_c = 0.01 109 | lr_c_decay = 0.995 110 | batch_size = 32 111 | n_episodes = 800 112 | warmup_episodes = 720 113 | log_every_episode = 10 114 | done_rewards = -100 115 | # for epsilon-greedy exploration 116 | epsilon = 1.0 117 | epsilon_final = 0.05 118 | 119 | def train(self, config: TrainConfig): 120 | buffer = ReplayMemory(tuple_class=Transition) 121 | 122 | step = 0 123 | episode_reward = 0. 124 | reward_history = [] 125 | reward_averaged = [] 126 | 127 | lr_c = config.lr_c 128 | lr_a = config.lr_a 129 | 130 | eps = config.epsilon 131 | warmup_episodes = config.warmup_episodes or config.n_episodes 132 | eps_drop = (eps - config.epsilon_final) / warmup_episodes 133 | print("Decrease epsilon per step:", eps_drop) 134 | 135 | for n_episode in range(config.n_episodes): 136 | ob = self.env.reset() 137 | self.act(ob, eps) 138 | done = False 139 | 140 | while not done: 141 | a = self.act(ob, eps) 142 | ob_next, r, done, info = self.env.step(a) 143 | step += 1 144 | episode_reward += r 145 | 146 | record = Transition(self.obs_to_inputs(ob), a, r, self.obs_to_inputs(ob_next), done) 147 | buffer.add(record) 148 | 149 | ob = ob_next 150 | 151 | while buffer.size >= config.batch_size: 152 | batch = buffer.pop(config.batch_size) 153 | _, summ_str = self.sess.run( 154 | [self.train_ops, self.merged_summary], feed_dict={ 155 | self.lr_c: lr_c, 156 | self.lr_a: lr_a, 157 | self.s: batch['s'], 158 | self.a: batch['a'], 159 | self.r: batch['r'], 160 | self.s_next: batch['s_next'], 161 | self.done: batch['done'], 162 | self.ep_reward: np.mean(reward_history[-10:]) if reward_history else 0.0, 163 | }) 164 | self.writer.add_summary(summ_str, step) 165 | 166 | # One trajectory is complete! 167 | reward_history.append(episode_reward) 168 | reward_averaged.append(np.mean(reward_history[-10:])) 169 | episode_reward = 0. 
170 | 171 | lr_c *= config.lr_c_decay 172 | lr_a *= config.lr_a_decay 173 | if eps > config.epsilon_final: 174 | eps -= eps_drop 175 | 176 | if (reward_history and config.log_every_episode and 177 | n_episode % config.log_every_episode == 0): 178 | # Report the performance every `every_step` steps 179 | print( 180 | "[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}|{:.4f} eps:{:.4f}".format( 181 | n_episode, step, np.max(reward_history), 182 | np.mean(reward_history[-10:]), reward_history[-5:], 183 | lr_c, lr_a, eps, 184 | )) 185 | # self.save_checkpoint(step=step) 186 | 187 | self.save_checkpoint(step=step) 188 | 189 | print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format( 190 | len(reward_history), np.max(reward_history), np.mean(reward_history))) 191 | 192 | data_dict = { 193 | 'reward': reward_history, 194 | 'reward_smooth10': reward_averaged, 195 | } 196 | plot_learning_curve(self.model_name, data_dict, xlabel='episode') 197 | -------------------------------------------------------------------------------- /playground/policies/ddpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from gym.spaces import Box 4 | 5 | from playground.policies.base import BaseModelMixin, Policy, TrainConfig 6 | from playground.policies.memory import ReplayMemory, Transition 7 | from playground.utils.misc import plot_learning_curve 8 | from playground.utils.tf_ops import dense_nn 9 | 10 | 11 | class DDPGPolicy(Policy, BaseModelMixin): 12 | 13 | def __init__(self, env, name, training=True, gamma=0.9, 14 | actor_layers=[64, 32], critic_layers=[128, 64], **kwargs): 15 | Policy.__init__(self, env, name, training=training, gamma=gamma, **kwargs) 16 | BaseModelMixin.__init__(self, name) 17 | 18 | assert isinstance(self.env.action_space, Box), \ 19 | "Current DDPGPolicy implementation only works for continuous action space." 20 | 21 | self.actor_layers = actor_layers 22 | self.critic_layers = critic_layers 23 | 24 | def act(self, state, eps=0.25): 25 | # add random gaussian noise for action exploration. 26 | action = self.sess.run(self.mu, {self.s: [state]})[0] 27 | action += eps * np.random.randn(*self.act_dim) 28 | action = np.clip(action * self.env.action_space.high, self.env.action_space.low, self.env.action_space.high) 29 | return action 30 | 31 | def _build_networks(self): 32 | """For continuous action space. 33 | """ 34 | # Define input placeholders 35 | self.s = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='state') 36 | self.a = tf.placeholder(tf.float32, shape=[None] + self.act_dim, name='action') 37 | self.s_next = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='next_state') 38 | self.r = tf.placeholder(tf.float32, shape=[None, ], name='reward') 39 | 40 | with tf.variable_scope('primary'): 41 | # Actor: deterministic policy mu(s) outputs one action vector. 42 | self.mu = dense_nn(self.s, self.actor_layers + self.act_dim, output_fn=tf.nn.tanh, name='mu') 43 | # Critic: action value, Q(s, a) 44 | self.Q = dense_nn(tf.concat([self.s, self.a], axis=1), self.critic_layers + [1], name='Q') 45 | # We want to train mu network to maximize Q value that is estimated by our critic; 46 | # this is only used for training. 47 | self.Q_mu = dense_nn(tf.concat([self.s, self.mu], axis=1), self.critic_layers + [1], name='Q', reuse=True) 48 | 49 | with tf.variable_scope('target'): 50 | # Clone target networks. 
51 | self.mu_target = dense_nn(self.s_next, self.actor_layers + self.act_dim, output_fn=tf.nn.tanh, name='mu') 52 | self.Q_target = dense_nn(tf.concat([self.s_next, self.mu_target], axis=1), 53 | self.critic_layers + [1], name='Q') 54 | 55 | self.Q_vars = self.scope_vars('primary/Q') 56 | self.mu_vars = self.scope_vars('primary/mu') 57 | 58 | # sanity check 59 | self.primary_vars = self.Q_vars + self.mu_vars 60 | self.target_vars = self.scope_vars('target/Q') + self.scope_vars('target/mu') 61 | assert len(self.primary_vars) == len(self.target_vars) 62 | 63 | def init_target_net(self): 64 | self.sess.run([v_t.assign(v) for v_t, v in zip(self.target_vars, self.primary_vars)]) 65 | 66 | def update_target_net(self, tau=0.01): 67 | self.sess.run([v_t.assign((1.0 - tau) * v_t + tau * v) for v_t, v in zip(self.target_vars, self.primary_vars)]) 68 | 69 | def _build_train_ops(self): 70 | self.lr_a = tf.placeholder(tf.float32, shape=None, name='learning_rate_actor') 71 | self.lr_c = tf.placeholder(tf.float32, shape=None, name='learning_rate_critic') 72 | self.done = tf.placeholder(tf.float32, shape=None, name='terminal_flag') 73 | 74 | with tf.variable_scope('Q_train'): 75 | self.Q_reg = tf.reduce_mean([tf.nn.l2_loss(x) for x in self.Q_vars]) 76 | # use tf.stop_gradient() because we don't want to update the Q target net yet. 77 | y = self.r + self.gamma * self.Q_target * (1.0 - self.done) 78 | self.Q_loss = tf.reduce_mean(tf.square(tf.stop_gradient(y) - self.Q)) + 0.0001 * self.Q_reg 79 | # self.Q_train_op = tf.train.AdamOptimizer(self.lr_c).minimize(self.Q_loss, var_list=self.Q_vars) 80 | 81 | Q_optim = tf.train.AdamOptimizer(self.lr_c) 82 | self.Q_grads = Q_optim.compute_gradients(self.Q_loss, self.Q_vars) 83 | self.Q_train_op = Q_optim.apply_gradients(self.Q_grads) 84 | 85 | with tf.variable_scope('mu_train'): 86 | self.mu_loss = -tf.reduce_mean(self.Q_mu) 87 | self.mu_train_op = tf.train.AdamOptimizer(self.lr_a).minimize(self.mu_loss, var_list=self.mu_vars) 88 | 89 | with tf.variable_scope('summary'): 90 | self.ep_reward = tf.placeholder(tf.float32, name='episode_reward') # just for logging. 91 | self.summary = [ 92 | tf.summary.scalar('loss/Q', self.Q_loss), 93 | tf.summary.scalar('loss/Q_reg', self.Q_reg), 94 | tf.summary.scalar('loss/mu', self.mu_loss), 95 | tf.summary.scalar('output/Q', tf.reduce_mean(self.Q)), 96 | tf.summary.histogram('output/Q_mu', tf.reduce_mean(self.Q_mu)), 97 | tf.summary.scalar('output/Q_target', tf.reduce_mean(self.Q_target)), 98 | tf.summary.histogram('output/mu', self.mu), 99 | tf.summary.histogram('output/mu_target', self.mu_target), 100 | tf.summary.scalar('output/episode_reward', self.ep_reward) 101 | ] + [ 102 | tf.summary.scalar('grads/Q_' + var.name, tf.norm(grad)) 103 | for grad, var in self.Q_grads if grad is not None 104 | ] 105 | 106 | self.merged_summary = tf.summary.merge_all(key=tf.GraphKeys.SUMMARIES) 107 | 108 | self.train_ops = [self.Q_train_op, self.mu_train_op] 109 | 110 | self.sess.run(tf.global_variables_initializer()) 111 | self.init_target_net() 112 | 113 | def build(self): 114 | self._build_networks() 115 | self._build_train_ops() 116 | 117 | class TrainConfig(TrainConfig): 118 | lr_a = 0.0001 119 | lr_c = 0.001 120 | # action exploration noise 121 | epsilon = 0.25 122 | epsilon_final = 0.0 123 | # for target network polyak averaging 124 | tau = 0.001 125 | 126 | def train(self, config: TrainConfig): 127 | # Construct the replay memory buffer. 
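# Off-policy loop: every environment step pushes a Transition into the buffer; once
# buffer.size >= batch_size, a minibatch is popped to update the critic (Q_loss) and
# the actor (mu_loss), followed by the Polyak/soft target update
#   theta_target <- (1 - tau) * theta_target + tau * theta_primary.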
128 | buffer = ReplayMemory(tuple_class=Transition) 129 | 130 | step = 0 131 | n_episode = 0 132 | 133 | episode_reward = 0. 134 | episode_step = 0 135 | reward_history = [] 136 | reward_averaged = [] 137 | 138 | eps = config.epsilon 139 | eps_drop_per_step = (eps - config.epsilon_final) / config.warmup_steps 140 | print("decrease `epsilon` per step:", eps_drop_per_step) 141 | 142 | env = self.env 143 | ob = env.reset() 144 | done = False 145 | 146 | while step < config.n_steps: 147 | while not done: 148 | a = self.act(ob, eps) 149 | ob_next, r, done, _ = env.step(a) 150 | step += 1 151 | episode_step += 1 152 | episode_reward += r 153 | 154 | buffer.add(Transition(ob, a, r, ob_next, float(done))) 155 | ob = ob_next 156 | 157 | if eps > config.epsilon_final: 158 | eps = max(config.epsilon_final, eps - eps_drop_per_step) 159 | 160 | if reward_history and config.log_every_step and step % config.log_every_step == 0: 161 | # Report the performance every `log_every_step` steps 162 | print("[episodes:{}/step:{}], best(reward):{:.2f}, avg(reward):{:.2f}, eps:{:.4f}".format( 163 | n_episode, step, np.max(reward_history), np.mean(reward_history[-10:]), eps)) 164 | # self.save_checkpoint(step=step) 165 | 166 | if buffer.size >= config.batch_size: 167 | batch = buffer.pop(config.batch_size) 168 | _, q_loss, mu_loss, summ_str = self.sess.run( 169 | [self.train_ops, self.Q_loss, self.mu_loss, self.merged_summary], feed_dict={ 170 | self.lr_a: config.lr_a, 171 | self.lr_c: config.lr_c, 172 | self.done: batch['done'], 173 | self.s: batch['s'], 174 | self.a: batch['a'], 175 | self.r: batch['r'], 176 | self.s_next: batch['s_next'], 177 | self.ep_reward: np.mean(reward_history[-10:]) if reward_history else 0.0, 178 | }) 179 | self.update_target_net(tau=config.tau) 180 | self.writer.add_summary(summ_str, step) 181 | 182 | # one trajectory is complete. 183 | n_episode += 1 184 | ob = env.reset() 185 | done = False 186 | reward_history.append(episode_reward) 187 | reward_averaged.append(np.mean(reward_history[-10:])) 188 | episode_step = 0 189 | episode_reward = 0. 190 | 191 | self.save_checkpoint(step=step) 192 | 193 | print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format( 194 | len(reward_history), np.max(reward_history), np.mean(reward_history))) 195 | 196 | data_dict = { 197 | 'reward': reward_history, 198 | 'reward_smooth10': reward_averaged, 199 | } 200 | plot_learning_curve(self.model_name, data_dict, xlabel='episode') 201 | -------------------------------------------------------------------------------- /playground/policies/ppo.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from gym.spaces import Discrete 6 | 7 | from playground.policies.base import BaseModelMixin, Policy, Config 8 | from playground.policies.memory import ReplayMemory 9 | from playground.utils.misc import plot_learning_curve 10 | from playground.utils.tf_ops import dense_nn 11 | 12 | 13 | class PPOPolicy(Policy, BaseModelMixin): 14 | 15 | def __init__(self, env, name, training=True, gamma=0.99, lam=0.95, 16 | actor_layers=[64, 32], critic_layers=[128, 64], clip_norm=None, **kwargs): 17 | Policy.__init__(self, env, name, training=training, gamma=gamma, **kwargs) 18 | BaseModelMixin.__init__(self, name) 19 | 20 | assert isinstance(self.env.action_space, Discrete), \ 21 | "Current PPOPolicy implementation only works for discrete action space." 22 | 23 | self.lam = lam # lambda for GAE. 
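# lam is the lambda in GAE(gamma, lambda): advantages are accumulated backwards from
# the TD errors delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) via
#   A_t = delta_t + gamma * lambda * A_{t+1},
# as implemented in _generate_rollout() below.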
24 | self.actor_layers = actor_layers 25 | self.critic_layers = critic_layers 26 | self.clip_norm = clip_norm 27 | 28 | def act(self, state, **kwargs): 29 | probas = self.sess.run(self.actor_proba, {self.s: [state]})[0] 30 | action = np.random.choice(range(self.act_size), size=1, p=probas)[0] 31 | return action 32 | 33 | def _build_networks(self): 34 | # Define input placeholders 35 | self.s = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='state') 36 | self.a = tf.placeholder(tf.int32, shape=(None,), name='action') 37 | self.s_next = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='next_state') 38 | self.r = tf.placeholder(tf.float32, shape=(None,), name='reward') 39 | self.done = tf.placeholder(tf.float32, shape=(None,), name='done_flag') 40 | 41 | self.old_logp_a = tf.placeholder(tf.float32, shape=(None,), name='old_logp_actor') 42 | self.v_target = tf.placeholder(tf.float32, shape=(None,), name='v_target') 43 | self.adv = tf.placeholder(tf.float32, shape=(None,), name='return') 44 | 45 | with tf.variable_scope('actor'): 46 | # Actor: action probabilities 47 | self.actor = dense_nn(self.s, self.actor_layers + [self.act_size], name='actor') 48 | self.actor_proba = tf.nn.softmax(self.actor) 49 | a_ohe = tf.one_hot(self.a, self.act_size, 1.0, 0.0, name='action_ohe') 50 | self.logp_a = tf.reduce_sum(tf.log(self.actor_proba) * a_ohe, 51 | reduction_indices=-1, name='new_logp_actor') 52 | self.actor_vars = self.scope_vars('actor') 53 | 54 | with tf.variable_scope('critic'): 55 | # Critic: action value (V value) 56 | self.critic = tf.squeeze(dense_nn(self.s, self.critic_layers + [1], name='critic')) 57 | self.critic_next = tf.squeeze(dense_nn(self.s_next, self.critic_layers + [1], name='critic', reuse=True)) 58 | self.critic_vars = self.scope_vars('critic') 59 | 60 | def _build_train_ops(self): 61 | self.lr_a = tf.placeholder(tf.float32, shape=None, name='learning_rate_actor') 62 | self.lr_c = tf.placeholder(tf.float32, shape=None, name='learning_rate_critic') 63 | self.clip_range = tf.placeholder(tf.float32, shape=None, name='ratio_clip_range') 64 | 65 | with tf.variable_scope('actor_train'): 66 | ratio = tf.exp(self.logp_a - self.old_logp_a) 67 | ratio_clipped = tf.clip_by_value(ratio, 1.0 - self.clip_range, 1.0 + self.clip_range) 68 | loss_a = - tf.reduce_mean(tf.minimum(self.adv * ratio, self.adv * ratio_clipped)) 69 | 70 | optim_a = tf.train.AdamOptimizer(self.lr_a) 71 | grads_a = optim_a.compute_gradients(loss_a, var_list=self.actor_vars) 72 | if self.clip_norm: 73 | grads_a = [(tf.clip_by_norm(g, self.clip_norm), v) for g, v in grads_a] 74 | self.train_op_a = optim_a.apply_gradients(grads_a) 75 | 76 | with tf.variable_scope('critic_train'): 77 | loss_c = tf.reduce_mean(tf.square(self.v_target - self.critic)) 78 | 79 | optim_c = tf.train.AdamOptimizer(self.lr_c) 80 | grads_c = optim_c.compute_gradients(loss_c, var_list=self.critic_vars) 81 | if self.clip_norm: 82 | grads_c = [(tf.clip_by_norm(g, self.clip_norm), v) for g, v in grads_c] 83 | self.train_op_c = optim_c.apply_gradients(grads_c) 84 | 85 | self.train_ops = [self.train_op_a, self.train_op_c] 86 | 87 | with tf.variable_scope('summary'): 88 | self.ep_reward = tf.placeholder(tf.float32, name='episode_reward') 89 | 90 | self.summary = [ 91 | tf.summary.scalar('loss/adv', tf.reduce_mean(self.adv)), 92 | tf.summary.scalar('loss/ratio', tf.reduce_mean(ratio)), 93 | tf.summary.scalar('loss/loss_actor', loss_a), 94 | tf.summary.scalar('loss/loss_critic', loss_c), 95 | tf.summary.scalar('episode_reward', 
self.ep_reward) 96 | ] 97 | 98 | # self.summary += [tf.summary.scalar('grads/' + v.name, tf.norm(g)) 99 | # for g, v in grads_a if g is not None] 100 | # self.summary += [tf.summary.scalar('grads/' + v.name, tf.norm(g)) 101 | # for g, v in grads_c if g is not None] 102 | 103 | self.merged_summary = tf.summary.merge_all(key=tf.GraphKeys.SUMMARIES) 104 | 105 | self.sess.run(tf.global_variables_initializer()) 106 | 107 | def build(self): 108 | self._build_networks() 109 | self._build_train_ops() 110 | 111 | class TrainConfig(Config): 112 | lr_a = 0.005 113 | lr_c = 0.005 114 | batch_size = 32 115 | n_iterations = 100 116 | n_rollout_workers = 5 117 | train_epoches = 5 118 | log_every_iteration = 10 119 | ratio_clip_range = 0.2 120 | ratio_clip_decay = True 121 | 122 | def _generate_rollout(self, buffer): 123 | # generate one trajectory. 124 | ob = self.env.reset() 125 | done = False 126 | rewards = [] 127 | episode_reward = 0.0 128 | obs = [] 129 | actions = [] 130 | 131 | while not done: 132 | a = self.act(ob) 133 | ob_next, r, done, info = self.env.step(a) 134 | obs.append(ob) 135 | actions.append(a) 136 | rewards.append(r) 137 | episode_reward += r 138 | ob = ob_next 139 | 140 | # length of the episode. 141 | T = len(rewards) 142 | 143 | # compute the current log pi(a|s) and predicted v values. 144 | with self.sess.as_default(): 145 | logp_a = self.logp_a.eval({self.a: np.array(actions), self.s: np.array(obs)}) 146 | v_pred = self.critic.eval({self.s: np.array(obs)}) 147 | 148 | # Compute TD errors 149 | td_errors = [rewards[t] + self.gamma * v_pred[t + 1] - v_pred[t] for t in range(T - 1)] 150 | td_errors += [rewards[T - 1] + self.gamma * 0.0 - v_pred[T - 1]] # handle the terminal state. 151 | 152 | assert len(logp_a) == len(v_pred) == len(td_errors) == T 153 | 154 | # Estimate advantage backwards. 155 | advs = [] 156 | adv_so_far = 0.0 157 | for delta in td_errors[::-1]: 158 | adv_so_far = delta + self.gamma * self.lam * adv_so_far 159 | advs.append(adv_so_far) 160 | advs = advs[::-1] 161 | assert len(advs) == T 162 | 163 | # add into the memory buffer 164 | v_targets = np.array(advs) + np.array(v_pred) 165 | for i, (s, a, s_next, r, old_logp_a, v_target, adv) in enumerate(zip( 166 | obs, actions, np.array(obs[1:] + [ob_next]), rewards, 167 | np.squeeze(logp_a), v_targets, advs)): 168 | done = float(i == T - 1) 169 | buffer.add(buffer.tuple_class(s, a, s_next, r, done, old_logp_a, v_target, adv)) 170 | 171 | return episode_reward, len(advs) 172 | 173 | def train(self, config: TrainConfig): 174 | BufferRecord = namedtuple('BufferRecord', ['s', 'a', 's_next', 'r', 'done', 175 | 'old_logp_actor', 'v_target', 'adv']) 176 | buffer = ReplayMemory(tuple_class=BufferRecord) 177 | 178 | reward_history = [] 179 | reward_averaged = [] 180 | step = 0 181 | total_rec = 0 182 | 183 | clip = config.ratio_clip_range 184 | if config.ratio_clip_decay: 185 | clip_delta = clip / config.n_iterations 186 | else: 187 | clip_delta = 0.0 188 | 189 | for n_iteration in range(config.n_iterations): 190 | 191 | # we should have multiple rollout_workers running in parallel. 192 | for _ in range(config.n_rollout_workers): 193 | episode_reward, n_rec = self._generate_rollout(buffer) 194 | # One trajectory is complete. 195 | reward_history.append(episode_reward) 196 | reward_averaged.append(np.mean(reward_history[-10:])) 197 | total_rec += n_rec 198 | 199 | # now let's train the model for some steps. 
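# The collected rollouts are reused for train_epoches passes in minibatches of
# batch_size; the clipped surrogate objective defined in _build_train_ops,
#   L_actor = -E[min(ratio * A, clip(ratio, 1 - clip_range, 1 + clip_range) * A)]
# with ratio = pi(a|s) / pi_old(a|s), keeps the updated policy close to the one that
# generated the data, so replaying the same rollouts for several epochs stays stable.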
200 | for batch in buffer.loop(config.batch_size, epoch=config.train_epoches): 201 | _, summ_str = self.sess.run( 202 | [self.train_ops, self.merged_summary], feed_dict={ 203 | self.lr_a: config.lr_a, 204 | self.lr_c: config.lr_c, 205 | self.clip_range: clip, 206 | self.s: batch['s'], 207 | self.a: batch['a'], 208 | self.s_next: batch['s_next'], 209 | self.r: batch['r'], 210 | self.done: batch['done'], 211 | self.old_logp_a: batch['old_logp_actor'], 212 | self.v_target: batch['v_target'], 213 | self.adv: batch['adv'], 214 | self.ep_reward: np.mean(reward_history[-10:]) if reward_history else 0.0, 215 | }) 216 | 217 | self.writer.add_summary(summ_str, step) 218 | step += 1 219 | 220 | clip = max(0.0, clip - clip_delta) 221 | 222 | if (reward_history and config.log_every_iteration and 223 | n_iteration % config.log_every_iteration == 0): 224 | # Report the performance every `log_every_iteration` steps 225 | print("[iteration:{}/step:{}], best:{}, avg:{:.2f}, hist:{}, clip:{:.2f}; {} transitions.".format( 226 | n_iteration, step, np.max(reward_history), np.mean(reward_history[-10:]), 227 | list(map(lambda x: round(x, 2), reward_history[-5:])), clip, total_rec 228 | )) 229 | # self.save_checkpoint(step=step) 230 | 231 | self.save_checkpoint(step=step) 232 | 233 | print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format( 234 | len(reward_history), np.max(reward_history), np.mean(reward_history))) 235 | 236 | data_dict = { 237 | 'reward': reward_history, 238 | 'reward_smooth10': reward_averaged, 239 | } 240 | plot_learning_curve(self.model_name, data_dict, xlabel='episode') 241 | -------------------------------------------------------------------------------- /playground/policies/dqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from gym.spaces import Box, Discrete 4 | 5 | from playground.policies.base import BaseModelMixin, Policy, TrainConfig 6 | from playground.policies.memory import ReplayMemory, ReplayTrajMemory, Transition 7 | from playground.utils.misc import plot_learning_curve 8 | from playground.utils.tf_ops import dense_nn, conv2d_net, lstm_net 9 | 10 | 11 | class DqnPolicy(Policy, BaseModelMixin): 12 | def __init__(self, env, name, 13 | training=True, 14 | gamma=0.99, 15 | batch_size=32, 16 | model_type='dense', 17 | model_params=None, 18 | step_size=1, # only > 1 if model_type is 'lstm'. 
19 | layer_sizes=[32, 32], 20 | double_q=True, 21 | dueling=True): 22 | """ 23 | model_params: 'layer_sizes', 'step_size', 'lstm_layers', 'lstm_size' 24 | """ 25 | Policy.__init__(self, env, name, gamma=gamma, training=training) 26 | BaseModelMixin.__init__(self, name) 27 | 28 | assert isinstance(self.env.action_space, Discrete) 29 | assert isinstance(self.env.observation_space, Box) 30 | assert model_type in ('dense', 'conv', 'lstm') 31 | assert step_size == 1 or model_type == 'lstm' 32 | 33 | self.gamma = gamma 34 | self.batch_size = batch_size 35 | self.training = training 36 | self.model_type = model_type 37 | self.model_params = model_params or {} 38 | self.layer_sizes = layer_sizes 39 | self.step_size = step_size 40 | self.double_q = double_q 41 | self.dueling = dueling 42 | 43 | @property 44 | def state_dim(self): 45 | # Returns: A list 46 | if self.model_type == 'dense': 47 | return [np.prod(list(self.env.observation_space.shape))] 48 | elif self.model_type in ('conv', 'lstm'): 49 | return list(self.env.observation_space.shape) 50 | else: 51 | raise NotImplementedError() 52 | 53 | def obs_to_inputs(self, ob): 54 | if self.model_type == 'dense': 55 | return ob.flatten() 56 | elif self.model_type == 'conv': 57 | return ob 58 | elif self.model_type == 'lstm': 59 | return ob 60 | else: 61 | raise NotImplementedError() 62 | 63 | def init_target_q_net(self): 64 | self.sess.run([v_t.assign(v) for v_t, v in zip(self.q_target_vars, self.q_vars)]) 65 | 66 | def _extract_network_params(self): 67 | net_params = {} 68 | 69 | if self.model_type == 'dense': 70 | net_class = dense_nn 71 | elif self.model_type == 'conv': 72 | net_class = conv2d_net 73 | elif self.model_type == 'lstm': 74 | net_class = lstm_net 75 | net_params = { 76 | 'lstm_layers': self.model_params.get('lstm_layers', 1), 77 | 'lstm_size': self.model_params.get('lstm_size', 256), 78 | 'step_size': self.step_size, 79 | } 80 | else: 81 | raise NotImplementedError("Unknown model type: '%s'" % self.model_type) 82 | 83 | return net_class, net_params 84 | 85 | def create_q_networks(self): 86 | # The first dimension should have batch_size * step_size 87 | self.states = tf.placeholder(tf.float32, shape=(None, *self.state_dim), name='state') 88 | self.states_next = tf.placeholder(tf.float32, shape=(None, *self.state_dim), 89 | name='state_next') 90 | self.actions = tf.placeholder(tf.int32, shape=(None,), name='action') 91 | self.actions_next = tf.placeholder(tf.int32, shape=(None,), name='action_next') 92 | self.rewards = tf.placeholder(tf.float32, shape=(None,), name='reward') 93 | self.done_flags = tf.placeholder(tf.float32, shape=(None,), name='done') 94 | 95 | # The Q networks output one Q value per action.
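# With dueling enabled, separate heads estimate the state value V(s) and the
# advantages A(s, a), combined with the "average dueling" aggregation
#   Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a));
# otherwise a single head outputs Q(s, a) directly.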
96 | 97 | net_class, net_params = self._extract_network_params() 98 | 99 | if self.dueling: 100 | self.q_hidden = net_class(self.states, self.layer_sizes[:-1], name='Q_primary', 101 | **net_params) 102 | self.adv = dense_nn(self.q_hidden, self.layer_sizes[-1:] + [self.act_size], 103 | name='Q_primary_adv') 104 | self.v = dense_nn(self.q_hidden, self.layer_sizes[-1:] + [1], name='Q_primary_v') 105 | 106 | # Average Dueling 107 | self.q = self.v + (self.adv - tf.reduce_mean( 108 | self.adv, reduction_indices=1, keep_dims=True)) 109 | 110 | self.q_target_hidden = net_class(self.states_next, self.layer_sizes[:-1], name='Q_target', 111 | **net_params) 112 | self.adv_target = dense_nn(self.q_target_hidden, self.layer_sizes[-1:] + [self.act_size], 113 | name='Q_target_adv') 114 | self.v_target = dense_nn(self.q_target_hidden, self.layer_sizes[-1:] + [1], 115 | name='Q_target_v') 116 | 117 | # Average Dueling 118 | self.q_target = self.v_target + (self.adv_target - tf.reduce_mean( 119 | self.adv_target, reduction_indices=1, keep_dims=True)) 120 | 121 | else: 122 | self.q = net_class(self.states, self.layer_sizes + [self.act_size], name='Q_primary', 123 | **net_params) 124 | self.q_target = net_class(self.states_next, self.layer_sizes + [self.act_size], 125 | name='Q_target', **net_params) 126 | 127 | # The primary and target Q networks should match. 128 | self.q_vars = self.scope_vars('Q_primary') 129 | self.q_target_vars = self.scope_vars('Q_target') 130 | assert len(self.q_vars) == len(self.q_target_vars), "Two Q-networks are not same." 131 | 132 | def build(self): 133 | self.create_q_networks() 134 | 135 | self.actions_selected_by_q = tf.argmax(self.q, axis=-1, name='action_selected') 136 | action_one_hot = tf.one_hot(self.actions, self.act_size, 1.0, 0.0, name='action_one_hot') 137 | pred = tf.reduce_sum(self.q * action_one_hot, reduction_indices=-1, name='q_acted') 138 | 139 | if self.double_q: 140 | actions_next_flatten = self.actions_next + tf.range( 141 | 0, self.batch_size * self.step_size) * self.q_target.shape[1] 142 | max_q_next_target = tf.gather(tf.reshape(self.q_target, [-1]), actions_next_flatten) 143 | else: 144 | max_q_next_target = tf.reduce_max(self.q_target, axis=-1) 145 | 146 | y = self.rewards + (1. 
- self.done_flags) * self.gamma * max_q_next_target 147 | 148 | self.learning_rate = tf.placeholder(tf.float32, shape=None, name='learning_rate') 149 | self.loss = tf.reduce_mean(tf.square(pred - tf.stop_gradient(y)), name="loss_mse_train") 150 | self.optimizer = tf.train.AdamOptimizer( 151 | self.learning_rate).minimize(self.loss, name="adam_optim") 152 | 153 | with tf.variable_scope('summary'): 154 | q_summ = [] 155 | avg_q = tf.reduce_mean(self.q, 0) 156 | for idx in range(self.act_size): 157 | q_summ.append(tf.summary.histogram('q/%s' % idx, avg_q[idx])) 158 | self.q_summ = tf.summary.merge(q_summ, 'q_summary') 159 | 160 | self.q_y_summ = tf.summary.histogram("batch/y", y) 161 | self.q_pred_summ = tf.summary.histogram("batch/pred", pred) 162 | self.loss_summ = tf.summary.scalar("loss", self.loss) 163 | 164 | self.ep_reward = tf.placeholder(tf.float32, name='episode_reward') 165 | self.ep_reward_summ = tf.summary.scalar('episode_reward', self.ep_reward) 166 | 167 | self.merged_summary = tf.summary.merge_all(key=tf.GraphKeys.SUMMARIES) 168 | 169 | self.sess.run(tf.global_variables_initializer()) 170 | self.init_target_q_net() 171 | 172 | def update_target_q_net(self): 173 | self.sess.run([v_t.assign(v) for v_t, v in zip(self.q_target_vars, self.q_vars)]) 174 | 175 | def act(self, state, epsilon=0.1): 176 | if self.training and np.random.random() < epsilon: 177 | return self.env.action_space.sample() 178 | 179 | with self.sess.as_default(): 180 | if self.model_type == 'lstm': 181 | return self.actions_selected_by_q.eval({ 182 | self.states: [np.zeros(state.shape)] * (self.step_size - 1) + [state] 183 | })[-1] 184 | else: 185 | return self.actions_selected_by_q.eval({self.states: [state]})[-1] 186 | 187 | ########## 188 | 189 | class TrainConfig(TrainConfig): 190 | lr = 0.001 191 | lr_decay = 1.0 192 | epsilon = 1.0 193 | epsilon_final = 0.01 194 | memory_capacity = 100000 195 | target_update_every_step = 100 196 | n_episodes = 500 197 | warmup_episodes = 450 198 | log_every_episode = 10 199 | 200 | def train(self, config: TrainConfig): 201 | 202 | if self.model_type == 'lstm': 203 | buffer = ReplayTrajMemory(capacity=config.memory_capacity, step_size=self.step_size) 204 | else: 205 | buffer = ReplayMemory(capacity=config.memory_capacity) 206 | 207 | reward = 0. 208 | reward_history = [0.0] 209 | reward_averaged = [] 210 | 211 | lr = config.lr 212 | eps = config.epsilon 213 | annealing_episodes = config.warmup_episodes or config.n_episodes 214 | eps_drop = (config.epsilon - config.epsilon_final) / annealing_episodes 215 | print("eps_drop:", eps_drop) 216 | step = 0 217 | 218 | for n_episode in range(config.n_episodes): 219 | ob = self.env.reset() 220 | done = False 221 | traj = [] 222 | 223 | while not done: 224 | a = self.act(self.obs_to_inputs(ob), eps) 225 | new_ob, r, done, info = self.env.step(a) 226 | step += 1 227 | reward += r 228 | 229 | traj.append( 230 | Transition(self.obs_to_inputs(ob), a, r, self.obs_to_inputs(new_ob), done)) 231 | ob = new_ob 232 | 233 | # No enough samples in the buffer yet. 234 | if buffer.size < self.batch_size: 235 | continue 236 | 237 | # Training with a mini batch of samples! 
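# With double_q enabled, the online network selects the greedy next action and the
# target network evaluates it, so the regression target built in build() is
#   y = r + (1 - done) * gamma * Q_target(s', argmax_a Q(s', a));
# the actions_next placeholder fed below carries that argmax.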
238 | batch_data = buffer.sample(self.batch_size) 239 | feed_dict = { 240 | self.learning_rate: lr, 241 | self.states: batch_data['s'], 242 | self.actions: batch_data['a'], 243 | self.rewards: batch_data['r'], 244 | self.states_next: batch_data['s_next'], 245 | self.done_flags: batch_data['done'], 246 | self.ep_reward: reward_history[-1], 247 | } 248 | 249 | if self.double_q: 250 | actions_next = self.sess.run(self.actions_selected_by_q, { 251 | self.states: batch_data['s_next'] 252 | }) 253 | feed_dict.update({self.actions_next: actions_next}) 254 | 255 | _, q_val, q_target_val, loss, summ_str = self.sess.run( 256 | [self.optimizer, self.q, self.q_target, self.loss, self.merged_summary], 257 | feed_dict 258 | ) 259 | self.writer.add_summary(summ_str, step) 260 | if step % config.target_update_every_step == 0: 261 | self.update_target_q_net() 262 | 263 | # Add all the transitions of one trajectory into the replay memory. 264 | buffer.add(traj) 265 | 266 | # One episode is complete. 267 | reward_history.append(reward) 268 | reward_averaged.append(np.mean(reward_history[-10:])) 269 | reward = 0. 270 | 271 | # Annealing the learning and exploration rate after every episode. 272 | lr *= config.lr_decay 273 | if eps > config.epsilon_final: 274 | eps = max(eps - eps_drop, config.epsilon_final) 275 | 276 | if reward_history and config.log_every_episode and n_episode % config.log_every_episode == 0: 277 | # Report the performance every `log_every_episode` episodes 278 | print( 279 | "[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lr:{:.4f}, eps:{:.4f}, buffer:{}".format( 280 | n_episode, step, np.max(reward_history), 281 | np.mean(reward_history[-10:]), reward_history[-5:], 282 | lr, eps, buffer.size 283 | )) 284 | # self.save_checkpoint(step=step) 285 | 286 | self.save_checkpoint(step=step) 287 | 288 | print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format( 289 | len(reward_history), np.max(reward_history), np.mean(reward_history))) 290 | 291 | data_dict = { 292 | 'reward': reward_history, 293 | 'reward_smooth10': reward_averaged, 294 | } 295 | plot_learning_curve(self.model_name, data_dict, xlabel='episode') 296 | --------------------------------------------------------------------------------