├── rlib ├── __init__.py ├── A2C │ ├── __init__.py │ ├── A2C.py │ ├── ActorCritic.py │ └── A2C_lstm.py ├── DDQN │ ├── __init__.py │ └── SyncDQN.py ├── RND │ ├── __init__.py │ └── RND.py ├── utils │ ├── __init__.py │ ├── __pycache__ │ │ └── SyncMultiEnvTrainer.cpython-35.pyc.140156829641872 │ ├── schedulers.py │ ├── random_agent.py │ ├── utils.py │ ├── play.py │ ├── ReplayMemory.py │ ├── VecEnv.py │ ├── wrappers.py │ └── SyncMultiEnvTrainer.py ├── Curiosity │ ├── __init__.py │ ├── logs │ │ └── Curiosity_LSTM │ │ │ └── FreewayDeterministic-v4 │ │ │ └── 19-08-04_16-50-38 │ │ │ └── train │ │ │ └── events.out.tfevents.1564933838.jhubuntu │ └── CuriosityA2C.py ├── networks │ ├── __init__.py │ └── networks.py ├── .vscode │ └── settings.json ├── A3C │ └── A3C.py ├── PPO │ └── PPO.py ├── VIN │ └── VIN.py ├── DAAC │ └── DAAC.py └── Unreal │ └── UnrealA2C2.py ├── setup.py ├── LICENSE ├── README.md └── .gitignore /rlib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rlib/A2C/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rlib/DDQN/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rlib/RND/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rlib/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rlib/Curiosity/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rlib/networks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rlib/utils/__pycache__/SyncMultiEnvTrainer.cpython-35.pyc.140156829641872: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rlib/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/usr/bin/python3.5" 3 | } -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='rlib', version='2.0', packages=find_packages()) 4 | -------------------------------------------------------------------------------- /rlib/Curiosity/logs/Curiosity_LSTM/FreewayDeterministic-v4/19-08-04_16-50-38/train/events.out.tfevents.1564933838.jhubuntu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jhare96/reinforcement-learning/HEAD/rlib/Curiosity/logs/Curiosity_LSTM/FreewayDeterministic-v4/19-08-04_16-50-38/train/events.out.tfevents.1564933838.jhubuntu -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright 2021 Joshua Hare 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /rlib/utils/schedulers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def polynomial_sheduler(optimiser, lr_final, decay_steps, power=1): 4 | lr_init = optimiser.defaults["lr"] 5 | assert lr_init > lr_final, f"lr_final ({lr_final}) must be be smaller than initial lr ({lr_init})" 6 | 7 | def polylambda(current_step: int): 8 | if current_step > decay_steps: 9 | return lr_final / lr_init # as LambdaLR multiplies by lr_init 10 | else: 11 | decay = (lr_init - lr_final) * (1 - current_step / decay_steps) ** power + lr_final 12 | return decay / lr_init # as LambdaLR multiplies by lr_init 13 | 14 | return torch.optim.lr_scheduler.LambdaLR(optimiser, polylambda, last_epoch=-1) -------------------------------------------------------------------------------- /rlib/utils/random_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | import os, time 6 | import threading 7 | sns.set() 8 | 9 | ep_rewards = [] 10 | 11 | def run_episodes(env, number_episodes, max_steps): 12 | for episode in range(number_episodes): 13 | obs = env.reset() 14 | ep_score = 0 15 | for t in range(max_steps): 16 | action = env.action_space.sample() 17 | obs, r, done, info = env.step(action) 18 | ep_score += r 19 | 20 | if done: 21 | ep_rewards.append(ep_score) 22 | break 23 | def main(): 24 | 25 | env_id = 'MountainCar-v0' 26 | envs = [gym.make(env_id) for i in range(64)] 27 | num_eps = int(1e6) // 64 28 | max_steps = 1000 29 | 30 | ep_rewards = [] 31 | threads = [threading.Thread(target=run_episodes, args=(envs[i], num_eps, max_steps)) for i in range(len(envs))] 32 | 33 | for thread in threads: 34 | thread.start() 35 | 36 | for thread in threads: 37 | thread.join() 38 | 39 | 40 | ep_rewards = np.array(ep_rewards) 41 | avg_reward_line = np.ones_like(ep_rewards) * np.mean(ep_rewards) 42 | filename = 'experiments/random/' + env_id + '/' 43 | if not os.path.exists(filename): 44 | os.makedirs(filename) 45 | np.save(filename + str(num_eps * len(envs)) + 'random.npy', ep_rewards) 46 | plt.plot(ep_rewards) 47 | plt.plot(avg_reward_line, '--', color='0.5') 48 | plt.show() 49 | 50 | 51 | if __name__ == "__main__": 52 | main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # reinforcement-learning 2 | 3 | A small Pytorch based reinforcement learning library 4 | as used for my MSc dissertation project ['Dealing with sparse rewards in reinforcement learning'](https://arxiv.org/abs/1910.09281). 
5 | 6 | This repository has working implementations of the following reinforcement agents: 7 | 1. Advantage Actor Critic [(A2C)](https://openai.com/blog/baselines-acktr-a2c/) 8 | 2. Synchronous n-step Double Deep Q Network (Sync-DDQN) 9 | 3. Proximal Policy Optimisation [(PPO)](https://arxiv.org/abs/1707.06347) 10 | 4. Random Network Distillation [(RND)](https://arxiv.org/abs/1810.12894) 11 | 5. UNREAL-A2C2, A2C-CNN version of the [(UNREAL agent)](https://deepmind.com/blog/article/reinforcement-learning-unsupervised-auxiliary-tasks) 12 | 6. Random Network Distillation with Auxiliary Learning (RANDAL), novel solution combining UNREAL and RND agents 13 | 14 | 15 | # Install repository: 16 | ```bash 17 | git clone https://github.com/jhare96/reinforcement-learning.git 18 | pip install -e reinforcement-learning 19 | ``` 20 | # To cite RANDAL agents in publications: 21 | follow the link to the ArXiv publication https://arxiv.org/abs/1910.09281 22 | 23 | # To cite this repository in publications: 24 | 25 | @misc{Hare_rlib, 26 | author = {Joshua Hare}, 27 | title = {reinforcement learning library, rlib}, 28 | year = {2019}, 29 | publisher = {GitHub}, 30 | journal = {GitHub repository}, 31 | howpublished = {\url{https://github.com/jhare96/reinforcement-learning}}, 32 | } 33 | 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python ### 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | ### Text ### 142 | *.doc 143 | *.docx 144 | *.msg 145 | *.pages 146 | *.rtf 147 | *.txt 148 | *.wpd 149 | *.wps 150 | 151 | ### VisualStudioCode ### 152 | .vscode/* 153 | !.vscode/settings.json 154 | !.vscode/tasks.json 155 | !.vscode/launch.json 156 | !.vscode/extensions.json 157 | *.code-workspace 158 | 159 | # Local History for Visual Studio Code 160 | .history/ 161 | 162 | ### VisualStudioCode Patch ### 163 | # Ignore all local history of files 164 | .history 165 | .ionide 166 | 167 | 168 | *.mo 169 | *.egg-info 170 | *.egg 171 | *.EGG 172 | *.EGG-INFO 173 | bin 174 | build 175 | develop-eggs 176 | downloads 177 | eggs 178 | fake-eggs 179 | parts 180 | dist 181 | .installed.cfg 182 | .mr.developer.cfg 183 | .hg 184 | .bzr 185 | .svn 186 | *.pyc 187 | *.pyo 188 | *.tmp* 189 | .vscode 190 | -------------------------------------------------------------------------------- /rlib/utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def fastsample(policy:np.ndarray, k=1): 5 | return torch.multinomial(torch.from_numpy(policy), num_samples=k, replacement=True).view(-1).numpy() 6 | 7 | def log_uniform(low=1e-10, high=1, size=()): 8 | return np.exp(np.random.uniform(low=np.log(low), high=np.log(high), size=size)) 9 | 10 | def stack_many(*args, stack=np.stack): 11 | return tuple([stack(arg) for arg in args]) 12 | 13 | def normalise(x, mean, std): 14 | return (x-mean)/std 15 | 16 | def fold_batch(x): 17 | rows, cols = x.shape[0], x.shape[1] 18 | y = x.reshape(rows*cols,*x.shape[2:]) 19 | return y 20 | 21 | def unfold_batch(x, length, batch_size): 22 | return x.reshape(length, batch_size, *x.shape[1:]) 23 | 24 | def fold_many(*args): 25 | return tuple([fold_batch(arg) for arg in args]) 26 | 27 | def one_hot(x, num_classes): 28 | return np.eye(num_classes)[x] 29 | 30 | def totorch(x, device='cuda'): 31 | x = torch.from_numpy(x).float().to(device) 32 | return x 33 | 34 | def tonumpy(x): 35 | return x.detach().cpu().numpy() 36 | 37 | def tonumpy_many(*args): 38 | return tuple([tonumpy(arg) for arg in args]) 39 | 40 | def totorch_many(*args, device='cuda'): 41 | return tuple([totorch(arg, device) for arg in args]) 42 | 43 | class Welfords_algorithm(object): 44 | #https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm 45 | def __init__(self, mean=0, epsilon=1e-4): 46 | self.mean = mean 47 | self.n = epsilon 48 | self.M2 = 1 49 | 50 | def update(self, x): 51 | return self.update_from_mean(x.mean(axis=0)) 52 | 53 | def update_from_mean(self, x): 54 | self.n +=1 55 | prev_mean = self.mean 56 | new_mean = prev_mean + ((x - prev_mean) / self.n) 57 | self.M2 += (x - new_mean) * (x - prev_mean) 58 | self.var = self.M2 / self.n 59 | self.mean = new_mean 60 | return self.mean, 
np.sqrt(self.var) 61 | 62 | #https://github.com/openai/baselines/blob/master/baselines/common/running_mean_std.py 63 | class RunningMeanStd(object): 64 | # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 65 | def __init__(self, epsilon=1e-4, shape=(), dtype=np.float32): 66 | self.mean = np.zeros(shape, dtype=dtype) 67 | self.var = np.ones(shape, dtype=dtype) 68 | self.count = epsilon 69 | 70 | def update(self, x): 71 | batch_mean = np.mean(x, axis=0) 72 | batch_var = np.var(x, axis=0) 73 | batch_count = x.shape[0] 74 | return self.update_from_moments(batch_mean, batch_var, batch_count) 75 | 76 | def update_from_moments(self, batch_mean, batch_var, batch_count): 77 | delta = batch_mean - self.mean 78 | tot_count = self.count + batch_count 79 | 80 | new_mean = self.mean + delta * batch_count / tot_count 81 | m_a = self.var * (self.count) 82 | m_b = batch_var * (batch_count) 83 | M2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count) 84 | new_var = M2 / (self.count + batch_count) 85 | 86 | new_count = batch_count + self.count 87 | 88 | self.mean = new_mean 89 | self.var = new_var 90 | self.count = new_count 91 | 92 | return self.mean, np.sqrt(self.var) 93 | 94 | 95 | 96 | def nstep_return(rewards, last_values, dones, gamma=0.99, clip=False): 97 | if clip: 98 | rewards = np.clip(rewards, -1, 1) 99 | 100 | T = len(rewards) 101 | 102 | # Calculate R for advantage A = R - V 103 | R = np.zeros_like(rewards) 104 | R[-1] = last_values * (1-dones[-1]) 105 | 106 | for i in reversed(range(T-1)): 107 | # restart score if done as BatchEnv automatically resets after end of episode 108 | R[i] = rewards[i] + gamma * R[i+1] * (1-dones[i]) 109 | 110 | return R 111 | 112 | def lambda_return(rewards, values, last_values, dones, gamma=0.99, lambda_=0.8, clip=False): 113 | if clip: 114 | rewards = np.clip(rewards, -1, 1) 115 | T = len(rewards) 116 | # Calculate eligibility trace R^lambda 117 | R = np.zeros_like(rewards) 118 | R[-1] = last_values * (1-dones[-1]) 119 | for t in reversed(range(T-1)): 120 | # restart score if done as BatchEnv automatically resets after end of episode 121 | R[t] = rewards[t] + gamma * (lambda_* R[t+1] + (1.0-lambda_) * values[t+1]) * (1-dones[t]) 122 | 123 | return R 124 | 125 | def GAE(rewards, values, last_values, dones, gamma=0.99, lambda_=0.95, clip=False): 126 | if clip: 127 | rewards = np.clip(rewards, -1, 1) 128 | # Generalised Advantage Estimation 129 | Adv = np.zeros_like(rewards) 130 | Adv[-1] = rewards[-1] + gamma * last_values * (1-dones[-1]) - values[-1] 131 | T = len(rewards) 132 | for t in reversed(range(T-1)): 133 | delta = rewards[t] + gamma * values[t+1] * (1-dones[t]) - values[t] 134 | Adv[t] = delta + gamma * lambda_ * Adv[t+1] * (1-dones[t]) 135 | 136 | return Adv -------------------------------------------------------------------------------- /rlib/utils/play.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import pygame 3 | import matplotlib.pyplot as plt 4 | #from gym.utils import play 5 | 6 | from collections import deque 7 | from pygame.locals import VIDEORESIZE 8 | 9 | 10 | def display_arr(screen, arr, video_size, transpose): 11 | arr_min, arr_max = arr.min(), arr.max() 12 | arr = 255.0 * (arr - arr_min) / (arr_max - arr_min) 13 | pyg_img = pygame.surfarray.make_surface(arr.swapaxes(0, 1) if transpose else arr) 14 | pyg_img = pygame.transform.scale(pyg_img, video_size) 15 | screen.blit(pyg_img, (0,0)) 16 | 17 | def play(env, 
transpose=True, fps=30, zoom=None, callback=None, keys_to_action=None): 18 | """Allows one to play the game using the keyboard. 19 | To simply play the game use: 20 | play(gym.make("Pong-v4")) 21 | The above code also works if env is wrapped, so it is particularly useful for 22 | verifying that the frame-level preprocessing does not render the game 23 | unplayable. 24 | If you wish to plot real-time statistics as you play, you can use 25 | gym.utils.play.PlayPlot. Here's a sample code for plotting the reward 26 | for the last 5 seconds of gameplay. 27 | def callback(obs_t, obs_tp1, action, rew, done, info): 28 | return [rew,] 29 | plotter = PlayPlot(callback, 30 * 5, ["reward"]) 30 | env = gym.make("Pong-v4") 31 | play(env, callback=plotter.callback) 32 | Arguments 33 | --------- 34 | env: gym.Env 35 | Environment to use for playing. 36 | transpose: bool 37 | If True the output of observation is transposed. 38 | Defaults to True. 39 | fps: int 40 | Maximum number of steps of the environment to execute every second. 41 | Defaults to 30. 42 | zoom: float 43 | Scale the rendered frames by this factor. 44 | callback: lambda or None 45 | If a callback is provided it will be executed after 46 | every step. It takes the following input: 47 | obs_t: observation before performing action 48 | obs_tp1: observation after performing action 49 | action: action that was executed 50 | rew: reward that was received 51 | done: whether the environment is done or not 52 | info: debug info 53 | keys_to_action: dict: tuple(int) -> int or None 54 | Mapping from keys pressed to action performed. 55 | For example, if pressing 'w' and space at the same time is supposed 56 | to trigger action number 2 then the keys_to_action dict would look like this: 57 | { 58 | # ... 59 | tuple(sorted((ord('w'), ord(' ')))): 2 60 | # ... 61 | } 62 | If None, the default keys_to_action mapping for that env is used, if provided.
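    A complete call for an env without a built-in mapping (an illustrative
    sketch only; 'MountainCar-v0' and the chosen keys are assumptions, not
    part of this module):
        keys_to_action = {
            (): 1,           # no key pressed -> no push
            (ord('a'),): 0,  # 'a'            -> push left
            (ord('d'),): 2,  # 'd'            -> push right
        }
        play(gym.make('MountainCar-v0'), keys_to_action=keys_to_action, zoom=2)
    Each key must be a tuple of key codes in ascending (sorted) order, because
    the loop below looks actions up with tuple(sorted(pressed_keys)).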
63 | """ 64 | env.reset() 65 | rendered=env.render( mode='rgb_array') 66 | 67 | if keys_to_action is None: 68 | if hasattr(env, 'get_keys_to_action'): 69 | keys_to_action = env.get_keys_to_action() 70 | elif hasattr(env.unwrapped, 'get_keys_to_action'): 71 | keys_to_action = env.unwrapped.get_keys_to_action() 72 | else: 73 | assert False, env.spec.id + " does not have explicit key to action mapping, " + \ 74 | "please specify one manually" 75 | relevant_keys = set(sum(map(list, keys_to_action.keys()),[])) 76 | 77 | video_size=[rendered.shape[1],rendered.shape[0]] 78 | if zoom is not None: 79 | video_size = int(video_size[0] * zoom), int(video_size[1] * zoom) 80 | 81 | pressed_keys = [] 82 | running = True 83 | env_done = True 84 | 85 | screen = pygame.display.set_mode(video_size) 86 | clock = pygame.time.Clock() 87 | 88 | 89 | while running: 90 | if env_done: 91 | env_done = False 92 | obs = env.reset() 93 | else: 94 | action = keys_to_action.get(tuple(sorted(pressed_keys)), 0) 95 | prev_obs = obs 96 | obs, rew, env_done, info = env.step(action) 97 | if callback is not None: 98 | callback(prev_obs, obs, action, rew, env_done, info) 99 | if obs is not None: 100 | rendered=env.render( mode='rgb_array') 101 | display_arr(screen, rendered, transpose=transpose, video_size=video_size) 102 | 103 | # process pygame events 104 | for event in pygame.event.get(): 105 | # test events, set key states 106 | if event.type == pygame.KEYDOWN: 107 | if event.key in relevant_keys: 108 | pressed_keys.append(event.key) 109 | elif event.key == 27: 110 | running = False 111 | elif event.type == pygame.KEYUP: 112 | if event.key in relevant_keys: 113 | pressed_keys.remove(event.key) 114 | elif event.type == pygame.QUIT: 115 | running = False 116 | elif event.type == VIDEORESIZE: 117 | video_size = event.size 118 | screen = pygame.display.set_mode(video_size) 119 | print(video_size) 120 | 121 | pygame.display.flip() 122 | clock.tick(fps) 123 | pygame.quit() 124 | 125 | class PlayPlot(object): 126 | def __init__(self, callback, horizon_timesteps, plot_names): 127 | self.data_callback = callback 128 | self.horizon_timesteps = horizon_timesteps 129 | self.plot_names = plot_names 130 | 131 | assert plt is not None, "matplotlib backend failed, plotting will not work" 132 | 133 | num_plots = len(self.plot_names) 134 | self.fig, self.ax = plt.subplots(num_plots) 135 | if num_plots == 1: 136 | self.ax = [self.ax] 137 | for axis, name in zip(self.ax, plot_names): 138 | axis.set_title(name) 139 | self.t = 0 140 | self.cur_plot = [None for _ in range(num_plots)] 141 | self.data = [deque(maxlen=horizon_timesteps) for _ in range(num_plots)] 142 | 143 | def callback(self, obs_t, obs_tp1, action, rew, done, info): 144 | points = self.data_callback(obs_t, obs_tp1, action, rew, done, info) 145 | for point, data_series in zip(points, self.data): 146 | data_series.append(point) 147 | self.t += 1 148 | 149 | xmin, xmax = max(0, self.t - self.horizon_timesteps), self.t 150 | 151 | for i, plot in enumerate(self.cur_plot): 152 | if plot is not None: 153 | plot.remove() 154 | self.cur_plot[i] = self.ax[i].scatter(range(xmin, xmax), list(self.data[i]), c='blue') 155 | self.ax[i].set_xlim(xmin, xmax) 156 | plt.pause(0.000001) 157 | 158 | if __name__ == '__main__': 159 | env = gym.make('MountainCar-v0') 160 | def callback(obs_t, obs_tp1, action, rew, done, info): 161 | return [rew,] 162 | plotter = PlayPlot(callback, 30 * 5, ["reward"]) 163 | play(env, callback=plotter.callback) 
-------------------------------------------------------------------------------- /rlib/A2C/A2C.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import numpy as np 4 | import time, datetime 5 | 6 | from rlib.networks.networks import* 7 | from rlib.utils.VecEnv import* 8 | from rlib.utils.wrappers import* 9 | from rlib.utils.SyncMultiEnvTrainer import SyncMultiEnvTrainer 10 | from rlib.utils.utils import stack_many, totorch, fastsample 11 | from rlib.A2C.ActorCritic import ActorCritic 12 | 13 | class A2C(SyncMultiEnvTrainer): 14 | def __init__(self, envs, model, val_envs, train_mode='nstep', return_type='nstep', log_dir='logs/A2C', model_dir='models/A2C', total_steps=10000, nsteps=5, gamma=0.99, lambda_=0.95, 15 | validate_freq=1e6, save_freq=0, render_freq=0, num_val_episodes=50, max_val_steps=10000, log_scalars=True): 16 | 17 | super().__init__(envs, model, val_envs, log_dir=log_dir, model_dir=model_dir, train_mode=train_mode, return_type=return_type, total_steps=total_steps, nsteps=nsteps, 18 | gamma=gamma, lambda_=lambda_, validate_freq=validate_freq, save_freq=save_freq, render_freq=render_freq, 19 | num_val_episodes=num_val_episodes, max_val_steps=max_val_steps, log_scalars=log_scalars) 20 | 21 | hyperparas = {'learning_rate':model.lr, 'learning_rate_final':model.lr_final, 'lr_decay_steps':model.decay_steps , 'grad_clip':model.grad_clip, 'nsteps':nsteps, 'num_workers':self.num_envs, 22 | 'total_steps':total_steps, 'entropy_coefficient':model.entropy_coeff, 'value_coefficient':model.value_coeff , 'return type':self.return_type, 'gamma':self.gamma, 'lambda':self.lambda_} 23 | 24 | if log_scalars: 25 | filename = log_dir + '/' + 'hyperparameters.txt' 26 | self.save_hyperparameters(filename , **hyperparas) 27 | 28 | def get_action(self, state): 29 | policy, value = self.model.evaluate(state) 30 | action = int(fastsample(policy)) 31 | return action 32 | 33 | def rollout(self,): 34 | rollout = [] 35 | for t in range(self.nsteps): 36 | policies, values = self.model.evaluate(self.states) 37 | actions = fastsample(policies) 38 | next_states, rewards, dones, infos = self.env.step(actions) 39 | rollout.append((self.states, actions, rewards, values, dones)) 40 | self.states = next_states 41 | 42 | states, actions, rewards, values, dones = stack_many(*zip(*rollout)) 43 | _, last_values = self.model.evaluate(next_states) 44 | return states, actions, rewards, dones, values, last_values 45 | 46 | def _train_onestep(self): 47 | states = self.env.reset() 48 | y = np.zeros((self.num_envs)) 49 | num_steps = self.total_steps // self.num_envs 50 | for t in range(1,num_steps+1): 51 | policies, values = self.model.evaluate(self.states) 52 | actions = fastsample(policies) 53 | next_states, rewards, dones, infos = self.env.step(actions) 54 | y = rewards + self.gamma * self.model.get_value(next_states) * (1-dones) 55 | 56 | l = self.model.backprop(states, y, actions) 57 | states = next_states 58 | 59 | if self.render_freq > 0 and t % ((self.validate_freq // self.num_envs) * self.render_freq) == 0: 60 | render = True 61 | else: 62 | render = False 63 | 64 | if self.validate_freq > 0 and t % (self.validate_freq // self.num_envs) == 0: 65 | self.validation_summary(t,l,start,render) 66 | start = time.time() 67 | 68 | if self.save_freq > 0 and t % (self.save_freq // self.num_envs) == 0: 69 | self.s += 1 70 | self.save(self.s) 71 | print('saved model') 72 | 73 | 74 | 75 | def main(env_id): 76 | num_envs = 32 77 | nsteps = 20 78 | 79 | current_time 
= datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') 80 | 81 | train_log_dir = 'logs/A2C/' + env_id +'/GAE/' + current_time 82 | model_dir = "models/A2C/" + env_id + '/GAE/' + current_time 83 | 84 | 85 | classic_list = ['MountainCar-v0', 'Acrobot-v1', 'LunarLander-v2', 'CartPole-v0', 'CartPole-v1'] 86 | if any(env_id in s for s in classic_list): 87 | print('Classic Control') 88 | val_envs = [gym.make(env_id) for i in range(10)] 89 | envs = BatchEnv(DummyEnv, env_id, num_envs, blocking=False) 90 | 91 | elif 'ApplePicker' in env_id: 92 | print('ApplePicker') 93 | make_args = {'num_objects':100, 'default_reward':-0.1} 94 | val_envs = [gym.make(env_id, **make_args) for i in range(10)] 95 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=True, make_args=make_args) 96 | print(val_envs[0]) 97 | print(envs.envs[0]) 98 | 99 | else: 100 | print('Atari') 101 | env = gym.make(env_id) 102 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 103 | reset = True 104 | print('fire on reset') 105 | else: 106 | reset = False 107 | print('only stack frames') 108 | env.close() 109 | val_envs = [AtariEnv(gym.make(env_id), k=4, episodic=False, reset=reset, clip_reward=False) for i in range(16)] 110 | envs = BatchEnv(AtariEnv, env_id, num_envs, blocking=False, k=4, reset=reset, episodic=False, clip_reward=True) 111 | 112 | action_size = val_envs[0].action_space.n 113 | input_size = val_envs[0].reset().shape 114 | print('input shape', input_size) 115 | print('action space', action_size) 116 | 117 | 118 | 119 | model = ActorCritic(UniverseCNN, 120 | input_size, 121 | action_size, 122 | lr=1e-3, 123 | lr_final=1e-4, 124 | entropy_coeff=0.01, 125 | decay_steps=50e6//(num_envs*nsteps), 126 | grad_clip=0.5) 127 | 128 | 129 | a2c = A2C(envs=envs, 130 | model=model, 131 | model_dir=model_dir, 132 | log_dir=train_log_dir, 133 | val_envs=val_envs, 134 | train_mode='nstep', 135 | return_type='GAE', 136 | total_steps=50e6, 137 | nsteps=nsteps, 138 | validate_freq=1e5, 139 | save_freq=0, 140 | render_freq=0, 141 | num_val_episodes=50, 142 | log_scalars=False) 143 | 144 | a2c.train() 145 | 146 | del a2c 147 | 148 | # a2c = A2C.load(A2C, model, envs, val_envs, model_dir + time + '/1.trainer') 149 | # a2c.train() 150 | 151 | 152 | if __name__ == "__main__": 153 | env_id_list = ['SpaceInvadersDeterministic-v4', 'FreewayDeterministic-v4', 'MontezumaRevengeDeterministic-v4', 'PongDeterministic-v4'] 154 | #env_id_list = ['MontezumaRevengeDeterministic-v4'] 155 | #env_id_list = ['MountainCar-v0', 'Acrobot-v1', 'CartPole-v1', ] 156 | #env_id_list = ['ApplePicker-v0'] 157 | for env_id in env_id_list: 158 | main(env_id) 159 | -------------------------------------------------------------------------------- /rlib/A2C/ActorCritic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from rlib.networks.networks import MaskedLSTMCell, MaskedRNN, MaskedLSTMBlock 5 | from rlib.utils.schedulers import polynomial_sheduler 6 | from rlib.utils.utils import totorch, tonumpy, totorch_many, tonumpy_many 7 | 8 | class ActorCritic(torch.nn.Module): 9 | def __init__(self, model, input_size, action_size, entropy_coeff=0.01, value_coeff=0.5, lr=1e-3, lr_final=1e-6, 10 | decay_steps=6e5, grad_clip=0.5, build_optimiser=True, optim=torch.optim.RMSprop, optim_args={}, device='cuda', **model_args): 11 | super(ActorCritic, self).__init__() 12 | self.lr = lr 13 | self.lr_final = lr_final 14 | self.entropy_coeff = 
entropy_coeff 15 | self.value_coeff = value_coeff 16 | self.decay_steps = decay_steps 17 | self.grad_clip = grad_clip 18 | self.action_size = action_size 19 | self.device = device 20 | 21 | self.model = model(input_size, **model_args).to(self.device) 22 | self.dense_size = self.model.dense_size 23 | self.policy_distrib = torch.nn.Linear(self.dense_size, action_size).to(self.device) # Actor 24 | self.V = torch.nn.Linear(self.dense_size, 1).to(self.device) # Critic 25 | 26 | if build_optimiser: 27 | self.optimiser = optim(self.parameters(), lr, **optim_args) 28 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 29 | 30 | def loss(self, policy, R, V, actions_onehot): 31 | Advantage = R - V 32 | value_loss = 0.5 * torch.mean(torch.square(Advantage)) 33 | 34 | log_policy = torch.log(torch.clip(policy, 1e-6, 0.999999)) 35 | log_policy_actions = torch.sum(log_policy * actions_onehot, dim=1) 36 | policy_loss = torch.mean(-log_policy_actions * Advantage.detach()) 37 | 38 | entropy = torch.mean(torch.sum(policy * -log_policy, dim=1)) 39 | loss = policy_loss + self.value_coeff * value_loss - self.entropy_coeff * entropy 40 | return loss 41 | 42 | def forward(self, state): 43 | enc_state = self.model(state) 44 | policy = F.softmax(self.policy_distrib(enc_state), dim=-1) 45 | value = self.V(enc_state).view(-1) 46 | return policy, value 47 | 48 | def evaluate(self, state:np.ndarray): 49 | state = totorch(state, self.device) 50 | with torch.no_grad(): 51 | policy, value = self.forward(state) 52 | return tonumpy(policy), tonumpy(value) 53 | 54 | def backprop(self, state, R, action): 55 | state, R, action = totorch_many(state, R, action, device=self.device) 56 | action_onehot = F.one_hot(action.long(), num_classes=self.action_size) 57 | policy, value = self.forward(state) 58 | loss = self.loss(policy, R, value, action_onehot) 59 | loss.backward() 60 | if self.grad_clip is not None: 61 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 62 | self.optimiser.step() 63 | self.optimiser.zero_grad() 64 | self.scheduler.step() 65 | return loss.detach().cpu().numpy() 66 | 67 | 68 | 69 | class ActorCritic_LSTM(torch.nn.Module): 70 | def __init__(self, model, input_size, action_size, cell_size, entropy_coeff=0.01, value_coeff=0.5, 71 | lr=1e-3, lr_final=1e-6, decay_steps=6e5, grad_clip=0.5, build_optimiser=True, optim=torch.optim.RMSprop, optim_args={}, device='cuda', **model_args): 72 | super(ActorCritic_LSTM, self).__init__() 73 | self.lr = lr 74 | self.lr_final = lr_final 75 | self.input_size = input_size 76 | self.entropy_coeff = entropy_coeff 77 | self.value_coeff = value_coeff 78 | self.decay_steps = decay_steps 79 | self.grad_clip = grad_clip 80 | self.cell_size = cell_size 81 | self.action_size = action_size 82 | self.device = device 83 | 84 | 85 | self.model = model(input_size, **model_args).to(self.device) 86 | self.dense_size = self.model.dense_size 87 | #self.lstm = MaskedRNN(MaskedLSTMCell(cell_size, self.dense_size), time_major=True) 88 | self.lstm = MaskedLSTMBlock(self.dense_size, cell_size, time_major=True).to(self.device) 89 | 90 | self.policy_distrib = torch.nn.Linear(cell_size, action_size, device=self.device) # Actor 91 | self.V = torch.nn.Linear(cell_size, 1, device=self.device) # Critic 92 | 93 | 94 | if build_optimiser: 95 | self.optimiser = optim(self.parameters(), lr, **optim_args) 96 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 97 | 98 | def loss(self, policy, R, V, actions_onehot): 99 | Advantage 
= R - V 100 | value_loss = 0.5 * torch.mean(torch.square(Advantage)) 101 | 102 | log_policy = torch.log(torch.clip(policy, 1e-6, 0.999999)) 103 | log_policy_actions = torch.sum(log_policy * actions_onehot, dim=1) 104 | policy_loss = torch.mean(-log_policy_actions * Advantage.detach()) 105 | 106 | entropy = torch.mean(torch.sum(policy * -log_policy, dim=1)) 107 | loss = policy_loss + self.value_coeff * value_loss - self.entropy_coeff * entropy 108 | return loss 109 | 110 | def forward(self, state, hidden=None, done=None): 111 | T, num_envs = state.shape[:2] 112 | folded_state = state.view(-1, *self.input_size) 113 | enc_state = self.model(folded_state) 114 | folded_enc_state = enc_state.view(T, num_envs, self.dense_size) 115 | lstm_outputs, hidden = self.lstm(folded_enc_state, hidden, done) 116 | policy = F.softmax(self.policy_distrib(lstm_outputs), dim=-1).view(-1, self.action_size) 117 | value = self.V(lstm_outputs).view(-1) 118 | return policy, value, hidden 119 | 120 | def evaluate(self, state:np.ndarray, hidden:np.ndarray=None, done=None): 121 | state = totorch(state, self.device) 122 | hidden = totorch_many(*hidden, device=self.device) if hidden is not None else None 123 | with torch.no_grad(): 124 | policy, value, hidden = self.forward(state, hidden, done) 125 | return tonumpy(policy), tonumpy(value), tonumpy_many(*hidden) 126 | 127 | def backprop(self, state, R, action, hidden, done): 128 | state, R, action, done = totorch_many(state, R, action, done, device=self.device) 129 | hidden = totorch_many(*hidden, device=self.device) 130 | action_onehot = F.one_hot(action.long(), num_classes=self.action_size) 131 | policy, value, hidden = self.forward(state, hidden, done) 132 | loss = self.loss(policy, R, value, action_onehot) 133 | loss.backward() 134 | if self.grad_clip is not None: 135 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 136 | self.optimiser.step() 137 | self.optimiser.zero_grad() 138 | self.scheduler.step() 139 | return loss.detach().cpu().numpy() 140 | 141 | def get_initial_hidden(self, batch_size): 142 | return np.zeros((1, batch_size, self.cell_size)), np.zeros((1, batch_size, self.cell_size)) 143 | 144 | def mask_hidden(self, hidden, dones): 145 | mask = (1-dones).reshape(-1, 1) 146 | return (hidden[0]*mask, hidden[1]*mask) -------------------------------------------------------------------------------- /rlib/A3C/A3C.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.multiprocessing as mp 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import math 7 | import time 8 | 9 | from rlib.A2C.ActorCritic import ActorCritic 10 | from rlib.networks.networks import NatureCNN 11 | from rlib.utils.wrappers import AtariEnv 12 | from rlib.utils.utils import stack_many, tonumpy, totorch, lambda_return 13 | 14 | 15 | def train(global_model, model, env, nsteps, num_episodes, ID): 16 | opt = torch.optim.RMSprop(global_model.parameters(), lr=1e-3) 17 | episode = 0 18 | episode_steps = 0 19 | episode_score = 0 20 | T = 0 21 | state = env.reset() 22 | start = time.time() 23 | while episode < num_episodes: 24 | rollout = [] 25 | for t in range(nsteps): 26 | with torch.no_grad(): 27 | policy, value = model(totorch(state[None], device='cpu')) 28 | policy, value = tonumpy(policy), tonumpy(value) 29 | action = np.random.choice(policy.shape[1], p=policy[0]) 30 | next_state, reward, done, info = env.step(action) 31 | episode_score += reward 32 | rollout.append((state, action, 
reward, value, done)) 33 | state = next_state 34 | 35 | T += 1 36 | episode_steps += 1 37 | 38 | if done or t == nsteps-1: 39 | states, actions, rewards, values, dones = stack_many(*zip(*rollout)) 40 | with torch.no_grad(): 41 | _, last_values = model.forward(totorch(next_state[None], device='cpu')) 42 | last_values = last_values.cpu().numpy() 43 | 44 | 45 | R = lambda_return(rewards, values, last_values, dones, gamma=0.9, lambda_=0.95, clip=False) 46 | 47 | loss = update_params(model, global_model, opt, states, actions, R) 48 | 49 | #self.T += t 50 | 51 | if done: 52 | episode += 1 53 | state = env.reset() 54 | if episode % 1 == 0: 55 | time_taken = time.time() - start 56 | print(f'worker {ID}, total worker steps {T:,} local episode {episode}, episode score {episode_score} episode steps {episode_steps}, time taken {time_taken:,.1f}s, fps {episode_steps/time_taken:.2f}') 57 | episode_steps = 0 58 | episode_score = 0 59 | start = time.time() 60 | break 61 | 62 | 63 | def update_params(lm, gm, gopt, states, actions, R): 64 | states, R, actions = totorch(states, 'cpu'), totorch(R, 'cpu'), totorch(actions, 'cpu') 65 | actions_onehot = F.one_hot(actions.long(), num_classes=lm.action_size) 66 | policies, values = lm.forward(states) 67 | loss = lm.loss(policies, R, values, actions_onehot) 68 | 69 | loss.backward() 70 | 71 | if lm.grad_clip is not None: 72 | torch.nn.utils.clip_grad_norm_(lm.parameters(), lm.grad_clip) 73 | 74 | for local_param, global_param in zip(lm.parameters(), gm.parameters()): 75 | global_param._grad = local_param.grad 76 | 77 | gopt.step() 78 | gopt.zero_grad() 79 | #self.scheduler.step() 80 | 81 | lm.load_state_dict(gm.state_dict()) 82 | return loss.detach().cpu().numpy() 83 | 84 | 85 | 86 | # class SharedAdam(torch.optim.Adam): 87 | # def __init__(self, params, lr=1e-3, betas=(0.9, 0.99), eps=1e-8, 88 | # weight_decay=0): 89 | # super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 90 | # # State initialization 91 | # for group in self.param_groups: 92 | # for p in group['params']: 93 | # state = self.state[p] 94 | # state['step'] = 0 95 | # state['exp_avg'] = torch.zeros_like(p.data) 96 | # state['exp_avg_sq'] = torch.zeros_like(p.data) 97 | 98 | # # share in memory 99 | # state['exp_avg'].share_memory_() 100 | # state['exp_avg_sq'].share_memory_() 101 | 102 | class SharedAdam(torch.optim.Adam): 103 | """Implements Adam algorithm with shared states. 104 | """ 105 | 106 | def __init__(self, 107 | params, 108 | lr=1e-3, 109 | betas=(0.9, 0.999), 110 | eps=1e-8, 111 | weight_decay=0): 112 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 113 | 114 | for group in self.param_groups: 115 | for p in group['params']: 116 | state = self.state[p] 117 | state['step'] = torch.zeros(1) 118 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() 119 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() 120 | 121 | def share_memory(self): 122 | for group in self.param_groups: 123 | for p in group['params']: 124 | state = self.state[p] 125 | state['step'].share_memory_() 126 | state['exp_avg'].share_memory_() 127 | state['exp_avg_sq'].share_memory_() 128 | 129 | def step(self, closure=None): 130 | """Performs a single optimization step. 131 | Arguments: 132 | closure (callable, optional): A closure that reevaluates the model 133 | and returns the loss. 
134 | """ 135 | loss = None 136 | if closure is not None: 137 | loss = closure() 138 | 139 | for group in self.param_groups: 140 | for p in group['params']: 141 | if p.grad is None: 142 | continue 143 | grad = p.grad.data 144 | state = self.state[p] 145 | 146 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 147 | beta1, beta2 = group['betas'] 148 | 149 | state['step'] += 1 150 | 151 | if group['weight_decay'] != 0: 152 | grad = grad.add(group['weight_decay'], p.data) 153 | 154 | # Decay the first and second moment running average coefficient 155 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 156 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 157 | 158 | denom = exp_avg_sq.sqrt().add_(group['eps']) 159 | 160 | bias_correction1 = 1 - beta1 ** state['step'].item() 161 | bias_correction2 = 1 - beta2 ** state['step'].item() 162 | step_size = group['lr'] * math.sqrt( 163 | bias_correction2) / bias_correction1 164 | 165 | p.data.addcdiv_(-step_size, exp_avg, denom) 166 | 167 | return loss 168 | 169 | 170 | 171 | 172 | if __name__ == '__main__': 173 | env_id = 'SpaceInvadersDeterministic-v4' 174 | env = AtariEnv(gym.make(env_id), reset=True) 175 | input_size = env.reset().shape 176 | action_size = env.action_space.n 177 | 178 | print('action_size', action_size) 179 | 180 | global_model = ActorCritic(NatureCNN, input_size, action_size, build_optimiser=False) 181 | global_model.share_memory() 182 | 183 | #opt = SharedAdam(global_model.parameters(), lr=1e-3) 184 | #opt.share_memory() 185 | 186 | #actor = ActorCritic(NatureCNN, input_size, action_size) 187 | env_args = dict(k=4, rescale=84, episodic=True, reset=True, clip_reward=True, Noop=True, time_limit=None, channels_first=True) 188 | model_args = dict(model=NatureCNN, input_size=input_size, action_size=action_size, build_optimiser=False) 189 | 190 | processes = [] 191 | for rank in range(8): 192 | p = mp.Process(target=train, args=(global_model, ActorCritic(**model_args), AtariEnv(gym.make(env_id), **env_args), 20, 1000, rank)) 193 | p.start() 194 | processes.append(p) 195 | time.sleep(0.5) 196 | for p in processes: 197 | p.join() -------------------------------------------------------------------------------- /rlib/utils/ReplayMemory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import time, copy 4 | import scipy.misc 5 | from collections import deque 6 | #from DoubleDQN import ReplayMemory as RM 7 | 8 | class FrameBuffer(object): 9 | def __init__(self, size, width, height, stack, Atari = True): 10 | self._idx = 0 11 | self._replay_length = size 12 | self._stack_size = stack 13 | self._Atari = Atari 14 | self._frames = np.empty((size,width,height), dtype=np.uint8) 15 | self._blank_frame = np.zeros((width,height)) 16 | self._stacked_frames = deque([self._blank_frame for i in range(self._stack_size)], maxlen=self._stack_size) 17 | 18 | def preprocess_frame(self,frame): 19 | frame = scipy.misc.imresize(frame, [110,84,3])[110-84:,0:84,:] 20 | frame = np.dot(frame[...,:3], np.array([0.299, 0.587, 0.114])).astype(dtype=np.uint8) 21 | return frame 22 | 23 | def addFrame(self,frame): 24 | self._frames[self._idx] = self.preprocess_frame(frame) 25 | self._idx = (self._idx +1) % self._replay_length 26 | 27 | def stack_frames(self,frame,reset=False): 28 | self.addFrame(frame) 29 | if reset: 30 | for _ in range(self._stack_size): 31 | self._stacked_frames.append(self._frames[self._idx-1]) 32 | else: 33 | self._stacked_frames.append(self._frames[self._idx-1]) 
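        # note: the reset branch above fills the deque with stack_size copies of
        # the frame just written by addFrame (self._frames[self._idx-1]), so the
        # first stacked observation of an episode has a full, repeated history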
34 | 35 | return copy.copy(self._stacked_frames) 36 | 37 | 38 | class NumpyReplayMemory(object): 39 | def __init__(self, replaysize, shape): 40 | self._idx = 0 41 | self._full_flag = False 42 | self._replay_length = replaysize 43 | self._states = np.zeros((replaysize,*shape), dtype=np.uint8) 44 | self._actions = np.zeros((replaysize), dtype=np.int) 45 | self._rewards = np.zeros((replaysize), dtype=np.int) 46 | self._next_states = np.zeros((replaysize,*shape), dtype=np.uint8) 47 | self._dones = np.zeros((replaysize), dtype=np.int) 48 | #self._stacked_frames = deque([np.zeros((width,height), dtype=np.uint8) for i in range(stack)], maxlen=stack) 49 | 50 | def addMemory(self,state,action,reward,next_state,done): 51 | self._states[self._idx] = state 52 | self._actions[self._idx] = action 53 | self._rewards[self._idx] = reward 54 | self._next_states[self._idx] = next_state 55 | self._dones[self._idx] = done 56 | if self._idx + 1 >= self._replay_length: 57 | self._idx = 0 58 | self._full_flag = True 59 | else: 60 | self._idx += 1 61 | 62 | def __len__(self): 63 | if self._full_flag == False: 64 | return self._idx 65 | else: 66 | return self._replay_length 67 | 68 | 69 | def sample(self,batch_size): 70 | if self._full_flag == False: 71 | idxs = np.random.choice(self._idx, size=batch_size, replace=False) 72 | else: 73 | idxs = np.random.choice(self._replay_length, size=batch_size, replace=False) 74 | 75 | states = self._states[idxs] 76 | actions = self._actions[idxs] 77 | rewards = self._rewards[idxs] 78 | next_states = self._next_states[idxs] 79 | dones = self._dones[idxs] 80 | 81 | return states, actions, rewards, next_states, dones, idxs 82 | 83 | class replayMemory(object): 84 | def __init__(self,replay_length,pixels=True): 85 | self._replay_length = replay_length 86 | self._pixels = pixels 87 | self._memory = [] 88 | self._idx = 0 89 | 90 | def addMemory(self,state,action,reward,next_state,final_state): 91 | if len(self._memory) < self._replay_length: 92 | self._memory.append((state,action,reward,next_state,final_state)) 93 | else: 94 | self._memory[self._idx] = (state,action,reward,next_state,final_state) 95 | self._idx = (self._idx +1) % self._replay_length 96 | 97 | def getlen(self): 98 | return len(self._memory) 99 | 100 | def resetMemory(self): 101 | self._memory = [] 102 | self._idx = 0 103 | 104 | def sample(self, batch_size): 105 | idxs = np.random.choice(np.arange(len(self._memory)), size=batch_size, replace=False) 106 | sample = [self._memory[i] for i in idxs ] 107 | 108 | if self._pixels: #stack images to get k previous states 109 | states = np.stack([np.stack(sample[i][0],axis=2) for i in range(len(sample))],axis=0) 110 | next_states = np.stack([np.stack(sample[i][3],axis=2) for i in range(len(sample))],axis=0) 111 | else: 112 | states = np.stack([sample[i][0]for i in range(len(sample))],axis=0) 113 | next_states = np.stack([sample[i][3] for i in range(len(sample))],axis=0) 114 | 115 | actions = np.array([sample[i][1]for i in range(len(sample))]) 116 | rewards = np.array([sample[i][2]for i in range(len(sample))]) 117 | final_state = np.array([sample[i][4]for i in range(len(sample))]) 118 | 119 | return (states,actions,rewards,next_states,final_state) 120 | 121 | 122 | 123 | def stack_frames(frame,stacked_frames,reset=False): 124 | # Preprocess frame 125 | frame = preprocess_frame(frame) 126 | 127 | if reset: 128 | # Clear our stacked_frames 129 | stacked_frames = deque([np.zeros((84,84), dtype=np.uint8) for i in range(4)], maxlen=4) 130 | 131 | # Because we're in a new episode, 
copy the same frame 4x 132 | for i in range(4): 133 | stacked_frames.append(frame) 134 | 135 | # Stack the frames 136 | stacked_state = np.stack(stacked_frames, axis=2) 137 | 138 | else: 139 | # Append frame to deque, automatically removes the oldest frame 140 | stacked_frames.append(frame) 141 | 142 | # Build the stacked state (first dimension specifies different frames) 143 | stacked_state = np.stack(stacked_frames, axis=2) 144 | 145 | return stacked_state, stacked_frames 146 | 147 | def preprocess_frame(frame): 148 | frame = scipy.misc.imresize(frame, [110,84,3])[110-84:,0:84,:] 149 | frame = np.dot(frame[...,:3], np.array([0.299, 0.587, 0.114])).astype(dtype=np.uint8) 150 | return frame 151 | 152 | def main(): 153 | env = gym.make('SpaceInvaders-v0') 154 | replay =NumpyReplayMemory(100000,84,84,4) 155 | #framebuffer = FrameBuffer(100000,84,84,4) 156 | 157 | obs = env.reset() 158 | #state = framebuffer.stack_frames(obs,reset=True) 159 | state = replay.stack_frames(obs,reset=True) 160 | print("state shape", state.shape) 161 | avg_time = 0 162 | for t in range(int(1e7)): 163 | start = time.time() 164 | action = env.action_space.sample() 165 | obs, reward, done, info = env.step(action) 166 | #next_state = framebuffer.stack_frames(obs,reset=False) 167 | next_state = replay.stack_frames(obs,reset=False) 168 | print("next state shape", next_state.shape) 169 | replay.addMemory(state,action,reward,next_state,done) 170 | state = next_state 171 | if done: 172 | obs = env.reset() 173 | #state = framebuffer.stack_frames(obs,reset=True) 174 | state = replay.stack_frames(obs,reset=True) 175 | 176 | 177 | if t > 32 : 178 | 179 | batch_states, batch_actions, batch_rewards, batch_next_states, batch_final_states = replay.sample(32) 180 | end = time.time() 181 | #if t % 100 == 0: 182 | #print("next_state ", t) 183 | #print("batch_actions shape", batch_actions.shape) 184 | #print("next_state shape", len(batch_next_states), ",", batch_states[0].shape) 185 | #for i in range(4): 186 | #scipy.misc.imshow(batch_next_states[-1,:,:,i]) 187 | 188 | avg_time += (end-start) 189 | if t % 10000 == 0: 190 | print("time taken for 10000 steps", avg_time) 191 | avg_time = 0 192 | 193 | if __name__ == "__main__": 194 | main() -------------------------------------------------------------------------------- /rlib/utils/VecEnv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import multiprocessing as mp 4 | from itertools import chain 5 | 6 | # Code was inspired from or modified from OpenAI baselines https://github.com/openai/baselines/tree/master/baselines/common 7 | 8 | class Env(object): 9 | def __init__(self, env, worker_id=0): #, Wrappers=None, **wrapper_args): 10 | #self.env_id = env_id 11 | #env = gym.make(env_id) 12 | self.parent, self.child = mp.Pipe() 13 | self.worker = Worker(worker_id, env, self.child) 14 | self.worker.daemon = True 15 | self.worker.start() 16 | self.open = True 17 | 18 | def __del__(self): 19 | self.close() 20 | self.parent.close() 21 | self.child.close() 22 | 23 | def __getattr__(self, name): 24 | attribute = self._send_step('getattr', name) 25 | return attribute() 26 | 27 | def _send_step(self,cmd,action): 28 | self.parent.send((cmd,action)) 29 | return self._recieve 30 | 31 | def _recieve(self,): 32 | return self.parent.recv() 33 | 34 | def step(self,action, blocking=True): 35 | #if self.open: 36 | results = self._send_step('step', action) 37 | # if blocking: 38 | # return results() 39 | # else: 40 | return results 
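    # Non-blocking usage (an illustrative sketch mirroring the non-blocking path
    # of BatchEnv.step further down, not a new API): step() only sends the command
    # down the pipe and returns the bound _recieve method, so the caller decides
    # when to block on the result, e.g.
    #   pending = [env.step(a, blocking=False) for env, a in zip(envs, actions)]
    #   obs, rewards, dones, infos = zip(*[recv() for recv in pending])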
41 | 42 | def reset(self): 43 | #if self.open: 44 | results = self._send_step('reset', None) 45 | return results() 46 | 47 | def close(self): 48 | if self.open: 49 | self.open = False 50 | results = self._send_step('close', None) 51 | self.worker.join() 52 | 53 | def render(self): 54 | #if self.open: 55 | self._send_step('render', None) 56 | 57 | class Worker(mp.Process): 58 | def __init__(self, worker_id, env, connection): 59 | import gym 60 | np.random.seed() 61 | mp.Process.__init__(self) 62 | self.env = env #gym.make(env_id) 63 | self.worker_id = worker_id 64 | self.connection = connection 65 | 66 | def _step(self): 67 | try: 68 | while True: 69 | cmd, a = self.connection.recv() 70 | if cmd == 'step': 71 | obs, r, done, info = self.env.step(a) 72 | # auto_reset moved to env wrappers 73 | self.connection.send((obs,r,done,info)) 74 | elif cmd == 'render': 75 | self.env.render() 76 | #self.connection.send((1)) 77 | elif cmd == 'reset': 78 | obs = self.env.reset() 79 | self.connection.send(obs) 80 | elif cmd == 'getattr': 81 | self.connection.send(getattr(self.env, a)) 82 | elif cmd == 'close': 83 | self.env.close() 84 | #self.connection.send((1)) 85 | break 86 | except KeyboardInterrupt: 87 | print("closing worker", self.worker_id) 88 | finally: 89 | self.env.close() 90 | #self.connection.close() 91 | 92 | def run(self,): 93 | self._step() 94 | 95 | 96 | 97 | class BatchEnv(object): 98 | def __init__(self, env_constructor, env_id, num_envs, blocking=False, make_args={}, **env_args): 99 | #self.envs = [Env(env_constructor(gym.make(env_id),**env_args),worker_id=i) for i in range(num_envs)] 100 | self.envs = [] 101 | for i in range(num_envs): 102 | env = gym.make(env_id, **make_args) 103 | self.envs.append(Env(env_constructor(env, **env_args))) 104 | #self.envs = [env_constructor(env_id=env_id,**env_args, worker_id=i) for i in range(num_envs)] 105 | self.blocking = blocking 106 | 107 | def __len__(self): 108 | return len(self.envs) 109 | 110 | def __getattr__(self, name): 111 | return getattr(self.envs[0], name) 112 | 113 | def step(self,actions): 114 | if self.blocking: # wait for each process to return results before starting the next 115 | results = [env.step(action,True) for env, action in zip(self.envs,actions)] 116 | else: 117 | results = [env.step(action,False) for env, action in zip(self.envs,actions)] # apply steps async 118 | results = [result() for result in results] # collect results 119 | 120 | obs, rewards, done, info = zip(*results) 121 | return np.stack(obs), np.stack(rewards), np.stack(done), info 122 | 123 | def reset(self): 124 | obs = [env.reset() for env in self.envs] 125 | return np.stack(obs) 126 | 127 | def close(self): 128 | for env in self.envs: 129 | env.close() 130 | 131 | 132 | 133 | def chunks(l, n): 134 | for i in range(0, len(l), n): 135 | yield l[i:i+n] 136 | 137 | class ChunkEnv(object): 138 | def __init__(self, env_id, num_workers, num_chunks): 139 | self.num_workers = num_workers 140 | self.num_chunks = num_chunks 141 | self.env_id = env_id 142 | 143 | self.workers = [] 144 | self.parents = [] 145 | for i in range(num_workers): 146 | parent, child = mp.Pipe() 147 | worker = ChunkWorker(env_id,num_chunks,child) 148 | self.parents.append(parent) 149 | self.workers.append(worker) 150 | 151 | try: 152 | for worker in self.workers: 153 | worker.start() 154 | 155 | except KeyboardInterrupt: 156 | self.close() 157 | exit() 158 | #for w in self.workers: 159 | #w.env.close() 160 | #w.terminate() 161 | #exit() 162 | 163 | 164 | def _send_step(self,cmd,actions): 165 
| for parent, action_chunk in zip(self.parents,chunks(actions, self.num_chunks)): 166 | parent.send((cmd,action_chunk)) 167 | return self._recieve 168 | 169 | def _recieve(self,): 170 | return [parent.recv() for parent in self.parents] 171 | 172 | def step(self,actions,blocking=True): 173 | results = self._send_step('step', actions) 174 | if blocking: 175 | results = list(chain.from_iterable(results())) 176 | obs, rewards, dones, infos = zip(*results) 177 | return np.stack(obs), np.stack(rewards), np.stack(dones), infos 178 | else: 179 | return results 180 | 181 | def reset(self): 182 | results = self._send_step('reset',np.zeros((self.num_chunks*self.num_workers))) 183 | results = list(chain.from_iterable(results())) 184 | return np.stack(results) 185 | 186 | def close(self): 187 | results = self._send_step('close',np.zeros((self.num_chunks*self.num_workers))) 188 | for worker in self.workers: 189 | worker.join() 190 | 191 | class ChunkWorker(mp.Process): 192 | def __init__(self, env_id, num_chunks, connection, render=False): 193 | mp.Process.__init__(self) 194 | self.envs = [gym.make(env_id) for i in range(num_chunks)] 195 | self.connection = connection 196 | self.render = render 197 | 198 | def run(self): 199 | while True: 200 | cmd, actions = self.connection.recv() 201 | if cmd == 'step': 202 | results = [] 203 | for a, env in zip(actions,self.envs): 204 | obs, r, done, info = env.step(a) 205 | # auto_reset moved to env wrappers 206 | if self.render: 207 | self.env.render() 208 | results.append((obs,r,done,info)) 209 | self.connection.send(results) 210 | elif cmd == 'reset': 211 | results = [] 212 | for a, env in zip(actions,self.envs): 213 | obs = env.reset() 214 | results.append(obs) 215 | self.connection.send(results) 216 | elif cmd == 'close': 217 | for env in self.envs: 218 | env.close() 219 | self.connection.send((1)) 220 | break 221 | 222 | 223 | class DummyBatchEnv(object): 224 | def __init__(self, env_constructor, env_id, num_envs, make_args={}, **env_args): 225 | self.envs = [env_constructor(gym.make(env_id, **make_args),**env_args) for i in range(num_envs)] 226 | 227 | def __len__(self): 228 | return len(self.envs) 229 | 230 | def __getattr__(self, name): 231 | return getattr(self.envs[0], name) 232 | 233 | def step(self,actions): 234 | results = [env.step(action) for env, action in zip(self.envs,actions)] 235 | obs, rewards, done, info = zip(*results) 236 | return np.stack(obs).copy(), np.stack(rewards).copy(), np.stack(done).copy(), info 237 | 238 | def reset(self): 239 | obs = [env.reset() for env in self.envs] 240 | return np.stack(obs).copy() 241 | 242 | def close(self): 243 | for env in self.envs: 244 | env.close() -------------------------------------------------------------------------------- /rlib/A2C/A2C_lstm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import gym 4 | import os, time, datetime 5 | import threading 6 | from rlib.A2C.ActorCritic import ActorCritic_LSTM 7 | from rlib.networks.networks import* 8 | from rlib.utils.utils import fold_batch, stack_many, totorch, fastsample 9 | from rlib.utils.SyncMultiEnvTrainer import SyncMultiEnvTrainer 10 | from rlib.utils.VecEnv import* 11 | from rlib.utils.wrappers import* 12 | 13 | 14 | 15 | class A2CLSTM_Trainer(SyncMultiEnvTrainer): 16 | def __init__(self, envs, model, val_envs, train_mode='nstep', return_type='nstep', log_dir='logs/', model_dir='models/', total_steps=1000000, nsteps=20, 17 | validate_freq=1e6, save_freq=0, 
render_freq=0, num_val_episodes=50, max_val_steps=10000, log_scalars=True): 18 | 19 | super().__init__(envs, model, val_envs, log_dir=log_dir, model_dir=model_dir, train_mode=train_mode, return_type=return_type, 20 | total_steps=total_steps, nsteps=nsteps, validate_freq=validate_freq, save_freq=save_freq, 21 | render_freq=render_freq, update_target_freq=0, num_val_episodes=num_val_episodes, max_val_steps=max_val_steps, log_scalars=log_scalars) 22 | 23 | 24 | self.prev_hidden = self.model.get_initial_hidden(self.num_envs) 25 | 26 | hyper_params = {'learning_rate':model.lr, 'learning_rate_final':model.lr_final, 'lr_decay_steps':model.decay_steps , 'grad_clip':model.grad_clip, 'nsteps':self.nsteps, 'num_workers':self.num_envs, 27 | 'total_steps':self.total_steps, 'entropy_coefficient':model.entropy_coeff, 'value_coefficient':model.value_coeff, 'gamma':self.gamma, 'lambda':self.lambda_} 28 | 29 | if self.log_scalars: 30 | filename = log_dir + '/hyperparameters.txt' 31 | self.save_hyperparameters(filename, **hyper_params) 32 | 33 | def _train_nstep(self): 34 | batch_size = (self.num_envs * self.nsteps) 35 | start = time.time() 36 | num_updates = self.total_steps // batch_size 37 | s = 0 38 | # main loop 39 | for t in range(1,num_updates+1): 40 | states, actions, rewards, first_hidden, dones, values, last_values = self.rollout() 41 | 42 | if self.return_type == 'nstep': 43 | R = self.nstep_return(rewards, last_values, dones, gamma=self.gamma) 44 | elif self.return_type == 'GAE': 45 | R = self.GAE(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) + values 46 | elif self.return_type == 'lambda': 47 | R = self.lambda_return(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) 48 | 49 | # stack all states, actions and Rs across all workers into a single batch 50 | actions, R = fold_batch(actions), fold_batch(R) 51 | l = self.model.backprop(states, R, actions, first_hidden, dones) 52 | 53 | if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0: 54 | render = True 55 | else: 56 | render = False 57 | 58 | if self.validate_freq > 0 and t % (self.validate_freq //batch_size) == 0: 59 | self.validation_summary(t,l,start,render) 60 | start = time.time() 61 | 62 | if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0: 63 | s += 1 64 | self.saver.save(self.sess, str(self.model_dir + str(s) + ".ckpt") ) 65 | print('saved model') 66 | 67 | 68 | def _validate_async(self, env, num_ep, max_steps, render=False): 69 | for episode in range(num_ep): 70 | state = env.reset() 71 | episode_score = [] 72 | hidden = self.model.get_initial_hidden(1) 73 | for t in range(max_steps): 74 | policy, value, hidden = self.model.evaluate(state[None, None], hidden) 75 | #print('policy', policy, 'value', value) 76 | action = int(fastsample(policy)) 77 | next_state, reward, done, info = env.step(action) 78 | state = next_state 79 | 80 | episode_score.append(reward) 81 | 82 | if render: 83 | with self.lock: 84 | env.render() 85 | 86 | if done or t == max_steps -1: 87 | tot_reward = np.sum(episode_score) 88 | with self.lock: 89 | self.validate_rewards.append(tot_reward) 90 | 91 | break 92 | if render: 93 | with self.lock: 94 | env.close() 95 | 96 | def validate_sync(self, render): 97 | episode_scores = [] 98 | env = self.val_envs 99 | for episode in range(self.num_val_episodes//len(env)): 100 | states = env.reset() 101 | episode_score = [] 102 | prev_hidden = self.model.get_initial_hidden(len(self.val_envs)) 103 | for t in 
range(self.val_steps): 104 | policies, values, hidden = self.model.evaluate(states[None], prev_hidden) 105 | actions = fastsample(policies) 106 | next_states, rewards, dones, infos = env.step(actions) 107 | states = next_states 108 | 109 | episode_score.append(rewards*(1-dones)) 110 | 111 | if render: 112 | with self.lock: 113 | env.render() 114 | 115 | if dones.sum() == self.num_envs or t == self.val_steps -1: 116 | tot_reward = np.sum(np.stack(episode_score), axis=0) 117 | episode_scores.append(tot_reward) 118 | break 119 | 120 | return np.mean(episode_scores) 121 | 122 | 123 | def rollout(self,): 124 | rollout = [] 125 | first_hidden = self.prev_hidden 126 | for t in range(self.nsteps): 127 | policies, values, hidden = self.model.evaluate(self.states[None], self.prev_hidden) 128 | actions = fastsample(policies) 129 | next_states, rewards, dones, infos = self.env.step(actions) 130 | rollout.append((self.states, actions, rewards, values, dones)) 131 | self.states = next_states 132 | self.prev_hidden = self.model.mask_hidden(hidden, dones) # reset hidden state at end of episode 133 | 134 | states, actions, rewards, values, dones = stack_many(*zip(*rollout)) 135 | _, last_values, _ = self.model.evaluate(self.states[None], self.prev_hidden) 136 | return states, actions, rewards, first_hidden, dones, values, last_values 137 | 138 | 139 | def main(env_id): 140 | num_envs = 32 141 | nsteps = 20 142 | 143 | classic_list = ['MountainCar-v0', 'Acrobot-v1', 'LunarLander-v2', 'CartPole-v0', 'CartPole-v1'] 144 | if any(env_id in s for s in classic_list): 145 | print('Classic Control') 146 | val_envs = [gym.make(env_id) for i in range(10)] 147 | envs = BatchEnv(DummyEnv, env_id, num_envs, blocking=False) 148 | 149 | elif 'ApplePicker' in env_id: 150 | print('ApplePicker') 151 | make_args = {'num_objects':100, 'default_reward':-0.1} 152 | val_envs = [gym.make(env_id, **make_args) for i in range(10)] 153 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=True, make_args=make_args) 154 | print(val_envs[0]) 155 | print(envs.envs[0]) 156 | 157 | else: 158 | print('Atari') 159 | env = gym.make(env_id) 160 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 161 | reset = True 162 | print('fire on reset') 163 | else: 164 | reset = False 165 | print('only stack frames') 166 | 167 | env.close() 168 | val_envs = [AtariEnv(gym.make(env_id), k=1, rescale=84, episodic=False, reset=reset, clip_reward=False) for i in range(16)] 169 | envs = BatchEnv(AtariEnv, env_id, num_envs, rescale=84, blocking=False , k=1, reset=reset, episodic=False, clip_reward=True) 170 | 171 | action_size = val_envs[0].action_space.n 172 | input_size = val_envs[0].reset().shape 173 | 174 | 175 | current_time = datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') 176 | train_log_dir = 'logs/A2C_LSTM/' + env_id +'/' + current_time 177 | model_dir = "models/A2C_LSTM/" + env_id + '/' + current_time 178 | 179 | 180 | model = ActorCritic_LSTM(NatureCNN, 181 | input_size=input_size, 182 | action_size=action_size, 183 | cell_size=256, 184 | lr=1e-3, 185 | lr_final=1e-4, 186 | decay_steps=50e6//(num_envs*nsteps), 187 | grad_clip=0.5, 188 | optim=torch.optim.RMSprop, 189 | device='cuda') 190 | 191 | 192 | a2c_trainer = A2CLSTM_Trainer(envs=envs, 193 | model=model, 194 | model_dir=model_dir, 195 | log_dir=train_log_dir, 196 | val_envs=val_envs, 197 | train_mode='nstep', 198 | return_type='GAE', 199 | total_steps=50e6, 200 | nsteps=nsteps, 201 | validate_freq=1e6, 202 | save_freq=0, 203 | render_freq=0, 204 | 
num_val_episodes=25, 205 | log_scalars=False) 206 | print(env_id) 207 | 208 | a2c_trainer.train() 209 | 210 | del model 211 | 212 | if __name__ == "__main__": 213 | env_id_list = ['SpaceInvadersDeterministic-v4', 'FreewayDeterministic-v4', 'MontezumaRevengeDeterministic-v4', 'PongDeterministic-v4'] 214 | #env_id_list = ['MountainCar-v0', 'Acrobot-v1'] 215 | #env_id_list = ['SuperMarioBros-1-1-v0'] 216 | for env_id in env_id_list: 217 | main(env_id) -------------------------------------------------------------------------------- /rlib/utils/wrappers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from PIL import Image 4 | from collections import deque 5 | import torch 6 | 7 | # Code was inspired from or modified from OpenAI baselines https://github.com/openai/baselines/tree/master/baselines/common 8 | 9 | 10 | def AtariValidate(env): 11 | env = FireResetEnv(env) 12 | env = NoopResetEnv(env, max_op=3000) 13 | env = StackEnv(env) 14 | return env 15 | 16 | class RescaleEnv(gym.Wrapper): 17 | def __init__(self, env, size): 18 | gym.Wrapper.__init__(self, env) 19 | self.size = size 20 | 21 | def preprocess(self, frame): 22 | frame = np.array(Image.fromarray(frame).resize([self.size,self.size])) 23 | frame = np.dot(frame[...,:3], np.array([0.299, 0.587, 0.114])).astype(dtype=np.uint8) 24 | return frame[:,:,np.newaxis] 25 | 26 | def step(self, action): 27 | obs, reward, done, info = self.env.step(action) 28 | return self.preprocess(obs), reward, done, info 29 | 30 | def reset(self, **kwargs): 31 | obs = self.env.reset(**kwargs) 32 | return self.preprocess(obs) 33 | 34 | 35 | class AtariRescale42x42(gym.Wrapper): 36 | def __init__(self, env): 37 | gym.Wrapper.__init__(self, env) 38 | 39 | def preprocess(self,frame): 40 | frame = np.array(Image.fromarray(frame).resize([84,110]))[110-84:,0:84,:] 41 | frame = np.dot(frame[...,:3], np.array([0.299, 0.587, 0.114])).astype(dtype=np.uint8) 42 | frame = np.array(Image.fromarray(frame).resize([42,42])).astype(dtype=np.uint8) 43 | return frame[:,:,np.newaxis] 44 | 45 | def step(self, action): 46 | obs, reward, done, info = self.env.step(action) 47 | return self.preprocess(obs), reward, done, info 48 | 49 | def reset(self, **kwargs): 50 | obs = self.env.reset(**kwargs) 51 | return self.preprocess(obs) 52 | 53 | class AtariRescaleEnv(gym.Wrapper): 54 | def __init__(self, env): 55 | gym.Wrapper.__init__(self, env) 56 | 57 | def preprocess(self,frame): 58 | frame = np.array(Image.fromarray(frame).resize([84,110]))[110-84:,0:84,:] 59 | frame = np.dot(frame[...,:3], np.array([0.299, 0.587, 0.114])).astype(dtype=np.uint8) 60 | return frame[:,:,np.newaxis] 61 | 62 | def step(self, action): 63 | obs, reward, done, info = self.env.step(action) 64 | return self.preprocess(obs), reward, done, info 65 | 66 | def reset(self, **kwargs): 67 | obs = self.env.reset(**kwargs) 68 | return self.preprocess(obs) 69 | 70 | class AtariRescaleColour(gym.Wrapper): 71 | def __init__(self, env): 72 | gym.Wrapper.__init__(self, env) 73 | 74 | def preprocess(self,frame): 75 | frame = np.array(Image.fromarray(frame).resize([84,110]))[110-84:,0:84,:] 76 | return frame 77 | 78 | def step(self, action): 79 | obs, reward, done, info = self.env.step(action) 80 | return self.preprocess(obs), reward, done, info 81 | 82 | def reset(self, **kwargs): 83 | obs = self.env.reset(**kwargs) 84 | return self.preprocess(obs) 85 | 86 | 87 | class DummyEnv(gym.Wrapper): 88 | def __init__(self, env): 89 | gym.Wrapper.__init__(self, 
env) 90 | def step(self, action): 91 | return self.env.step(action) 92 | def reset(self, **kwargs): 93 | return self.env.reset(**kwargs) 94 | 95 | class NoopResetEnv(gym.Wrapper): 96 | def __init__(self, env, max_op=7): 97 | gym.Wrapper.__init__(self, env) 98 | self.max_op = max_op 99 | 100 | def reset(self, **kwargs): 101 | obs = self.env.reset(**kwargs) 102 | noops = np.random.randint(0, self.max_op) 103 | for i in range(noops): 104 | obs, reward, done, info = self.env.step(0) 105 | return obs 106 | 107 | def step(self, action): 108 | return self.env.step(action) 109 | 110 | class ClipRewardEnv(gym.Wrapper): 111 | def __init__(self, env): 112 | gym.Wrapper.__init__(self, env) 113 | 114 | def step(self, action): 115 | obs, reward, done, info = self.env.step(action) 116 | reward = np.clip(reward, -1, 1) 117 | return obs, reward, done, info 118 | 119 | def reset(self, **kwargs): 120 | return self.env.reset(**kwargs) 121 | 122 | class NoRewardEnv(gym.Wrapper): 123 | def __init__(self, env): 124 | gym.Wrapper.__init__(self, env) 125 | 126 | def step(self, action): 127 | obs, reward, done, info = self.env.step(action) 128 | return obs, 0, done, info 129 | 130 | def reset(self, **kwargs): 131 | return self.env.reset(**kwargs) 132 | 133 | class FireResetEnv(gym.Wrapper): 134 | def __init__(self, env): 135 | """Take action on reset for environments that are fixed until firing.""" 136 | gym.Wrapper.__init__(self, env) 137 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 138 | assert len(env.unwrapped.get_action_meanings()) >= 3 139 | 140 | def reset(self, **kwargs): 141 | self.env.reset(**kwargs) 142 | obs, _, done, _ = self.env.step(1) 143 | if done: 144 | self.env.reset(**kwargs) 145 | obs, _, done, _ = self.env.step(2) 146 | if done: 147 | self.env.reset(**kwargs) 148 | return obs 149 | 150 | def step(self, ac): 151 | return self.env.step(ac) 152 | 153 | class EpisodicLifeEnv(gym.Wrapper): 154 | def __init__(self, env): 155 | gym.Wrapper.__init__(self,env) 156 | self.lives = 0 157 | self.end_of_episode = True 158 | 159 | 160 | def step(self, action): 161 | obs, reward, done, info = self.env.step(action) 162 | self.end_of_episode = done 163 | lives = self.env.unwrapped.ale.lives() 164 | if lives < self.lives: 165 | done = True 166 | self.lives = lives 167 | return obs, reward, done, info 168 | 169 | def reset(self, **kwargs): 170 | if self.end_of_episode: 171 | obs = self.env.reset(**kwargs) 172 | else: 173 | obs, _, _, _ = self.env.step(0) 174 | return obs 175 | 176 | class TimeLimitEnv(gym.Wrapper): 177 | def __init__(self, env, time_limit): 178 | gym.Wrapper.__init__(self, env) 179 | self._time_limit=time_limit 180 | self._step = 0 181 | 182 | def step(self, action): 183 | obs, reward, done, info = self.env.step(action) 184 | self._step += 1 185 | if self._step > self._time_limit: 186 | done = True 187 | return obs, reward, done, info 188 | 189 | def reset(self, **kwargs): 190 | self._step = 0 191 | return self.env.reset(**kwargs) 192 | 193 | 194 | 195 | class StackEnv(gym.Wrapper): 196 | def __init__(self, env, k=4): 197 | gym.Wrapper.__init__(self, env) 198 | #self._stacked_frames = np.array(np.zeros([84,84,k])) 199 | self._stacked_frames = deque([], maxlen=k) 200 | self.k = k 201 | 202 | def step(self, action): 203 | obs, reward, done, info = self.env.step(action) 204 | obs = self.stack_frames(obs) 205 | return obs, reward, done, info 206 | 207 | def reset(self, **kwargs): 208 | obs = self.env.reset(**kwargs) 209 | return self.stack_frames(obs, True) 210 | 211 | 212 | def 
stack_frames(self,frame,reset=False): 213 | if reset: 214 | for i in range(self.k): 215 | self._stacked_frames.append(frame) 216 | else: 217 | self._stacked_frames.append(frame) 218 | return np.concatenate(self._stacked_frames,axis=2) 219 | 220 | 221 | class AutoResetEnv(gym.Wrapper): 222 | def __init__(self, env): 223 | gym.Wrapper.__init__(self, env) 224 | 225 | def step(self, action): 226 | obs, reward, done, info = self.env.step(action) 227 | if done: 228 | obs = self.env.reset() 229 | return obs, reward, done, info 230 | 231 | class ChannelsFirstEnv(gym.Wrapper): 232 | def __init__(self, env): 233 | gym.Wrapper.__init__(self, env) 234 | 235 | def step(self, action): 236 | obs, reward, done, info = self.env.step(action) 237 | return obs.transpose(2, 0, 1), reward, done, info 238 | 239 | def reset(self, **kwargs): 240 | obs = self.env.reset(**kwargs) 241 | return obs.transpose(2, 0, 1) 242 | 243 | class GreyScaleEnv(gym.Wrapper): 244 | def __init__(self, env): 245 | gym.Wrapper.__init__(self, env) 246 | 247 | def preprocess(self,frame): 248 | frame = np.dot(frame[...,:3], np.array([0.299, 0.587, 0.114])).astype(dtype=np.uint8) 249 | return frame[:,:,None] 250 | 251 | def step(self, action): 252 | obs, reward, done, info = self.env.step(action) 253 | return self.preprocess(obs), reward, done, info 254 | 255 | def reset(self, **kwargs): 256 | obs = self.env.reset(**kwargs) 257 | return self.preprocess(obs) 258 | 259 | class ToTorchEnv(gym.Wrapper): 260 | def __init__(self, env, device='cuda:0'): 261 | gym.Wrapper.__init__(self, env) 262 | self.device = device 263 | 264 | def step(self, action:torch.Tensor): 265 | obs, reward, done, info = self.env.step(action.cpu().numpy()) 266 | obs = torch.from_numpy(obs).float().to(self.device) 267 | reward = torch.tensor(reward, device=self.device, dtype=torch.float32) 268 | done = torch.tensor(done, device=self.device) 269 | return obs, reward, done, info 270 | 271 | def reset(self, **kwargs): 272 | obs = self.env.reset(**kwargs) 273 | return torch.from_numpy(obs).float().to(self.device) 274 | 275 | def apple_pickgame(env, k=1, grey_scale=False, auto_reset=False, max_steps=1000, channels_first=True): 276 | if auto_reset: 277 | env = AutoResetEnv(env) 278 | if max_steps is not None: 279 | env = TimeLimitEnv(env, time_limit=max_steps) 280 | if grey_scale: 281 | env = GreyScaleEnv(env) 282 | if k > 1: 283 | env = StackEnv(env, k) 284 | if channels_first: 285 | env = ChannelsFirstEnv(env) 286 | return env 287 | 288 | 289 | def AtariEnv(env, k=4, rescale=84, episodic=True, reset=True, clip_reward=True, Noop=True, time_limit=None, channels_first=True, auto_reset=False): 290 | ''' Wrapper function for Determinsitic Atari env 291 | assert 'Deterministic' in env.spec.id 292 | ''' 293 | if reset: 294 | env = FireResetEnv(env) 295 | 296 | if Noop: 297 | if 'NoFrameskip' in env.spec.id : 298 | max_op = 30 299 | else: 300 | max_op = 7 301 | env = NoopResetEnv(env,max_op) 302 | 303 | if clip_reward: 304 | env = ClipRewardEnv(env) 305 | 306 | if episodic: 307 | env = EpisodicLifeEnv(env) 308 | 309 | if rescale == 42: 310 | env = AtariRescale42x42(env) 311 | elif rescale == 84: 312 | env = AtariRescaleEnv(env) 313 | else: 314 | raise ValueError('84 or 42 are valid rescale sizes') 315 | 316 | if k > 1: 317 | env = StackEnv(env,k) 318 | 319 | if time_limit is not None: 320 | env = TimeLimitEnv(env, time_limit) 321 | 322 | if auto_reset: 323 | env = AutoResetEnv(env) 324 | 325 | if channels_first: 326 | env = ChannelsFirstEnv(env) 327 | 328 | return env 
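The wrapper factories above are easiest to follow end-to-end with a short usage sketch. The snippet below is illustrative only and not part of the library: it assumes the pre-0.26 gym step API (obs, reward, done, info) used throughout this repo, locally installed Atari ROMs, and an example env id; the printed shape is what the default AtariEnv settings should produce.

# Illustrative sketch (not part of rlib): composing the AtariEnv factory defined above.
import gym
from rlib.utils.wrappers import AtariEnv

def demo(env_id='PongDeterministic-v4'):
    base = gym.make(env_id)
    # Same FIRE-on-reset check the training scripts in this repo perform.
    fire = base.unwrapped.get_action_meanings()[1] == 'FIRE'

    env = AtariEnv(base, k=4, rescale=84, episodic=True, reset=fire,
                   clip_reward=True, channels_first=True)

    obs = env.reset()
    print(obs.shape, obs.dtype)    # (4, 84, 84) uint8: grey-scale, 84x84 rescale, 4-frame stack, channels first

    obs, reward, done, info = env.step(env.action_space.sample())
    assert -1.0 <= reward <= 1.0   # ClipRewardEnv clips rewards to [-1, 1]
    env.close()

if __name__ == '__main__':
    demo()

channels_first=True matches the [channels, height, width] input_shape convention expected by the encoders in rlib/networks/networks.py.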
-------------------------------------------------------------------------------- /rlib/DDQN/SyncDQN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | import gym 5 | import threading 6 | import time, datetime 7 | from collections import OrderedDict 8 | 9 | from rlib.networks.networks import* 10 | from rlib.utils.wrappers import* 11 | from rlib.utils.VecEnv import* 12 | from rlib.utils.SyncMultiEnvTrainer import SyncMultiEnvTrainer 13 | from rlib.utils.utils import one_hot, fold_batch, unfold_batch, totorch, tonumpy, totorch_many 14 | from rlib.utils.schedulers import polynomial_sheduler 15 | 16 | 17 | main_lock = threading.Lock() 18 | 19 | def save_hyperparameters(filename, **kwargs): 20 | handle = open(filename, "w") 21 | for key, value in kwargs.items(): 22 | handle.write("{} = {}\n" .format(key, value)) 23 | handle.close() 24 | 25 | 26 | 27 | class DQN(torch.nn.Module): 28 | def __init__(self, model, input_shape, action_size, lr=1e-3, lr_final=0, decay_steps=50e6, grad_clip=0.5, optim=torch.optim.Adam, optim_args={}, device='cuda', **model_args): 29 | super(DQN, self).__init__() 30 | self.lr = lr 31 | self.lr_final = lr_final 32 | self.decay_steps = decay_steps 33 | self.grad_clip = grad_clip 34 | self.action_size = action_size 35 | self.device = device 36 | 37 | self.model = model(input_shape, **model_args).to(self.device) 38 | self.Q = torch.nn.Linear(self.model.dense_size, action_size).to(self.device) 39 | 40 | self.optimiser = optim(self.parameters(), lr, **optim_args) 41 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 42 | 43 | def loss(self, Qsa, R, action_onehot): 44 | Qvalue = torch.sum(Qsa * action_onehot, dim=1) 45 | loss = torch.mean(torch.square(R - Qvalue)) 46 | return loss 47 | 48 | def backprop(self, state:np.ndarray, R:np.ndarray, action:np.ndarray): 49 | state, R, action = totorch_many(state, R, action, device=self.device) 50 | action_onehot = F.one_hot(action.long(), num_classes=self.action_size) 51 | Qsa = self.forward(state) 52 | loss = self.loss(Qsa, R, action_onehot) 53 | loss.backward() 54 | if self.grad_clip is not None: 55 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 56 | self.optimiser.step() 57 | self.optimiser.zero_grad() 58 | self.scheduler.step() 59 | return loss.detach().cpu().numpy() 60 | 61 | def forward(self, state): 62 | Qsa = self.Q(self.model(state)) 63 | return Qsa 64 | 65 | def evaluate(self, state): 66 | with torch.no_grad(): 67 | Qsa = self.forward(totorch(state, self.device)) 68 | return Qsa.cpu().numpy() 69 | 70 | 71 | 72 | class SyncDDQN(SyncMultiEnvTrainer): 73 | def __init__(self, envs, model, target_model, val_envs, action_size, log_dir='logs/SyncDDQN/', model_dir='models/SyncDDQN/', 74 | train_mode='nstep', return_type='nstep', total_steps=1000000, nsteps=5, gamma=0.99, lambda_=0.95, 75 | validate_freq=1e6, save_freq=0, render_freq=0, update_target_freq=10000, num_val_episodes=50, log_scalars=True, 76 | epsilon_start=1, epsilon_final=0.01, epsilon_steps = 1e6, epsilon_test=0.01): 77 | 78 | 79 | super().__init__(envs=envs, model=model, val_envs=val_envs, train_mode=train_mode, log_dir=log_dir, model_dir=model_dir, return_type=return_type, total_steps=total_steps, 80 | nsteps=nsteps, gamma=gamma, lambda_=lambda_, validate_freq=validate_freq, save_freq=save_freq, render_freq=render_freq, 81 | update_target_freq=update_target_freq, num_val_episodes=num_val_episodes, 
log_scalars=log_scalars) 82 | 83 | self.target_model = self.TargetQ = target_model 84 | self.Q = self.model # more readable alias 85 | self.epsilon = np.array([epsilon_start], dtype=np.float64) 86 | self.epsilon_final = epsilon_final 87 | self.epsilon_steps = epsilon_steps 88 | self.schedule = self.linear_schedule(self.epsilon , epsilon_final, epsilon_steps//self.num_envs) 89 | self.epsilon_test = np.array(epsilon_test, dtype=np.float64) 90 | 91 | self.action_size = action_size 92 | 93 | hyper_paras = {'learning_rate':self.model.lr, 'learning_rate_final':self.model.lr_final, 'lr_decay_steps':self.model.decay_steps , 'grad_clip':self.model.grad_clip, 94 | 'nsteps':self.nsteps, 'num_workers':self.num_envs, 'return type':self.return_type, 'total_steps':self.total_steps, 'gamma':gamma, 'lambda':lambda_, 95 | 'epsilon_start':self.epsilon, 'epsilon_final':self.epsilon_final, 'epsilon_steps':self.epsilon_steps, 'update_freq':update_target_freq} 96 | 97 | hyper_paras = OrderedDict(hyper_paras) 98 | 99 | if self.log_scalars: 100 | filename = log_dir + '/hyperparameters.txt' 101 | self.save_hyperparameters(filename, **hyper_paras) 102 | 103 | 104 | class linear_schedule(object): 105 | def __init__(self, epsilon, epsilon_final, num_steps=1000000): 106 | self._counter = 0 107 | self._epsilon = epsilon 108 | self._epsilon_final = epsilon_final 109 | self._step = (epsilon - epsilon_final) / num_steps 110 | self._num_steps = num_steps 111 | 112 | def step(self,): 113 | if self._counter < self._num_steps : 114 | self._epsilon -= self._step 115 | self._counter += 1 116 | else: 117 | self._epsilon[:] = self._epsilon_final 118 | 119 | def get_epsilon(self,): 120 | return self._epsilon 121 | 122 | def get_action(self, state): 123 | if np.random.uniform() < self.epsilon_test: 124 | action = np.random.randint(self.action_size) 125 | else: 126 | action = np.argmax(self.model.evaluate(state)) 127 | return action 128 | 129 | def update_target(self): 130 | self.target_model.load_state_dict(self.model.state_dict()) 131 | 132 | 133 | def local_attr(self, attr): 134 | attr['update_target_freq'] = self.target_freq 135 | return attr 136 | 137 | def rollout(self): 138 | rollout = [] 139 | for t in range(self.nsteps): 140 | Qsa = self.Q.evaluate(self.states) 141 | actions = np.argmax(Qsa, axis=1) 142 | random = np.random.uniform(size=(self.num_envs)) 143 | random_actions = np.random.randint(self.action_size, size=(self.num_envs)) 144 | actions = np.where(random < self.epsilon, random_actions, actions) 145 | next_states, rewards, dones, infos = self.env.step(actions) 146 | rollout.append((self.states, actions, rewards, dones, infos)) 147 | self.states = next_states 148 | self.schedule.step() 149 | 150 | states, actions, rewards, dones, infos = zip(*rollout) 151 | states, actions, rewards, dones = np.stack(states), np.stack(actions), np.stack(rewards), np.stack(dones) 152 | TargetQsa = unfold_batch(self.TargetQ.evaluate(fold_batch(states)), self.num_steps, self.num_envs) # Q(s,a; theta-1) 153 | values = np.sum(TargetQsa * one_hot(actions, self.action_size), axis=-1) # Q(s, argmax_a Q(s,a; theta); theta-1) 154 | 155 | last_actions = np.argmax(self.Q.evaluate(next_states), axis=1) 156 | last_TargetQsa = self.TargetQ.evaluate(next_states) # Q(s,a; theta-1) 157 | last_values = np.sum(last_TargetQsa * one_hot(last_actions, self.action_size), axis=-1) # Q(s, argmax_a Q(s,a; theta); theta-1) 158 | return states, actions, rewards, dones, values, last_values 159 | 160 | 161 | def stackFireReset(env): 162 | return 
StackEnv(FireResetEnv(env)) 163 | 164 | 165 | def main(env_id): 166 | num_envs = 32 167 | nsteps = 128 168 | 169 | current_time = datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') 170 | train_log_dir = 'logs/SyncDDQN/' + env_id + '/n-step/RMSprop/' + current_time 171 | model_dir = "models/SyncDDQN/" + env_id + '/' + current_time 172 | 173 | env = gym.make(env_id) 174 | 175 | classic_list = ['MountainCar-v0', 'Acrobot-v1', 'LunarLander-v2', 'CartPole-v0', 'CartPole-v1'] 176 | if any(env_id in s for s in classic_list): 177 | print('Classic Control') 178 | val_envs = [gym.make(env_id) for i in range(16)] 179 | envs = BatchEnv(DummyEnv, env_id, num_envs, blocking=False) 180 | 181 | elif 'ApplePicker' in env_id: 182 | print('ApplePicker') 183 | make_args = {'num_objects':100, 'default_reward':-0.01} 184 | val_envs = [apple_pickgame(gym.make(env_id, **make_args), max_steps=5000, auto_reset=True, k=1) for i in range(15)] 185 | envs = BatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=False, k=1) 186 | print(val_envs[0]) 187 | print(envs.envs[0]) 188 | 189 | else: 190 | print('Atari') 191 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 192 | reset = True 193 | print('fire on reset') 194 | else: 195 | reset = False 196 | print('only stack frames') 197 | 198 | val_envs = [AtariEnv(gym.make(env_id), k=4, episodic=False, reset=reset, clip_reward=False) for i in range(15)] 199 | envs = BatchEnv(AtariEnv, env_id, num_envs, blocking=False, k=4, reset=reset, episodic=False, clip_reward=True, time_limit=4500) 200 | 201 | action_size = val_envs[0].action_space.n 202 | input_size = val_envs[0].reset().shape 203 | 204 | env.close() 205 | print('action space', action_size) 206 | 207 | dqn_args = dict(model=NatureCNN, 208 | input_shape=input_size, 209 | action_size=action_size, 210 | lr=1e-3, 211 | lr_final=1e-6, 212 | grad_clip=0.5, 213 | decay_steps=50e6//(num_envs*nsteps), 214 | optim=torch.optim.RMSprop, 215 | device='cuda') 216 | 217 | Q = DQN(**dqn_args) 218 | TargetQ = DQN(**dqn_args) 219 | 220 | DDQN = SyncDDQN(envs=envs, 221 | model=Q, 222 | target_model=TargetQ, 223 | model_dir=model_dir, 224 | log_dir=train_log_dir, 225 | val_envs=val_envs, 226 | action_size=action_size, 227 | train_mode='nstep', 228 | return_type='lambda', 229 | total_steps=50e6, 230 | nsteps=nsteps, 231 | gamma=0.99, 232 | lambda_=0.95, 233 | save_freq=0, 234 | render_freq=0, 235 | validate_freq=1e5, 236 | num_val_episodes=15, 237 | update_target_freq=10000, 238 | epsilon_start=1, 239 | epsilon_final=0.01, 240 | epsilon_steps=2e6, 241 | epsilon_test=0.01, 242 | log_scalars=False) 243 | 244 | DDQN.update_target() 245 | DDQN.train() 246 | 247 | if __name__ == "__main__": 248 | import apple_picker 249 | env_id_list = [ 'SpaceInvadersDeterministic-v4', 'FreewayDeterministic-v4','MontezumaRevengeDeterministic-v4', ] 250 | #env_id_list = ['MontezumaRevengeDeterministic-v4'] 251 | #env_id_list = ['MountainCar-v0', 'CartPole-v1', 'Acrobot-v1', ] 252 | env_id_list = ['ApplePicker-v0'] 253 | for env_id in env_id_list: 254 | main(env_id) 255 | -------------------------------------------------------------------------------- /rlib/PPO/PPO.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | import time, datetime 5 | import gym 6 | import copy 7 | import matplotlib.pyplot as plt 8 | 9 | from rlib.networks.networks import * 10 | from rlib.utils.VecEnv import* 11 | from rlib.utils.wrappers import* 12 | from 
rlib.utils.SyncMultiEnvTrainer import SyncMultiEnvTrainer 13 | from rlib.utils.utils import fastsample, fold_batch, tonumpy, totorch, totorch_many, stack_many, fold_many 14 | from rlib.utils.schedulers import polynomial_sheduler 15 | 16 | class PPO(torch.nn.Module): 17 | def __init__(self, model, input_shape, action_size, lr=1e-3, lr_final=0, decay_steps=6e5, grad_clip=0.5, value_coeff=1.0, entropy_coeff=0.01, policy_clip=0.1, 18 | build_optimiser=True, optim=torch.optim.Adam, optim_args={}, device='cuda', **model_args): 19 | super(PPO, self).__init__() 20 | self.lr = lr 21 | self.lr_final = lr_final 22 | self.action_size = action_size 23 | self.value_coeff = value_coeff 24 | self.entropy_coeff = entropy_coeff 25 | self.decay_steps = decay_steps 26 | self.grad_clip = grad_clip 27 | self.policy_clip = policy_clip 28 | self.device = device 29 | 30 | self.model = model(input_shape, **model_args).to(self.device) 31 | dense_size = self.model.dense_size 32 | self.policy = torch.nn.Sequential(torch.nn.Linear(dense_size, action_size), torch.nn.Softmax(dim=-1)).to(self.device) 33 | self.V = torch.nn.Linear(dense_size, 1).to(self.device) 34 | 35 | if build_optimiser: 36 | self.optimiser = optim(self.parameters(), lr, **optim_args) 37 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 38 | 39 | 40 | def forward(self, state): 41 | state_enc = self.model(state) 42 | policy = self.policy(state_enc) 43 | value = self.V(state_enc).view(-1) 44 | return policy, value 45 | 46 | def evaluate(self, state): 47 | with torch.no_grad(): 48 | policy, value = self.forward(totorch(state, self.device)) 49 | return tonumpy(policy), tonumpy(value) 50 | 51 | 52 | def loss(self, policy, R, V, Adv, action_onehot, old_policy): 53 | value_loss = 0.5 * torch.mean(torch.square(R - V)) 54 | 55 | policy_actions = torch.sum(policy * action_onehot, dim=1) 56 | old_policy_actions = torch.sum(old_policy * action_onehot, dim=1) 57 | ratio = policy_actions / old_policy_actions 58 | policy_loss_unclipped = ratio * -Adv 59 | policy_loss_clipped = torch.clip_(ratio, 1 - self.policy_clip, 1 + self.policy_clip) * -Adv 60 | policy_loss = torch.mean(torch.maximum(policy_loss_unclipped, policy_loss_clipped)) 61 | entropy = torch.mean(torch.sum(policy * -torch.log(policy), dim=1)) 62 | 63 | loss = policy_loss + self.value_coeff * value_loss - self.entropy_coeff * entropy 64 | return loss 65 | 66 | def backprop(self, state, R, Adv, action, old_policy): 67 | state, action, R, Adv, old_policy = totorch_many(state, action, R, Adv, old_policy, device=self.device) 68 | action_onehot = F.one_hot(action.long(), self.action_size) 69 | policy, value = self.forward(state) 70 | loss = self.loss(policy, R, value, Adv, action_onehot, old_policy) 71 | 72 | loss.backward() 73 | if self.grad_clip is not None: 74 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 75 | self.optimiser.step() 76 | self.optimiser.zero_grad() 77 | self.scheduler.step() 78 | return loss.detach().cpu().numpy() 79 | 80 | 81 | class PPOTrainer(SyncMultiEnvTrainer): 82 | def __init__(self, envs, model, val_envs, train_mode='nstep', log_dir='logs/', model_dir='models/', total_steps=1000000, nsteps=5, gamma=0.99, lambda_=0.95, 83 | num_epochs=4, num_minibatches=4, validate_freq=1000000.0, save_freq=0, render_freq=0, num_val_episodes=50, max_val_steps=10000, log_scalars=True): 84 | 85 | super().__init__(envs, model, val_envs, train_mode=train_mode, log_dir=log_dir, model_dir=model_dir, total_steps=total_steps, nsteps=nsteps, gamma=gamma, 
lambda_=lambda_, 86 | validate_freq=validate_freq, save_freq=save_freq, render_freq=render_freq, update_target_freq=0, num_val_episodes=num_val_episodes, max_val_steps=max_val_steps, log_scalars=log_scalars) 87 | 88 | self.num_epochs = num_epochs 89 | self.num_minibatches = num_minibatches 90 | 91 | hyper_paras = {'learning_rate':model.lr, 'learning_rate_final':model.lr_final, 'lr_decay_steps':model.decay_steps, 92 | 'grad_clip':model.grad_clip, 'nsteps':self.nsteps, 'num_workers':self.num_envs, 'total_steps':self.total_steps, 93 | 'entropy_coefficient':self.model.entropy_coeff, 'value_coefficient':self.model.value_coeff, 'gamma':self.gamma, 'lambda':self.lambda_} 94 | 95 | if log_scalars: 96 | filename = log_dir + '/hyperparameters.txt' 97 | self.save_hyperparameters(filename, **hyper_paras) 98 | 99 | 100 | 101 | def _train_nstep(self): 102 | batch_size = self.num_envs * self.nsteps 103 | num_updates = self.total_steps // batch_size 104 | s = 0 105 | mini_batch_size = self.nsteps//self.num_minibatches 106 | start = time.time() 107 | # main loop 108 | for t in range(1,num_updates+1): 109 | #rollout_start = time.time() 110 | states, actions, rewards, values, last_values, old_policies, dones = self.rollout() 111 | #print('rollout time', time.time()-rollout_start) 112 | Adv = self.GAE(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) 113 | R = Adv + values 114 | l = 0 115 | 116 | #backprop_time = time.time() 117 | idxs = np.arange(len(states)) 118 | for epoch in range(self.num_epochs): 119 | np.random.shuffle(idxs) 120 | for batch in range(0, len(states), mini_batch_size): 121 | batch_idxs = idxs[batch: batch + mini_batch_size] 122 | # stack all states, actions and Rs across all workers into a single batch 123 | mb_states, mb_actions, mb_R, mb_Adv, mb_old_policies = fold_many(states[batch_idxs], actions[batch_idxs], 124 | R[batch_idxs], Adv[batch_idxs], 125 | old_policies[batch_idxs]) 126 | 127 | l += self.model.backprop(mb_states.copy(), mb_R.copy(), mb_Adv.copy(), mb_actions.copy(), mb_old_policies.copy()) 128 | 129 | #print('backprop time', time.time()-backprop_time) 130 | l /= self.num_epochs 131 | 132 | 133 | if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0: 134 | render = True 135 | else: 136 | render = False 137 | 138 | if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0: 139 | #val_time = time.time() 140 | self.validation_summary(t,l,start,render) 141 | #print('validation time', time.time()-val_time) 142 | start = time.time() 143 | 144 | if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0: 145 | s += 1 146 | self.save(s) 147 | print('saved model') 148 | 149 | 150 | def get_action(self, states): 151 | policies, values = self.model.evaluate(states) 152 | actions = fastsample(policies) 153 | return actions 154 | 155 | def rollout(self): 156 | rollout = [] 157 | for t in range(self.nsteps): 158 | policies, values = self.model.evaluate(self.states) 159 | actions = fastsample(policies) 160 | next_states, rewards, dones, infos = self.env.step(actions) 161 | rollout.append((self.states, actions, rewards, values, policies, dones)) 162 | self.states = next_states 163 | 164 | states, actions, rewards, values, policies, dones = stack_many(*zip(*rollout)) 165 | policy, last_values, = self.model.evaluate(next_states) 166 | return states, actions, rewards, values, last_values, policies, dones 167 | 168 | 169 | def main(env_id): 170 | num_envs = 32 171 | nsteps = 128 172 | 173 | classic_list = 
['MountainCar-v0', 'Acrobot-v1', 'LunarLander-v2', 'CartPole-v0', 'CartPole-v1'] 174 | if any(env_id in s for s in classic_list): 175 | print('Classic Control') 176 | val_envs = [gym.make(env_id) for i in range(10)] 177 | envs = BatchEnv(DummyEnv, env_id, num_envs, blocking=False) 178 | 179 | elif 'ApplePicker' in env_id: 180 | print('ApplePicker') 181 | make_args = {'num_objects':300, 'default_reward':0} 182 | if 'Deterministic' in env_id: 183 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=True, k=4, grey_scale=True, make_args=make_args) 184 | val_envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=False, k=4, grey_scale=True, make_args=make_args) 185 | for i in range(len(envs)): 186 | val_envs.envs[i].set_locs(envs.envs[i].item_locs_master, envs.envs[i].start_loc) 187 | val_envs.reset() 188 | else: 189 | #val_envs = [apple_pickgame(gym.make(env_id), max_steps=5000, auto_reset=False, k=1) for i in range(16)] 190 | val_envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=False, k=4, grey_scale=True) 191 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=True, k=4, grey_scale=True) 192 | print(val_envs.envs[0]) 193 | print(envs.envs[0]) 194 | 195 | else: 196 | print('Atari') 197 | env = gym.make(env_id) 198 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 199 | reset = True 200 | print('fire on reset') 201 | else: 202 | reset = False 203 | print('only stack frames') 204 | env.close() 205 | val_envs = [AtariEnv(gym.make(env_id), k=4, episodic=False, reset=reset, clip_reward=False) for i in range(16)] 206 | envs = BatchEnv(AtariEnv, env_id, num_envs, blocking=False, k=4, reset=reset, episodic=False, clip_reward=True) 207 | 208 | 209 | action_size = val_envs.envs[0].action_space.n 210 | input_size = val_envs.envs[0].reset().shape 211 | 212 | 213 | current_time = datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') 214 | train_log_dir = 'logs/PPO/' + env_id + '/Adam/' + current_time 215 | model_dir = "models/PPO/" + env_id + '/' + current_time 216 | 217 | 218 | model = PPO(UniverseCNN, 219 | input_shape=input_size, 220 | action_size=action_size, 221 | lr=1e-4, 222 | lr_final=1e-5, 223 | decay_steps=200e6//(num_envs*nsteps), 224 | grad_clip=0.5, 225 | value_coeff=1.0, 226 | entropy_coeff=0.01, 227 | device='cuda' 228 | ).cuda() 229 | 230 | 231 | ppo = PPOTrainer(envs=envs, 232 | model=model, 233 | model_dir=model_dir, 234 | log_dir=train_log_dir, 235 | val_envs=val_envs, 236 | train_mode='nstep', 237 | total_steps=200e6, 238 | nsteps=nsteps, 239 | num_epochs=2, 240 | num_minibatches=4, 241 | validate_freq=1e5, 242 | save_freq=0, 243 | render_freq=0, 244 | num_val_episodes=32, 245 | log_scalars=False) 246 | ppo.train() 247 | 248 | 249 | if __name__ == "__main__": 250 | import apple_picker 251 | #env_id_list = ['SpaceInvadersDeterministic-v4', 'FreewayDeterministic-v4']# 'SpaceInvadersDeterministic-v4',]# , ] 252 | #env_id_list = ['MountainCar-v0', 'Acrobot-v1', 'CartPole-v1', ] 253 | env_id_list = ['ApplePickerDeterministic-v0'] 254 | for env_id in env_id_list: 255 | main(env_id) 256 | -------------------------------------------------------------------------------- /rlib/networks/networks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from typing import List 4 | 5 | 6 | def deconv2d_outsize(height, width, kernel_size, stride, padding, dilation=[1,1], output_padding=[0,0]): 7 | h_out 
= (height-1) * stride[0] - 2*padding[0] + dilation[0] * (kernel_size[0]-1) + output_padding[0] + 1 8 | w_out = (width-1) * stride[1] - 2*padding[1] + dilation[1] * (kernel_size[1]-1) + output_padding[1] + 1 9 | return h_out, w_out 10 | 11 | def conv2d_outsize(height, width, kernel_size, stride, padding): 12 | h_out = ((height + 2*padding[0] - (kernel_size[0] -1) -1) // stride[0]) + 1 13 | w_out = ((width + 2*padding[1] - (kernel_size[1] -1) -1) // stride[1]) + 1 14 | return h_out, w_out 15 | 16 | class DeconvUniverse(torch.nn.Module): 17 | def __init__(self, output_size, deconv1_size=64, deconv2_size=64, deconv3_size=64, deconv4_size=64, padding=[0,0], conv_activation=torch.nn.ELU, weight_initialiser=torch.nn.init.xavier_uniform_, trainable=True): 18 | # output_size [channels, height, width] size of output after convolutions 19 | super(DeconvUniverse, self).__init__() 20 | self.output_size = output_size 21 | self.dense_size = np.prod(output_size) 22 | 23 | self.h1 = torch.nn.Sequential(torch.nn.ConvTranspose2d(output_size[0], deconv1_size, kernel_size=[3,3], stride=[2,2], padding=padding, output_padding=1), conv_activation()) 24 | self.h2 = torch.nn.Sequential(torch.nn.ConvTranspose2d(deconv1_size, deconv2_size, kernel_size=[3,3], stride=[2,2], padding=padding, output_padding=0), conv_activation()) 25 | self.h3 = torch.nn.Sequential(torch.nn.ConvTranspose2d(deconv2_size, deconv3_size, kernel_size=[3,3], stride=[2,2], padding=padding, output_padding=0), conv_activation()) 26 | self.h4 = torch.nn.Sequential(torch.nn.ConvTranspose2d(deconv3_size, deconv4_size, kernel_size=[3,3], stride=[2,2], padding=padding, output_padding=1), conv_activation()) 27 | c, h, w = self._conv_outsize() 28 | 29 | print('final outsize', (c, h, w)) 30 | self.initialiser = weight_initialiser 31 | self.init_weights() 32 | 33 | def init_weights(self): 34 | self.apply(self._init_weights) 35 | 36 | def _init_weights(self, module): 37 | if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)): 38 | self.initialiser(module.weight) 39 | 40 | def _conv_outsize(self): 41 | _, h, w = self.output_size 42 | h, w = deconv2d_outsize(h, w, self.h1[0].kernel_size, self.h1[0].stride, self.h1[0].padding, self.h1[0].dilation, self.h1[0].output_padding) 43 | h, w = deconv2d_outsize(h, w, self.h2[0].kernel_size, self.h2[0].stride, self.h2[0].padding, self.h2[0].dilation, self.h2[0].output_padding) 44 | h, w = deconv2d_outsize(h, w, self.h3[0].kernel_size, self.h3[0].stride, self.h3[0].padding, self.h3[0].dilation, self.h3[0].output_padding) 45 | h, w = deconv2d_outsize(h, w, self.h4[0].kernel_size, self.h4[0].stride, self.h4[0].padding, self.h4[0].dilation, self.h4[0].output_padding) 46 | return self.h4[0].out_channels, h, w 47 | 48 | def forward(self, x): 49 | x = x.view(-1, *self.output_size) 50 | x = self.h1(x) 51 | x = self.h2(x) 52 | x = self.h3(x) 53 | x = self.h4(x) 54 | return x 55 | 56 | class UniverseCNN(torch.nn.Module): 57 | def __init__(self, input_shape, conv1_size=64, conv2_size=64, conv3_size=64, conv4_size=64, padding=[0,0], dense_size=256, conv_activation=torch.nn.ELU, dense_activation=torch.nn.ReLU, weight_initialiser=torch.nn.init.xavier_uniform_, scale=True, trainable=True): 58 | # input_shape [channels, height, width] 59 | super(UniverseCNN, self).__init__() 60 | self.scale = scale 61 | self.input_shape = input_shape 62 | 63 | self.h1 = torch.nn.Sequential(torch.nn.Conv2d(input_shape[0], conv1_size, kernel_size=[3,3], stride=[2,2], padding=padding), conv_activation()) 64 | self.h2 = 
torch.nn.Sequential(torch.nn.Conv2d(conv1_size, conv2_size, kernel_size=[3,3], stride=[2,2], padding=padding), conv_activation()) 65 | self.h3 = torch.nn.Sequential(torch.nn.Conv2d(conv2_size, conv3_size, kernel_size=[3,3], stride=[2,2], padding=padding), conv_activation()) 66 | self.h4 = torch.nn.Sequential(torch.nn.Conv2d(conv3_size, conv4_size, kernel_size=[3,3], stride=[2,2], padding=padding), conv_activation()) 67 | self.flatten = torch.nn.Flatten() 68 | c, h, w = self._conv_outsize() 69 | self.dense = torch.nn.Sequential(torch.nn.Linear(h*w*c, dense_size), dense_activation()) 70 | #self.dense_size = h*w*c 71 | self.dense_size = dense_size 72 | print('final outsize', (c, h, w)) 73 | self.initialiser = weight_initialiser 74 | self.init_weights() 75 | 76 | def init_weights(self): 77 | self.apply(self._init_weights) 78 | 79 | def _init_weights(self, module): 80 | if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)): 81 | self.initialiser(module.weight) 82 | 83 | def _conv_outsize(self): 84 | _, h, w = self.input_shape 85 | h, w = conv2d_outsize(h, w, self.h1[0].kernel_size, self.h1[0].stride, self.h1[0].padding) 86 | h, w = conv2d_outsize(h, w, self.h2[0].kernel_size, self.h2[0].stride, self.h2[0].padding) 87 | h, w = conv2d_outsize(h, w, self.h3[0].kernel_size, self.h3[0].stride, self.h3[0].padding) 88 | h, w = conv2d_outsize(h, w, self.h4[0].kernel_size, self.h4[0].stride, self.h4[0].padding) 89 | return self.h4[0].out_channels, h, w 90 | 91 | def forward(self, x): 92 | x = x/255 if self.scale else x 93 | x = self.h1(x) 94 | x = self.h2(x) 95 | x = self.h3(x) 96 | x = self.h4(x) 97 | x = self.flatten(x) 98 | x = self.dense(x) 99 | return x 100 | 101 | class NatureCNN(torch.nn.Module): 102 | def __init__(self, input_shape, conv1_size=32, conv2_size=64, conv3_size=64, dense_size=512, padding=[0,0], conv_activation=torch.nn.ReLU, dense_activation=torch.nn.ReLU, weight_initialiser=torch.nn.init.xavier_uniform_, scale=True, trainable=True): 103 | # input_shape [channels, height, width] 104 | super(NatureCNN, self).__init__() 105 | self.scale = scale 106 | self.dense_size = dense_size 107 | self.input_shape = input_shape 108 | self.h1 = torch.nn.Sequential(torch.nn.Conv2d(input_shape[0], conv1_size, kernel_size=[8,8], stride=[4,4], padding=padding), conv_activation()) 109 | self.h2 = torch.nn.Sequential(torch.nn.Conv2d(conv1_size, conv2_size, kernel_size=[4,4], stride=[2,2], padding=padding), conv_activation()) 110 | self.h3 = torch.nn.Sequential(torch.nn.Conv2d(conv2_size, conv3_size, kernel_size=[3,3], stride=[1,1], padding=padding), conv_activation()) 111 | self.flatten = torch.nn.Flatten() 112 | c, h, w = self._conv_outsize() 113 | self.dense = torch.nn.Sequential(torch.nn.Linear(h*w*c, dense_size), dense_activation()) 114 | self.initialiser = weight_initialiser 115 | self.init_weights() 116 | 117 | def init_weights(self): 118 | self.apply(self._init_weights) 119 | 120 | def _init_weights(self, module): 121 | if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)): 122 | self.initialiser(module.weight) 123 | 124 | def _conv_outsize(self): 125 | _, h, w = self.input_shape 126 | h, w = conv2d_outsize(h, w, self.h1[0].kernel_size, self.h1[0].stride, self.h1[0].padding) 127 | h, w = conv2d_outsize(h, w, self.h2[0].kernel_size, self.h2[0].stride, self.h2[0].padding) 128 | h, w = conv2d_outsize(h, w, self.h3[0].kernel_size, self.h3[0].stride, self.h3[0].padding) 129 | return self.h3[0].out_channels, h, w 130 | 131 | def forward(self, x): 132 | x = x/255 if self.scale else x 133 | x = 
self.h1(x) 134 | x = self.h2(x) 135 | x = self.h3(x) 136 | x = self.flatten(x) 137 | x = self.dense(x) 138 | return x 139 | 140 | 141 | class MaskedRNN(torch.nn.Module): 142 | ''' dynamic masked *hidden state* RNN for sequences that reset part way through an observation 143 | e.g. A2C 144 | args : 145 | cell - cell of type tf.nn.rnn_cell 146 | X - tensor of rank [time, batch, hidden] if time major == True (Default); or [batch, time, hidden] if time major == False 147 | hidden_init - tensor or placeholder of intial cell hidden state 148 | mask - tensor or placeholder of length time, for hidden state masking e.g. [True, False, False] will mask first hidden state 149 | parallel_iterations - number of parallel iterations to run RNN over 150 | swap_memory - bool flag to swap memory between GPU and CPU 151 | time_major - bool flag to determine order of indices of input tensor 152 | scope - tf variable_scope of dynamic RNN loop 153 | trainable - bool flag whether to perform backpropagation to RNN cell during while loop 154 | ''' 155 | def __init__(self, cell, time_major=True): 156 | super(MaskedRNN, self).__init__() 157 | self.cell = cell 158 | self.time_major = time_major 159 | 160 | def forward(self, x, hidden=None, mask=None): 161 | '''args: 162 | x - tensor of rank [time, batch, hidden] if time major == True (Default); or [batch, time, hidden] if time major == False 163 | mask - tensor of rank [time], for hidden state masking e.g. [True, False, False] will mask first hidden state 164 | returns: 165 | ''' 166 | 167 | if not self.time_major: 168 | x = x.transpose(1, 0, 2) 169 | 170 | if mask is None: 171 | mask = torch.zeros(x.shape[0], x.shape[1]).to(x.device) 172 | 173 | outputs = [] 174 | for t in range(x.shape[0]): 175 | output, hidden = self.cell(x[t], hidden, mask[t]) 176 | outputs.append(output) 177 | 178 | outputs = torch.stack(outputs, dim=0) 179 | outputs = outputs if self.time_major else outputs.transpose(1, 0, 2) 180 | return outputs, hidden 181 | 182 | def lstmgate(cell_size, input_size, trainable=True): 183 | input_weight = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.zeros(size=[input_size, cell_size], requires_grad=trainable))) 184 | hidden_weight = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.zeros(size=[cell_size, cell_size], requires_grad=trainable))) 185 | bias = torch.nn.Parameter(torch.zeros(size=[cell_size], requires_grad=trainable)) 186 | return input_weight, hidden_weight, bias 187 | 188 | def gemmlstmgate(cell_size, input_size, trainable=True): 189 | input_weight = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.zeros(size=[cell_size*4, input_size], requires_grad=trainable))) 190 | hidden_weight = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.zeros(size=[cell_size*4, cell_size], requires_grad=trainable))) 191 | bias_input = torch.nn.Parameter(torch.zeros(size=[cell_size*4], requires_grad=trainable)) 192 | bias_hidden = torch.nn.Parameter(torch.zeros(size=[cell_size*4], requires_grad=trainable)) 193 | return input_weight, hidden_weight, bias_input, bias_hidden 194 | 195 | class MaskedLSTMCell(torch.nn.Module): 196 | def __init__(self, cell_size, input_size=None, trainable=True): 197 | super(MaskedLSTMCell, self).__init__() 198 | self._cell_size = cell_size 199 | input_size = input_size if input_size is not None else cell_size # input_size == cell_size by default 200 | self._input_size = input_size 201 | self.Wi, self.Wh, self.bi, self.bh = gemmlstmgate(cell_size, input_size, trainable) # batch gemm 202 | 203 | def init_hidden(self, 
batch_size, dtype, device): 204 | cell = torch.zeros(1, batch_size, self._cell_size, dtype=dtype, device=device) 205 | hidden = torch.zeros(1, batch_size, self._cell_size, dtype=dtype, device=device) 206 | return (cell, hidden) 207 | 208 | def forward(self, x, state=None, done=None): 209 | if state is None: 210 | prev_cell, prev_hidden = self.init_hidden(x.shape[0], input.dtype, input.device) 211 | else: 212 | prev_cell, prev_hidden = state 213 | if done is not None: 214 | prev_cell *= (1-done).view(-1, 1) 215 | prev_hidden *= (1-done).view(-1, 1) 216 | 217 | gates = (torch.matmul(x, self.Wi.t()) + self.bi + torch.matmul(prev_hidden[0], self.Wh.t())) + self.bh 218 | i, f, c, o = gates.chunk(4, 1) 219 | i = torch.sigmoid(i) 220 | f = torch.sigmoid(f) 221 | c = torch.tanh(c) 222 | o = torch.sigmoid(o) 223 | 224 | cell = prev_cell * f + i * c 225 | hidden = o * torch.tanh(cell) 226 | return hidden, (cell, hidden) 227 | 228 | 229 | class MaskedLSTMBlock(torch.nn.Module): 230 | def __init__(self, input_size, hidden_size, time_major=True): 231 | super(MaskedLSTMBlock, self).__init__() 232 | self.time_major = time_major 233 | batch_first = not time_major 234 | self.hidden_size = hidden_size 235 | self.lstm = torch.nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=batch_first) 236 | 237 | def forward(self, x, hidden, done): 238 | if not self.time_major: 239 | x = x.transpose(1, 0, 2) 240 | 241 | if done is not None: 242 | mask = (1-done) 243 | else: 244 | mask = torch.ones(x.shape[0], x.shape[1]).to(x.device) 245 | 246 | mask_zeros = ((mask[1:]==0).any(dim=-1).nonzero()+1).view(-1).cpu().numpy().tolist() 247 | mask_zeros = [0] + mask_zeros + [mask.shape[0]+1] 248 | outputs = [] 249 | for i in range(len(mask_zeros)-1): 250 | start = mask_zeros[i] 251 | end = mask_zeros[i+1] 252 | #print('start, end', (start, end)) 253 | hidden = (mask[start].view(-1,1)*hidden[0], mask[start].view(-1,1)*hidden[1]) 254 | out, hidden = self.lstm(x[start:end], hidden) 255 | outputs.append(out) 256 | 257 | outputs = torch.cat(outputs, dim=0) 258 | outputs = outputs if self.time_major else outputs.transpose(1, 0, 2) 259 | return outputs, hidden -------------------------------------------------------------------------------- /rlib/Curiosity/CuriosityA2C.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | import scipy 5 | import gym 6 | import os, time 7 | import threading 8 | from rlib.A2C.A2C import ActorCritic 9 | from rlib.networks.networks import* 10 | from rlib.utils.SyncMultiEnvTrainer import SyncMultiEnvTrainer 11 | from rlib.utils.VecEnv import* 12 | from rlib.utils.wrappers import* 13 | from rlib.utils.utils import fastsample, fold_batch, one_hot, RunningMeanStd, normalise, stack_many, totorch_many 14 | from rlib.utils.schedulers import polynomial_sheduler 15 | 16 | class RollingObs(object): 17 | def __init__(self, mean=0): 18 | self.rolling = RunningMeanStd() 19 | 20 | def update(self, x): 21 | if len(x.shape) == 4: # assume image obs 22 | return self.rolling.update(np.mean(x, axis=1, keepdims=True)) #[time*batch,height,width,stack] -> [height, width] 23 | else: 24 | return self.rolling.update(x) #[time*batch,*shape] -> [*shape] 25 | 26 | 27 | class ICM(torch.nn.Module): 28 | def __init__(self, model_head, input_size, action_size, forward_coeff, device='cuda', **model_head_args): 29 | super(ICM, self).__init__() 30 | self.action_size = action_size 31 | self.forward_coeff = 
forward_coeff 32 | self.phi = model_head(input_size, **model_head_args) 33 | dense_size = self.phi.dense_size 34 | self.device = device 35 | 36 | # forward model 37 | self.forward1 = torch.nn.Sequential(torch.nn.Linear(dense_size + action_size, dense_size), torch.nn.ReLU()).to(device) 38 | self.pred_state = torch.nn.Linear(dense_size, dense_size).to(device) 39 | 40 | # inverse model 41 | self.inverse1 = torch.nn.Sequential(torch.nn.Linear(dense_size*2, dense_size), torch.nn.ReLU()).to(device) 42 | self.pred_action = torch.nn.Sequential(torch.nn.Linear(dense_size*2, dense_size), torch.nn.ReLU()).to(device) 43 | 44 | 45 | def intr_reward(self, phi, action_onehot, phi_next): 46 | f1 = self.forward1(torch.cat([phi, action_onehot], dim=1)) 47 | phi_pred = self.pred_state(f1) 48 | intr_reward = 0.5 * torch.sum(torch.square(phi_pred - phi_next), dim=1) # l2 distance metric ‖ˆφ(st+1)−φ(st+1)‖22 49 | return intr_reward 50 | 51 | def predict_action(self, phi1, phi2): 52 | phi_cat = torch.cat([phi1, phi2], dim=1) 53 | pred_action = self.pred_action(phi_cat) 54 | return pred_action 55 | 56 | def get_intr_reward(self, state, action, next_state): 57 | state, next_state, action = totorch_many(state, next_state, action, device=self.device) 58 | action = action.long() 59 | phi1 = self.phi(state) 60 | phi2 = self.phi(next_state) 61 | action_onehot = F.one_hot(action, self.action_size) 62 | with torch.no_grad(): 63 | intr_reward = self.intr_reward(phi1, action_onehot, phi2) 64 | return intr_reward.cpu().numpy() 65 | 66 | def get_pred_action(self, state, next_state): 67 | state, next_state = totorch_many(state, next_state, device=self.device) 68 | return self.pred_action(state, next_state) 69 | 70 | def loss(self, state, action, next_state): 71 | action = action.long() 72 | phi1 = self.phi(state) 73 | phi2 = self.phi(next_state) 74 | action_onehot = F.one_hot(action, self.action_size) 75 | 76 | forward_loss = torch.mean(self.intr_reward(phi1, action_onehot, phi2)) 77 | inverse_loss = F.cross_entropy(self.predict_action(phi1, phi2), action) 78 | return (1-self.forward_coeff) * inverse_loss + self.forward_coeff * forward_loss 79 | 80 | 81 | class Curiosity(torch.nn.Module): 82 | def __init__(self, policy_model, ICM_model, input_size, action_size, forward_coeff, policy_importance, reward_scale, entropy_coeff, value_coeff=0.5, 83 | lr=1e-3, lr_final=1e-3, decay_steps=6e5, grad_clip=0.5, policy_args={}, ICM_args={}, device='cuda'): 84 | super(Curiosity, self).__init__() 85 | self.reward_scale, self.forward_coeff, self.policy_importance, self.entropy_coeff = reward_scale, forward_coeff, policy_importance, entropy_coeff 86 | self.lr, self.lr_final, self.decay_steps = lr, lr_final, decay_steps 87 | self.grad_clip = grad_clip 88 | self.action_size = action_size 89 | self.device = device 90 | 91 | try: 92 | iterator = iter(input_size) 93 | except TypeError: 94 | input_size = (input_size,) 95 | 96 | self.ICM = ICM(ICM_model, input_size, action_size, forward_coeff, device=device, **ICM_args) 97 | self.AC = ActorCritic(policy_model, input_size, action_size, entropy_coeff, value_coeff, lr, lr_final, decay_steps, grad_clip, build_optimiser=False, device=device, **policy_args) 98 | 99 | self.optimiser = torch.optim.RMSprop(self.parameters(), lr=lr) 100 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 101 | 102 | def forward(self, state): 103 | return self.AC.forward(state) 104 | 105 | def evaluate(self, state): 106 | return self.AC.evaluate(state) 107 | 108 | def intrinsic_reward(self, 
state, action, next_state): 109 | return self.ICM.get_intr_reward(state, action, next_state) 110 | 111 | def backprop(self, state, next_state, R, Adv, action, state_mean, state_std): 112 | state, next_state, R, Adv, action, state_mean, state_std = totorch_many(state, next_state, R, Adv, 113 | action, state_mean, state_std, device=self.device) 114 | policy, value = self.AC.forward(state) 115 | action_onehot = F.one_hot(action.long(), self.action_size) 116 | policy_loss = self.AC.loss(policy, R, value, action_onehot) 117 | ICM_loss = self.ICM.loss((state-state_mean)/state_std, action, (next_state-state_mean)/state_std) 118 | loss = self.policy_importance * policy_loss + self.reward_scale * ICM_loss 119 | loss.backward() 120 | if self.grad_clip is not None: 121 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 122 | self.optimiser.step() 123 | self.optimiser.zero_grad() 124 | self.scheduler.step() 125 | return loss.detach().cpu().numpy() 126 | 127 | 128 | 129 | 130 | class Curiosity_Trainer(SyncMultiEnvTrainer): 131 | def __init__(self, envs, model, val_envs, train_mode='nstep', log_dir='logs/', total_steps=1000000, nsteps=5, validate_freq=1000000, save_freq=0, render_freq=0, num_val_episodes=50, max_val_steps=10000, log_scalars=True): 132 | super().__init__(envs, model, val_envs, train_mode=train_mode, return_type='nstep', log_dir=log_dir, total_steps=total_steps, nsteps=nsteps, validate_freq=validate_freq, 133 | save_freq=save_freq, render_freq=render_freq, update_target_freq=0, num_val_episodes=num_val_episodes, max_val_steps=max_val_steps, log_scalars=log_scalars) 134 | 135 | self.state_obs = RollingObs() 136 | self.state_mean = None 137 | self.state_std = None 138 | 139 | hyper_paras = {'learning_rate':model.lr, 'learning_rate_final':model.lr_final, 'lr_decay_steps':model.decay_steps, 140 | 'grad_clip':model.grad_clip, 'nsteps':self.nsteps, 'num_workers':self.num_envs, 'total_steps':self.total_steps, 141 | 'entropy_coefficient':0.01, 'value_coefficient':0.5, 'reward_scale':model.reward_scale, 142 | 'forward_model_scale':model.forward_coeff, 'policy_importance':model.policy_importance, 143 | 'gamma':self.gamma, 'lambda':self.lambda_} 144 | 145 | if self.log_scalars: 146 | filename = log_dir + '/hyperparameters.txt' 147 | self.save_hyperparameters(filename, **hyper_paras) 148 | 149 | self.lambda_ = 0.95 150 | 151 | def init_state_obs(self, num_steps): 152 | states = 0 153 | for i in range(num_steps): 154 | rand_actions = np.random.randint(0, self.model.action_size, size=self.num_envs) 155 | next_states, rewards, dones, infos = self.env.step(rand_actions) 156 | states += next_states 157 | return states / num_steps 158 | 159 | 160 | def _train_nstep(self): 161 | num_updates = self.total_steps // (self.num_envs * self.nsteps) 162 | s = 0 163 | self.state_mean, self.state_std = self.state_obs.update(self.init_state_obs(10000//self.num_envs)) 164 | self.states = self.env.reset() 165 | print(self.state_mean.shape, self.state_std.shape) 166 | start = time.time() 167 | # main loop 168 | batch_size = self.num_envs * self.nsteps 169 | for t in range(1,num_updates+1): 170 | states, next_states, actions, rewards, dones, values = self.rollout() 171 | _, last_values = self.model.evaluate(next_states[-1]) 172 | 173 | R = self.nstep_return(rewards, last_values, dones) 174 | Adv = R - values 175 | #delta = rewards + self.gamma * values[:-1] - values[1:] 176 | #Adv = self.multistep_target(delta, values[-1], dones, gamma=self.gamma*self.lambda_) 177 | 178 | # stack all states, next_states, 
actions and Rs across all workers into a single batch 179 | states, next_states, actions, R, Adv = fold_batch(states), fold_batch(next_states), fold_batch(actions), fold_batch(R), fold_batch(Adv) 180 | mean, std = self.state_mean, self.state_std 181 | 182 | l = self.model.backprop(states, next_states, R, Adv, actions, mean, std) 183 | 184 | # self.state_mean, self.state_std = self.state_obs.update(states) 185 | 186 | if self.render_freq > 0 and t % (self.validate_freq * self.render_freq) == 0: 187 | render = True 188 | else: 189 | render = False 190 | 191 | if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0: 192 | self.validation_summary(t,l,start,render) 193 | start = time.time() 194 | 195 | if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0: 196 | s += 1 197 | self.saver.save(self.sess, str(self.model_dir + self.current_time + '/' + str(s) + ".ckpt") ) 198 | print('saved model') 199 | 200 | 201 | 202 | def get_action(self, state): 203 | policy, value = self.model.evaluate(state) 204 | action = int(np.random.choice(policy.shape[1], p=policy[0])) 205 | return action 206 | 207 | 208 | def rollout(self,): 209 | rollout = [] 210 | for t in range(self.nsteps): 211 | start = time.time() 212 | policies, values = self.model.evaluate(self.states) 213 | actions = fastsample(policies) 214 | next_states, extr_rewards, dones, infos = self.env.step(actions) 215 | 216 | mean, std = self.state_mean[None], self.state_std[None] 217 | intr_rewards = self.model.intrinsic_reward((self.states-mean)/std, actions, (next_states-mean)/std) 218 | rewards = extr_rewards + intr_rewards 219 | rollout.append((self.states, next_states, actions, rewards, values, dones)) 220 | self.states = next_states 221 | 222 | states, next_states, actions, rewards, values, dones = stack_many(*zip(*rollout)) 223 | return states, next_states, actions, rewards, dones, values 224 | 225 | 226 | def main(env_id): 227 | num_envs = 32 228 | nsteps = 20 229 | 230 | classic_list = ['MountainCar-v0', 'Acrobot-v1', 'LunarLander-v2', 'CartPole-v0', 'CartPole-v1'] 231 | if any(env_id in s for s in classic_list): 232 | print('Classic Control') 233 | val_envs = [gym.make(env_id) for i in range(1)] 234 | envs = BatchEnv(DummyEnv, env_id, num_envs, blocking=False) 235 | 236 | else: 237 | env = gym.make(env_id) 238 | print('Atari') 239 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 240 | reset = True 241 | print('fire on reset') 242 | else: 243 | reset = False 244 | print('only stack frames') 245 | 246 | val_envs = [AtariEnv(gym.make(env_id), k=4, rescale=84, episodic=False, reset=reset, clip_reward=False) for i in range(1)] 247 | envs = BatchEnv(AtariEnv, env_id, num_envs, blocking=False, rescale=84, k=4, reset=reset, episodic=False, clip_reward=True, time_limit=4500) 248 | 249 | 250 | env.close() 251 | action_size = val_envs[0].action_space.n 252 | input_size = val_envs[0].reset().shape 253 | 254 | 255 | 256 | train_log_dir = 'logs/Curiosity/' + env_id + '/hyper_unclipped/' 257 | 258 | model = Curiosity(NatureCNN, 259 | NatureCNN, 260 | input_size=input_size, 261 | action_size=action_size, 262 | forward_coeff=0.2, 263 | policy_importance=1, 264 | reward_scale=1.0, 265 | entropy_coeff=0.01, 266 | #intr_coeff=1, 267 | lr=1e-3, 268 | lr_final=0, 269 | decay_steps=50e6//(num_envs*nsteps), 270 | grad_clip=0.5, 271 | policy_args={}, 272 | ICM_args={'scale':False}).cuda() 273 | 274 | 275 | 276 | curiosity = Curiosity_Trainer(envs=envs, 277 | model=model, 278 | val_envs=val_envs, 279 | train_mode='nstep', 280 | 
total_steps=5e6, 281 | nsteps=nsteps, 282 | validate_freq=1e5, 283 | save_freq=0, 284 | render_freq=0, 285 | num_val_episodes=1, 286 | log_dir=train_log_dir, 287 | log_scalars=False) 288 | print(env_id) 289 | curiosity.train() 290 | 291 | del curiosity 292 | 293 | if __name__ == "__main__": 294 | env_id_list = ['SpaceInvadersDeterministic-v4', 'FreewayDeterministic-v4', 'MontezumaRevengeDeterministic-v4', 'PongDeterministic-v4'] 295 | #env_id_list = ['MountainCar-v0', 'Acrobot-v1', 'CartPole-v1', ] 296 | #for i in range(5): 297 | for env_id in env_id_list: 298 | main(env_id) 299 | -------------------------------------------------------------------------------- /rlib/VIN/VIN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from torch.utils.tensorboard import SummaryWriter 5 | import datetime 6 | import threading 7 | import time 8 | 9 | from rlib.utils.VecEnv import* 10 | from rlib.utils.wrappers import* 11 | from rlib.utils.utils import fold_batch, stack_many, one_hot, totorch, totorch_many, tonumpy 12 | 13 | class VINCNN(torch.nn.Module): 14 | def __init__(self, input_size, action_size, k=10, lr=1e-3, device='cuda'): 15 | super(VINCNN, self).__init__() 16 | channels, height, width = input_size 17 | self.action_size = action_size 18 | self.conv_enc = torch.nn.Conv2d(channels, 150, kernel_size=[3,3], stride=[1,1], padding=1).to(device) # φ(s) 19 | self.R_bar = torch.nn.Conv2d(150, 1, kernel_size=[1,1], stride=[1,1], padding=0, bias=False).to(device) 20 | self.Q_bar = torch.nn.Conv2d(1, action_size, kernel_size=[3,3], stride=[1,1], padding=1, bias=False).to(device) 21 | self.w = torch.nn.Parameter(torch.zeros(action_size, 1, 3, 3), requires_grad=True).to(device) 22 | self.Q = torch.nn.Linear(action_size, action_size).to(device) 23 | self.k = k # nsteps to plan with VIN 24 | self.optim = torch.optim.RMSprop(params=self.parameters(), lr=lr) 25 | self.device = device 26 | 27 | def forward(self, img, x, y): 28 | hidden = self.conv_enc(img) 29 | R_bar = self.R_bar(hidden) 30 | Q_bar = self.Q_bar(R_bar) 31 | V_bar, _ = torch.max(Q_bar, dim=1, keepdim=True) 32 | batch_size = img.shape[0] 33 | psi = self._plan_ahead(R_bar, V_bar)[torch.arange(batch_size), :, x.long(), y.long()].view(batch_size, self.action_size) # ψ(s) 34 | Qsa = self.Q(psi) 35 | return Qsa 36 | 37 | 38 | def backprop(self, states, locs, R, actions): 39 | x, y = zip(*locs) 40 | Qsa = self.forward(totorch(states, self.device), torch.tensor(x).to(self.device), torch.tensor(y)).to(self.device) 41 | actions_onehot = totorch(one_hot(actions, self.action_size), self.device) 42 | Qvalue = torch.sum(Qsa * actions_onehot, axis=1) 43 | loss = torch.mean(torch.square(totorch(R).float().cuda() - Qvalue)) 44 | 45 | loss.backward() 46 | self.optim.step() 47 | self.optim.zero_grad() 48 | return loss.detach().cpu().numpy() 49 | 50 | 51 | def value_iteration(self, r, V): 52 | return F.conv2d( 53 | # Stack reward with most recent value 54 | torch.cat([r, V], 1), 55 | # Convolve r->q weights to r, and v->q weights for v. 
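# --- illustrative aside (toy shapes, hypothetical kernels), kept in comments ---
# stacking [r, V] on the channel axis and convolving with the stacked
# [reward->Q, value->Q] kernels performs one Bellman backup over every grid cell,
#     Q_bar(a, s) = (w_r * r)(s) + (w_v * V)(s),   V_bar(s) = max_a Q_bar(a, s)
# a standalone equivalent, assuming 4 actions on an 8x8 grid:
#     r, V = torch.randn(1, 1, 8, 8), torch.zeros(1, 1, 8, 8)
#     w_r, w_v = torch.randn(4, 1, 3, 3), torch.randn(4, 1, 3, 3)
#     for _ in range(10):                                    # k sweeps
#         Q = F.conv2d(torch.cat([r, V], 1), torch.cat([w_r, w_v], 1), padding=1)
#         V, _ = torch.max(Q, dim=1, keepdim=True)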
These represent transition probabilities 56 | torch.cat([self.Q_bar.weight, self.w], 1), 57 | stride=1, 58 | padding=1) 59 | 60 | def _plan_ahead(self, r, V): 61 | for i in range(self.k): 62 | Q = self.value_iteration(r, V) 63 | V, _ = torch.max(Q, dim=1, keepdim=True) 64 | 65 | Q = self.value_iteration(r, V) 66 | return Q 67 | 68 | 69 | 70 | class VINTrainer(object): 71 | def __init__(self, model, envs, val_envs, epsilon=0.1, epsilon_final=0.1, epsilon_steps=1000000, epsilon_test=0.1, 72 | return_type='nstep', log_dir='logs/', model_dir='models/', total_steps=50000000, nsteps=20, gamma=0.99, lambda_=0.95, 73 | validate_freq=1e6, save_freq=0, render_freq=0, update_target_freq=0, num_val_episodes=50, log_scalars=True): 74 | self.model = model 75 | self.env = envs 76 | self.num_envs = len(envs) 77 | self.val_envs = val_envs 78 | self.total_steps = total_steps 79 | self.action_size = self.model.action_size 80 | self.epsilon = epsilon 81 | self.epsilon_test = epsilon_test 82 | self.states = self.env.reset() 83 | self.loc = self.get_locs() 84 | print('locs', self.loc) 85 | 86 | self.total_steps = int(total_steps) 87 | self.nsteps = nsteps 88 | self.return_type = return_type 89 | self.gamma = gamma 90 | self.lambda_ = lambda_ 91 | 92 | self.validate_freq = int(validate_freq) 93 | self.num_val_episodes = num_val_episodes 94 | 95 | self.save_freq = int(save_freq) 96 | self.render_freq = render_freq 97 | self.target_freq = int(update_target_freq) 98 | self.t=1 99 | 100 | self.validate_rewards = [] 101 | self.lock = threading.Lock() 102 | self.scheduler = self.linear_schedule(epsilon, epsilon_final, epsilon_steps) 103 | 104 | self.log_scalars = log_scalars 105 | self.log_dir = log_dir 106 | 107 | if log_scalars: 108 | # Tensorboard Variables 109 | train_log_dir = self.log_dir + '/train' 110 | self.train_writer = SummaryWriter(train_log_dir) 111 | 112 | def nstep_return(self, rewards, last_values, dones, gamma=0.99, clip=False): 113 | if clip: 114 | rewards = np.clip(rewards, -1, 1) 115 | 116 | T = len(rewards) 117 | 118 | # Calculate R for advantage A = R - V 119 | R = np.zeros_like(rewards) 120 | R[-1] = last_values * (1-dones[-1]) 121 | 122 | for i in reversed(range(T-1)): 123 | # restart score if done as BatchEnv automatically resets after end of episode 124 | R[i] = rewards[i] + gamma * R[i+1] * (1-dones[i]) 125 | 126 | return R 127 | 128 | def lambda_return(self, rewards, values, last_values, dones, gamma=0.99, lambda_=0.8, clip=False): 129 | if clip: 130 | rewards = np.clip(rewards, -1, 1) 131 | T = len(rewards) 132 | # Calculate eligibility trace R^lambda 133 | R = np.zeros_like(rewards) 134 | R[-1] = last_values * (1-dones[-1]) 135 | for t in reversed(range(T-1)): 136 | # restart score if done as BatchEnv automatically resets after end of episode 137 | R[t] = rewards[t] + gamma * (lambda_* R[t+1] + (1.0-lambda_) * values[t+1]) * (1-dones[t]) 138 | 139 | return R 140 | 141 | def GAE(self, rewards, values, last_values, dones, gamma=0.99, lambda_=0.95, clip=False): 142 | if clip: 143 | rewards = np.clip(rewards, -1, 1) 144 | # Generalised Advantage Estimation 145 | Adv = np.zeros_like(rewards) 146 | Adv[-1] = rewards[-1] + gamma * last_values * (1-dones[-1]) - values[-1] 147 | T = len(rewards) 148 | for t in reversed(range(T-1)): 149 | delta = rewards[t] + gamma * values[t+1] * (1-dones[t]) - values[t] 150 | Adv[t] = delta + gamma * lambda_ * Adv[t+1] * (1-dones[t]) 151 | 152 | return Adv 153 | 154 | def get_locs(self): 155 | locs = [] 156 | for env in self.env.envs: 157 | 
locs.append(env.agent_loc) 158 | return locs 159 | 160 | def train(self): 161 | self.train_nstep() 162 | 163 | 164 | def train_nstep(self): 165 | batch_size = self.num_envs * self.nsteps 166 | num_updates = self.total_steps // batch_size 167 | # main loop 168 | start = time.time() 169 | for t in range(self.t,num_updates+1): 170 | states, locs, actions, rewards, dones, infos, values, last_values = self.rollout() 171 | if self.return_type == 'nstep': 172 | R = self.nstep_return(rewards, last_values, dones, gamma=self.gamma) 173 | elif self.return_type == 'GAE': 174 | R = self.GAE(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) + values 175 | elif self.return_type == 'lambda': 176 | R = self.lambda_return(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_, clip=False) 177 | # stack all states, actions and Rs from all workers into a single batch 178 | states, locs, actions, R = fold_batch(states), fold_batch(locs), fold_batch(actions), fold_batch(R) 179 | #print('locs', locs.shape) 180 | l = self.model.backprop(states, locs, R, actions) 181 | 182 | if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0: 183 | self.validation_summary(t,l,start,False) 184 | start = time.time() 185 | 186 | if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0: 187 | self.s += 1 188 | self.save(self.s) 189 | print('saved model') 190 | 191 | if self.target_freq > 0 and t % (self.target_freq // batch_size) == 0: # update target network (for value based learning e.g. DQN) 192 | self.update_target() 193 | 194 | self.t +=1 195 | 196 | def eval_state(self, state, loc): 197 | with torch.no_grad(): 198 | x, y = zip(*loc) 199 | x, y = torch.tensor(x).to(self.device), torch.tensor(y).to(self.device) 200 | state_torch = totorch(state, self.device) 201 | Qsa = self.model(state_torch, x, y) 202 | return tonumpy(Qsa) 203 | 204 | def rollout(self): 205 | rollout = [] 206 | for t in range(self.nsteps): 207 | Qsa = self.eval_state(self.states, self.loc) 208 | actions = np.argmax(Qsa, axis=1) 209 | random = np.random.uniform(size=(self.num_envs)) 210 | random_actions = np.random.randint(self.action_size, size=(self.num_envs)) 211 | actions = np.where(random < self.epsilon, random_actions, actions) 212 | next_states, rewards, dones, infos = self.env.step(actions) 213 | values = np.sum(Qsa * one_hot(actions, self.action_size), axis=-1) 214 | rollout.append((self.states, self.loc, actions, rewards, dones, infos, values)) 215 | self.states = next_states 216 | self.epsilon = self.scheduler.step() 217 | self.loc = self.get_locs() 218 | 219 | states, locs, actions, rewards, dones, infos, values = stack_many(*zip(*rollout)) 220 | 221 | last_Qsa = self.eval_state(next_states, self.loc) # Q(s,a|theta) 222 | last_actions = np.argmax(last_Qsa, axis=1) 223 | last_values = np.sum(last_Qsa * one_hot(last_actions, self.action_size), axis=-1) 224 | return states, locs, actions, rewards, dones, infos, values, last_values 225 | 226 | def get_action(self, state, loc): 227 | Qsa = self.eval_state(state, loc) 228 | if np.random.uniform() < self.epsilon_test: 229 | action = np.random.choice(self.action_size) 230 | else: 231 | action = np.argmax(Qsa, axis=1) 232 | return action 233 | 234 | def validation_summary(self,t,loss,start,render): 235 | batch_size = self.num_envs * self.nsteps 236 | tot_steps = t * batch_size 237 | time_taken = time.time() - start 238 | frames_per_update = (self.validate_freq // batch_size) * batch_size 239 | fps = frames_per_update /time_taken 
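# --- illustrative sketch (hypothetical numbers): the annealed epsilon-greedy step ---
# rollout() above takes the per-worker argmax action, swaps it for a random action
# with probability epsilon, and then shrinks epsilon linearly towards epsilon_final
# (see the linear_schedule class further down). A standalone equivalent:
import numpy as np

rng = np.random.default_rng(0)
num_envs, action_size = 4, 6
epsilon, epsilon_final, epsilon_steps = 1.0, 0.1, 1000000
step = (epsilon - epsilon_final) / epsilon_steps

Qsa = rng.normal(size=(num_envs, action_size))                # stand-in Q-values
greedy = np.argmax(Qsa, axis=1)
random_actions = rng.integers(action_size, size=num_envs)
actions = np.where(rng.uniform(size=num_envs) < epsilon, random_actions, greedy)
epsilon = max(epsilon - step, epsilon_final)                  # one annealing step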
240 | num_val_envs = len(self.val_envs) 241 | num_val_eps = [self.num_val_episodes//num_val_envs for i in range(num_val_envs)] 242 | num_val_eps[-1] = num_val_eps[-1] + self.num_val_episodes % self.num_val_episodes//(num_val_envs) 243 | render_array = np.zeros((len(self.val_envs))) 244 | render_array[0] = render 245 | threads = [threading.Thread(daemon=True, target=self.validate, args=(self.val_envs[i], num_val_eps[i], 10000, render_array[i])) for i in range(num_val_envs)] 246 | try: 247 | for thread in threads: 248 | thread.start() 249 | 250 | for thread in threads: 251 | thread.join() 252 | 253 | except KeyboardInterrupt: 254 | for thread in threads: 255 | thread.join() 256 | 257 | 258 | score = np.mean(self.validate_rewards) 259 | self.validate_rewards = [] 260 | print("update %i, validation score %f, total steps %i, loss %f, time taken for %i frames:%fs, fps %f" %(t,score,tot_steps,loss,frames_per_update,time_taken,fps)) 261 | 262 | if self.log_scalars: 263 | self.train_writer.add_scalar('Validation/Score', score) 264 | self.train_writer.add_scalar('Training/Loss', loss) 265 | 266 | 267 | def validate(self,env,num_ep,max_steps,render=False): 268 | episode_scores = [] 269 | for episode in range(num_ep): 270 | state = env.reset() 271 | loc = env.agent_loc 272 | episode_score = [] 273 | for t in range(max_steps): 274 | action = self.get_action(state[np.newaxis], [loc]) 275 | next_state, reward, done, info = env.step(action) 276 | state = next_state 277 | loc = env.agent_loc 278 | 279 | episode_score.append(reward) 280 | 281 | if render: 282 | with self.lock: 283 | env.render() 284 | 285 | if done or t == max_steps -1: 286 | tot_reward = np.sum(episode_score) 287 | with self.lock: 288 | self.validate_rewards.append(tot_reward) 289 | 290 | break 291 | if render: 292 | with self.lock: 293 | env.close() 294 | 295 | class linear_schedule(object): 296 | def __init__(self, epsilon, epsilon_final, num_steps=1000000): 297 | self._counter = 0 298 | self._epsilon = epsilon 299 | self._epsilon_final = epsilon_final 300 | self._step = (epsilon - epsilon_final) / num_steps 301 | self._num_steps = num_steps 302 | 303 | def step(self,): 304 | if self._counter < self._num_steps : 305 | self._epsilon -= self._step 306 | self._counter += 1 307 | else: 308 | self._epsilon = self._epsilon_final 309 | 310 | return self._epsilon 311 | 312 | 313 | 314 | def main(env_id): 315 | num_envs = 32 316 | nsteps = 1 317 | 318 | current_time = datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') 319 | 320 | train_log_dir = 'logs/VIN/' + env_id +'/n_step/' + current_time 321 | model_dir = "models/VIN/" + env_id + '/n_step/' + current_time 322 | 323 | if 'ApplePicker' in env_id: 324 | print('ApplePicker') 325 | make_args = {'num_objects':300, 'default_reward':-0.01} 326 | val_envs = [apple_pickgame(gym.make('ApplePicker-v0', **make_args)) for i in range(10)] 327 | envs = DummyBatchEnv(apple_pickgame, 'ApplePicker-v0', num_envs, max_steps=1000, auto_reset=True, make_args=make_args) 328 | print(val_envs[0]) 329 | print(envs.envs[0]) 330 | 331 | else: 332 | print('Atari') 333 | env = gym.make(env_id) 334 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 335 | reset = True 336 | print('fire on reset') 337 | else: 338 | reset = False 339 | print('only stack frames') 340 | env.close() 341 | val_envs = [AtariEnv(gym.make(env_id), k=4, episodic=False, reset=reset, clip_reward=False) for i in range(5)] 342 | envs = BatchEnv(AtariEnv, env_id, num_envs, blocking=False, k=4, reset=reset, episodic=False, clip_reward=True) 343 | 
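# --- illustrative smoke test (hypothetical toy shapes, CPU) for the VINCNN defined above ---
# VINCNN takes a (channels, height, width) observation plus the agent's (x, y) grid
# position and returns one Q-value per action after k planning sweeps:
def _vin_smoke_test():
    vin = VINCNN(input_size=(3, 16, 16), action_size=4, k=10, device='cpu')
    img = torch.zeros(2, 3, 16, 16)                    # batch of 2 toy observations
    x = torch.tensor([5, 7])                           # agent row per sample
    y = torch.tensor([3, 12])                          # agent column per sample
    Qsa = vin(img, x, y)                               # -> torch.Size([2, 4])
    return Qsa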
344 | action_size = val_envs[0].action_space.n 345 | input_size = val_envs[0].reset().shape 346 | print('input shape', input_size) 347 | print('action space', action_size) 348 | 349 | 350 | 351 | vin = VINCNN(input_size, 352 | action_size, 353 | k=50, 354 | lr=1e-3).cuda() 355 | 356 | 357 | trainer = VINTrainer(envs=envs, 358 | model=vin, 359 | log_dir=train_log_dir, 360 | val_envs=val_envs, 361 | return_type='nstep', 362 | total_steps=10e6, 363 | nsteps=nsteps, 364 | validate_freq=1e5, 365 | save_freq=0, 366 | render_freq=0, 367 | num_val_episodes=10, 368 | log_scalars=False) 369 | 370 | trainer.train() 371 | 372 | 373 | if __name__ == "__main__": 374 | import apple_picker 375 | #env_id_list = ['SpaceInvadersDeterministic-v4', 'FreewayDeterministic-v4', 'MontezumaRevengeDeterministic-v4', 'PongDeterministic-v4'] 376 | #env_id_list = ['MontezumaRevengeDeterministic-v4'] 377 | env_id_list = ['ApplePicker-v0'] 378 | for env_id in env_id_list: 379 | main(env_id) 380 | -------------------------------------------------------------------------------- /rlib/utils/SyncMultiEnvTrainer.py: -------------------------------------------------------------------------------- 1 | import time, datetime, os 2 | import threading 3 | import numpy as np 4 | import torch 5 | import copy 6 | import json 7 | from typing import Union 8 | from abc import ABC, abstractmethod 9 | from rlib.utils.utils import fold_batch 10 | from rlib.utils.VecEnv import BatchEnv, DummyBatchEnv 11 | import torch 12 | from torch.utils.tensorboard import SummaryWriter 13 | 14 | 15 | 16 | class SyncMultiEnvTrainer(object): 17 | def __init__(self, envs: Union[BatchEnv, DummyBatchEnv], model:torch.nn.Module, val_envs: Union[list, BatchEnv, DummyBatchEnv], train_mode='nstep', return_type='nstep', log_dir='logs/', model_dir='models/', total_steps=50e6, nsteps=5, gamma=0.99, lambda_=0.95, 18 | validate_freq=1e6, save_freq=0, render_freq=0, update_target_freq=0, num_val_episodes=50, max_val_steps=10000, log_scalars=True): 19 | ''' 20 | A synchronous multiple env training framework for pytorch 21 | 22 | Args: 23 | envs - BatchEnv | DummyBatchEnv: multiple synchronous training environments 24 | model - reinforcement learning model 25 | log_dir, log directory string for location of directory to log scalars log_dir='logs/', model_dir='models/', 26 | val_envs - use your own discretion to choose which validation mode you wan't, recommended BatchEnv or list for Atari and DummyBatchEnv for Classic Control like envs 27 | list: a list of envs for validation, uses threading to run environments asychronously 28 | BatchEnv: uses multiprocessing to run validation envs sychronously in parallel 29 | DummyBatchEnv: allows for sychronous env stepping without the overhead of multiprocessing, good for computationally cheap environments 30 | train_mode - 'nstep' or 'onestep' species whether training is done using multiple step TD learning or single step 31 | return_type - string to determine whether 'nstep', 'lambda' or 'GAE' returns are to be used 32 | total_steps - number of Total training steps across all environements 33 | nsteps - number of steps TD error is caluclated over 34 | validate_freq - number of steps across all environements before performing validating, 0 for no validation 35 | save_freq - number of steps across all environements before saving model, 0 for no saving 36 | render_freq - multiple of validate_freq before rendering (i.e. 
render every X validations), 0 for no rendering 37 | update_target_freq - number of steps across all environements before updating target model, 0 for no updating 38 | num_val_episodes - number of episodes to average over when validating 39 | max_val_steps - maximum number of steps for each validation episode (prevents infinite loops) 40 | log_scalars - boolean flag whether to log tensorboard scalars to log_dir 41 | ''' 42 | self.env = envs 43 | if isinstance(envs, list): 44 | self.validate_func = self.validate_async 45 | else: 46 | self.validate_func = self.validate_sync 47 | if train_mode not in ['nstep', 'onestep']: 48 | raise ValueError('train_mode %s is not a valid argument. Valid arguments are ... %s, %s' %(train_mode,'nstep','onestep')) 49 | assert num_val_episodes >= len(val_envs), 'number of validation epsiodes {} must be greater than or equal to the number of validation envs {}'.format(num_val_episodes, len(val_envs)) 50 | if return_type not in ['nstep', 'lambda', 'GAE']: 51 | raise ValueError('return_type %s is not a valid argument. Valid arguments are ... %s, %s, %s' %(return_type, 'nstep', 'lambda', 'GAE')) 52 | self.train_mode = train_mode 53 | self.num_envs = len(envs) 54 | self.env_id = envs.spec.id 55 | self.val_envs = val_envs 56 | self.validate_rewards = [] 57 | self.model = model 58 | 59 | self.total_steps = int(total_steps) 60 | self.nsteps = nsteps 61 | self.return_type = return_type 62 | self.gamma = gamma 63 | self.lambda_ = lambda_ 64 | 65 | self.validate_freq = int(validate_freq) 66 | self.num_val_episodes = num_val_episodes 67 | self.val_steps = max_val_steps 68 | self.lock = threading.Lock() 69 | 70 | self.save_freq = int(save_freq) 71 | self.render_freq = render_freq 72 | self.target_freq = int(update_target_freq) 73 | self.s = 0 # number of saves made 74 | self.t = 1 # number of updates done 75 | self.log_scalars = log_scalars 76 | self.log_dir = log_dir 77 | self.model_dir = model_dir 78 | 79 | self.states = self.env.reset() 80 | 81 | if log_scalars: 82 | # Tensorboard Variables 83 | self.train_log_dir = self.log_dir + '/train' 84 | self.train_writer = SummaryWriter(self.train_log_dir) 85 | 86 | if not os.path.exists(self.model_dir) and save_freq > 0: 87 | os.makedirs(self.model_dir) 88 | 89 | def __del__(self): 90 | self.env.close() 91 | 92 | 93 | def train(self): 94 | if self.train_mode == 'nstep': 95 | self._train_nstep() 96 | elif self.train_mode == 'onestep': 97 | self._train_onestep() 98 | else: 99 | raise ValueError('%s is not a valid training mode'%(self.train_mode)) 100 | 101 | @abstractmethod 102 | def _train_nstep(self): 103 | ''' 104 | template for multi-step training loop for synchronous training over multiple environments 105 | ''' 106 | start = time.time() 107 | batch_size = self.num_envs * self.nsteps 108 | num_updates = self.total_steps // batch_size 109 | # main loop 110 | for t in range(self.t,num_updates+1): 111 | states, actions, rewards, dones, values, last_values = self.rollout() 112 | if self.return_type == 'nstep': 113 | R = self.nstep_return(rewards, last_values, dones, gamma=self.gamma) 114 | elif self.return_type == 'GAE': 115 | R = self.GAE(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) + values 116 | elif self.return_type == 'lambda': 117 | R = self.lambda_return(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_, clip=False) 118 | # stack all states, actions and Rs from all workers into a single batch 119 | states, actions, R = fold_batch(states), fold_batch(actions), 
fold_batch(R) 120 | l = self.model.backprop(states, R, actions) 121 | 122 | if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0: 123 | render = True 124 | else: 125 | render = False 126 | 127 | if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0: 128 | self.validation_summary(t,l,start,render) 129 | start = time.time() 130 | 131 | if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0: 132 | self.s += 1 133 | self.save(self.s) 134 | print('saved model') 135 | 136 | if self.target_freq > 0 and t % (self.target_freq // batch_size) == 0: # update target network (for value based learning e.g. DQN) 137 | self.update_target() 138 | 139 | self.t +=1 140 | 141 | 142 | @abstractmethod 143 | def rollout(self): 144 | raise NotImplementedError(self, 'No rollout method found') 145 | 146 | def nstep_return(self, rewards, last_values, dones, gamma=0.99, clip=False): 147 | if clip: 148 | rewards = np.clip(rewards, -1, 1) 149 | 150 | T = len(rewards) 151 | 152 | # Calculate R for advantage A = R - V 153 | R = np.zeros_like(rewards) 154 | R[-1] = last_values * (1-dones[-1]) 155 | 156 | for i in reversed(range(T-1)): 157 | # restart score if done as BatchEnv automatically resets after end of episode 158 | R[i] = rewards[i] + gamma * R[i+1] * (1-dones[i]) 159 | 160 | return R 161 | 162 | def lambda_return(self, rewards, values, last_values, dones, gamma=0.99, lambda_=0.8, clip=False): 163 | if clip: 164 | rewards = np.clip(rewards, -1, 1) 165 | T = len(rewards) 166 | # Calculate eligibility trace R^lambda 167 | R = np.zeros_like(rewards) 168 | R[-1] = last_values * (1-dones[-1]) 169 | for t in reversed(range(T-1)): 170 | # restart score if done as BatchEnv automatically resets after end of episode 171 | R[t] = rewards[t] + gamma * (lambda_* R[t+1] + (1.0-lambda_) * values[t+1]) * (1-dones[t]) 172 | 173 | return R 174 | 175 | def GAE(self, rewards, values, last_values, dones, gamma=0.99, lambda_=0.95, clip=False): 176 | if clip: 177 | rewards = np.clip(rewards, -1, 1) 178 | # Generalised Advantage Estimation 179 | Adv = np.zeros_like(rewards) 180 | Adv[-1] = rewards[-1] + gamma * last_values * (1-dones[-1]) - values[-1] 181 | T = len(rewards) 182 | for t in reversed(range(T-1)): 183 | delta = rewards[t] + gamma * values[t+1] * (1-dones[t]) - values[t] 184 | Adv[t] = delta + gamma * lambda_ * Adv[t+1] * (1-dones[t]) 185 | 186 | return Adv 187 | 188 | def validation_summary(self, t, loss, start, render): 189 | batch_size = self.num_envs * self.nsteps 190 | tot_steps = t * batch_size 191 | time_taken = time.time() - start 192 | frames_per_update = (self.validate_freq // batch_size) * batch_size 193 | fps = frames_per_update / time_taken 194 | 195 | score = self.validate_func(render) 196 | print("update %i, validation score %f, total steps %i, loss %f, time taken for %i frames:%fs, fps %f \t\t\t" %(t,score,tot_steps,loss,frames_per_update,time_taken,fps)) 197 | 198 | if self.log_scalars: 199 | self.train_writer.add_scalar('validation/score', score, tot_steps) 200 | self.train_writer.add_scalar('train/loss', loss, tot_steps) 201 | 202 | 203 | def save_model(self, s): 204 | model_loc = f'{self.model_dir}/{s}.pt' 205 | # default saving method is to save session 206 | torch.save(self.model.state_dict(), model_loc) 207 | 208 | def load_model(self, modelname, model_dir="models/"): 209 | filename = model_dir + modelname + '.pt' 210 | if os.path.exists(filename): 211 | self.model.load_state_dict(torch.load(filename)) 212 | print("loaded:", filename) 
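# --- illustrative usage note (hypothetical run names): checkpointing a training run ---
# save(s) writes two artefacts per checkpoint: '<model_dir>/<s>.trainer', a JSON dump
# of the attributes returned by base_attr()/local_attr(), and '<model_dir>/<s>.pt',
# the model state_dict written by save_model(). Restoring later looks roughly like:
#     trainer.save(3)                                   # models/<run>/3.trainer + 3.pt
#     new_trainer = trainer.load(MyTrainer, model, '3', envs, val_envs,
#                                filename='models/<run>/3.trainer')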
213 | else: 214 | print(filename, " does not exist") 215 | 216 | def base_attr(self): 217 | attributes = {'train_mode':self.train_mode, 218 | 'total_steps':self.total_steps, 219 | 'nsteps':self.nsteps, 220 | 'return_type':self.return_type, 221 | 'gamma':self.gamma, 222 | 'lambda_':self.lambda_, 223 | 'validate_freq':self.validate_freq, 224 | 'num_val_episodes':self.num_val_episodes, 225 | 'save_freq':self.save_freq, 226 | 'render_freq':self.render_freq, 227 | 'model_dir':self.model_dir, 228 | 'train_log_dir':self.train_log_dir, 229 | 's':self.s, 230 | 't':self.t} 231 | 232 | return attributes 233 | 234 | def local_attr(self, attr): 235 | # attr[variable] = z 236 | return attr 237 | 238 | def save(self, s): 239 | model_loc = str(self.model_dir + '/' + str(s) + '.trainer') 240 | file = open(model_loc, 'w+') 241 | attributes = self.base_attr() 242 | # add local variables to dict 243 | attributes = self.local_attr(attributes) 244 | json.dump(attributes, file) 245 | # save model 246 | self.save_model(s) 247 | file.close() 248 | 249 | def load(self, Class, model, model_checkpoint, envs, val_envs, filename, log_scalars=True, allow_gpu_growth=True, continue_train=True): 250 | with open(filename, 'r') as file: 251 | attrs = json.loads(file.read()) 252 | s = attrs.pop('s') 253 | t = attrs.pop('t') 254 | time = attrs.pop('current_time') 255 | print(attrs) 256 | trainer = Class(envs=envs, model=model, val_envs=val_envs, log_scalars=log_scalars, gpu_growth=allow_gpu_growth, **attrs) 257 | if continue_train: 258 | trainer.s = s 259 | trainer.t = t 260 | self.load_model(model_checkpoint, trainer.model_dir) 261 | return trainer 262 | 263 | @abstractmethod 264 | def update_target(self): 265 | pass 266 | 267 | @abstractmethod 268 | def _train_onestep(self): 269 | ''' more efficient implementation of train_nstep when nsteps=1 270 | ''' 271 | raise NotImplementedError(self, 'does not have an one-step training implementation') 272 | 273 | def save_hyperparameters(self, filename, **kwargs): 274 | handle = open(filename, "w") 275 | for key, value in kwargs.items(): 276 | handle.write("{} = {}\n" .format(key, value)) 277 | handle.close() 278 | 279 | def validate_async(self, render=False): 280 | num_val_envs = len(self.val_envs) 281 | num_val_eps = [self.num_val_episodes//num_val_envs for i in range(num_val_envs)] 282 | num_val_eps[-1] = num_val_eps[-1] + self.num_val_episodes % self.num_val_episodes//(num_val_envs) 283 | render_array = np.zeros((len(self.val_envs))) 284 | render_array[0] = render 285 | threads = [threading.Thread(daemon=True, target=self._validate_async, args=(self.val_envs[i], num_val_eps[i], self.val_steps, render_array[i])) for i in range(num_val_envs)] 286 | 287 | try: 288 | for thread in threads: 289 | thread.start() 290 | 291 | for thread in threads: 292 | thread.join() 293 | 294 | except KeyboardInterrupt: 295 | for thread in threads: 296 | thread.join() 297 | 298 | 299 | score = np.mean(self.validate_rewards) 300 | self.validate_rewards = [] 301 | return score 302 | 303 | def _validate_async(self, env, num_ep, max_steps, render=False): 304 | 'single env validation' 305 | for episode in range(num_ep): 306 | state = env.reset() 307 | episode_score = [] 308 | for t in range(max_steps): 309 | action = self.get_action(state[np.newaxis]) 310 | next_state, reward, done, info = env.step(action) 311 | state = next_state 312 | #print('state', state, 'action', action, 'reward', reward) 313 | 314 | episode_score.append(reward) 315 | 316 | if render: 317 | with self.lock: 318 | env.render() 319 | 320 
| if done or t == max_steps -1: 321 | tot_reward = np.sum(episode_score) 322 | with self.lock: 323 | self.validate_rewards.append(tot_reward) 324 | 325 | break 326 | if render: 327 | with self.lock: 328 | env.close() 329 | 330 | def validate_sync(self, render=False): 331 | 'batch env validation' 332 | episode_scores = [] 333 | env = self.val_envs 334 | for episode in range(self.num_val_episodes//len(env)): 335 | states = env.reset() 336 | episode_score = [] 337 | for t in range(self.val_steps): 338 | actions = self.get_action(states) 339 | next_states, rewards, dones, infos = env.step(actions) 340 | states = next_states 341 | #print('state', state, 'action', action, 'reward', reward) 342 | 343 | episode_score.append(rewards*(1-dones)) 344 | 345 | if render: 346 | with self.lock: 347 | env.render() 348 | 349 | if dones.sum() == self.num_envs or t == self.val_steps -1: 350 | tot_reward = np.sum(np.stack(episode_score), axis=0) 351 | episode_scores.append(tot_reward) 352 | break 353 | 354 | return np.mean(episode_scores) 355 | 356 | def get_action(self, state): # include small fn in order to reuse validate 357 | raise NotImplementedError('get_action method is required when using the default validation functions, check that this is implemented properly') 358 | 359 | def fold_batch(self, x): 360 | rows, cols = x.shape[0], x.shape[1] 361 | y = x.reshape(rows*cols,*x.shape[2:]) 362 | return y 363 | 364 | 365 | 366 | 367 | 368 | # class Runner(ABC): 369 | # def __init__(self,model,env,num_steps): 370 | # self.model = model 371 | # self.env = env 372 | # self.num_steps = num_steps 373 | # self.states = self.env.reset() 374 | 375 | # @abstractmethod 376 | # def run(self): 377 | # pass -------------------------------------------------------------------------------- /rlib/DAAC/DAAC.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import time 4 | import datetime 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | from rlib.networks.networks import * 9 | from rlib.utils.VecEnv import* 10 | from rlib.utils.wrappers import* 11 | from rlib.utils.SyncMultiEnvTrainer import SyncMultiEnvTrainer 12 | from rlib.utils.utils import fastsample, fold_batch, tonumpy, totorch, totorch_many, stack_many, fold_many 13 | from rlib.utils.schedulers import polynomial_sheduler 14 | 15 | class ValueModel(torch.nn.Module): 16 | def __init__(self, model, input_shape, action_size, lr=1e-3, lr_final=0, decay_steps=6e5, grad_clip=0.5, 17 | build_optimiser=True, optim=torch.optim.Adam, optim_args={}, device='cuda', **model_args): 18 | super(ValueModel, self).__init__() 19 | self.lr = lr 20 | self.lr_final = lr_final 21 | self.action_size = action_size 22 | self.decay_steps = decay_steps 23 | self.grad_clip = grad_clip 24 | self.device = device 25 | 26 | self.model = model(input_shape, **model_args).to(self.device) 27 | dense_size = self.model.dense_size 28 | self.V = torch.nn.Linear(dense_size, 1).to(self.device) 29 | 30 | if build_optimiser: 31 | self.optimiser = optim(self.parameters(), lr, **optim_args) 32 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 33 | 34 | 35 | def forward(self, state): 36 | enc_state = self.model(state) 37 | value = self.V(enc_state).view(-1) 38 | return value 39 | 40 | def loss(self, V, R): 41 | value_loss = 0.5 * torch.mean(torch.square(R - V)) 42 | return value_loss 43 | 44 | def backprop(self, state, R): 45 | state, R = totorch_many(state, R, device=self.device) 46 | value = 
self.forward(state) 47 | loss = self.loss(value, R) 48 | 49 | loss.backward() 50 | if self.grad_clip is not None: 51 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 52 | self.optimiser.step() 53 | self.optimiser.zero_grad() 54 | self.scheduler.step() 55 | return loss.detach().cpu().numpy() 56 | 57 | 58 | 59 | class PolicyModel(torch.nn.Module): 60 | # PPO Policy 61 | def __init__(self, model, input_shape, action_size, lr=1e-3, lr_final=0, decay_steps=6e5, grad_clip=0.5, entropy_coeff=0.01, policy_clip=0.1, adv_coeff=0.25, 62 | build_optimiser=True, optim=torch.optim.Adam, optim_args={}, device='cuda', **model_args): 63 | super(PolicyModel, self).__init__() 64 | self.lr = lr 65 | self.lr_final = lr_final 66 | self.action_size = action_size 67 | self.entropy_coeff = entropy_coeff 68 | self.decay_steps = decay_steps 69 | self.grad_clip = grad_clip 70 | self.policy_clip = policy_clip 71 | self.adv_coeff = adv_coeff 72 | self.device = device 73 | 74 | self.model = model(input_shape, **model_args).to(self.device) 75 | dense_size = self.model.dense_size 76 | self.policy = torch.nn.Sequential(torch.nn.Linear(dense_size, action_size), torch.nn.Softmax(dim=-1)).to(self.device) 77 | self.Adv = torch.nn.Linear(dense_size, 1).to(self.device) 78 | 79 | if build_optimiser: 80 | self.optimiser = optim(self.parameters(), lr, **optim_args) 81 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 82 | 83 | 84 | def forward(self, state): 85 | enc_state = self.model(state) 86 | policy = self.policy(enc_state) 87 | Adv = self.Adv(enc_state).view(-1) 88 | return policy, Adv 89 | 90 | def loss(self, policy, Adv_hat, Adv, action_onehot, old_policy): 91 | policy_actions = torch.sum(policy * action_onehot, dim=1) 92 | old_policy_actions = torch.sum(old_policy * action_onehot, dim=1) 93 | ratio = policy_actions / old_policy_actions 94 | policy_loss_unclipped = ratio * -Adv 95 | policy_loss_clipped = torch.clip_(ratio, 1 - self.policy_clip, 1 + self.policy_clip) * -Adv 96 | policy_loss = torch.mean(torch.maximum(policy_loss_unclipped, policy_loss_clipped)) 97 | entropy = torch.mean(torch.sum(policy * -torch.log(policy), dim=1)) 98 | 99 | adv_loss = torch.square(Adv_hat - Adv).sum(dim=-1).mean() 100 | 101 | loss = policy_loss - self.entropy_coeff * entropy + self.adv_coeff * adv_loss 102 | return loss 103 | 104 | def backprop(self, state, Adv, action, old_policy): 105 | state, action, Adv, old_policy = totorch_many(state, action, Adv, old_policy, device=self.device) 106 | policy, Adv_hat = self.forward(state) 107 | action_onehot = F.one_hot(action.long(), self.action_size) 108 | loss = self.loss(policy, Adv_hat, Adv, action_onehot, old_policy) 109 | 110 | loss.backward() 111 | if self.grad_clip is not None: 112 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 113 | self.optimiser.step() 114 | self.optimiser.zero_grad() 115 | self.scheduler.step() 116 | return loss.detach().cpu().numpy() 117 | 118 | 119 | 120 | class DAAC(torch.nn.Module): 121 | # Decoupling Value and Policy for Generalization in Reinforcement Learning 122 | # https://arxiv.org/pdf/2102.10330.pdf 123 | def __init__(self, policy_model, value_model, input_shape, action_size, entropy_coeff=0.01, adv_coeff=0.25, policy_clip=0.1, lr=5e-4, lr_final=1e-5, decay_steps=6e5, grad_clip=0.2, device='cuda', 124 | policy_optim=torch.optim.Adam, policy_optim_args={}, policy_model_args={}, 125 | value_optim=torch.optim.Adam, value_optim_args={}, value_model_args={}): 126 | super(DAAC, 
self).__init__() 127 | self.lr = lr 128 | self.lr_final = lr_final 129 | self.decay_steps = decay_steps 130 | self.entropy_coeff = entropy_coeff 131 | self.adv_coeff = adv_coeff 132 | self.grad_clip = grad_clip 133 | self.policy_clip = policy_clip 134 | 135 | self.value = ValueModel(value_model, input_shape, action_size, lr=lr, lr_final=lr_final, decay_steps=decay_steps, grad_clip=grad_clip, optim=value_optim, optim_args=value_optim_args, device=device, **value_model_args) 136 | 137 | self.policy = PolicyModel(policy_model, input_shape, action_size, lr=lr, lr_final=lr_final, decay_steps=decay_steps, grad_clip=grad_clip, 138 | entropy_coeff=entropy_coeff, adv_coeff=adv_coeff, policy_clip=policy_clip, 139 | optim=policy_optim, optim_args=policy_optim_args, device=device, **policy_model_args) 140 | 141 | def get_policy(self, state:np.ndarray): 142 | with torch.no_grad(): 143 | policy, Adv = self.policy.forward(totorch(state, self.policy.device)) 144 | return tonumpy(policy), tonumpy(Adv) 145 | 146 | def get_value(self, state:np.ndarray): 147 | with torch.no_grad(): 148 | value = self.value.forward(totorch(state, self.value.device)) 149 | return tonumpy(value) 150 | 151 | def evaluate(self, state:np.ndarray): 152 | with torch.no_grad(): 153 | policy, _ = self.policy.forward(totorch(state, self.policy.device)) 154 | value = self.value.forward(totorch(state, self.value.device)) 155 | return tonumpy(policy), tonumpy(value) 156 | 157 | def backprop(self, state, R, Adv, action, old_policy): 158 | policy_loss = self.policy.backprop(state, Adv, action, old_policy) 159 | value_loss = self.value.backprop(state, R) 160 | return policy_loss + value_loss 161 | 162 | 163 | class DAACTrainer(SyncMultiEnvTrainer): 164 | def __init__(self, envs, model, val_envs, train_mode='nstep', log_dir='logs/', model_dir='models/', total_steps=1000000, nsteps=5, gamma=0.99, lambda_=0.95, 165 | policy_epochs=1, value_epochs=9, num_minibatches=8, validate_freq=1000000.0, save_freq=0, render_freq=0, num_val_episodes=50, max_val_steps=10000, log_scalars=True): 166 | 167 | super().__init__(envs, model, val_envs, train_mode=train_mode, log_dir=log_dir, model_dir=model_dir, total_steps=total_steps, nsteps=nsteps, gamma=gamma, lambda_=lambda_, 168 | validate_freq=validate_freq, save_freq=save_freq, render_freq=render_freq, update_target_freq=0, num_val_episodes=num_val_episodes, max_val_steps=max_val_steps, log_scalars=log_scalars) 169 | 170 | self.policy_epochs = policy_epochs 171 | self.value_epochs = value_epochs 172 | self.num_minibatches = num_minibatches 173 | 174 | hyper_paras = {'learning_rate':model.lr, 'learning_rate_final':model.lr_final, 'lr_decay_steps':model.decay_steps, 175 | 'grad_clip':model.grad_clip, 'nsteps':self.nsteps, 'num_workers':self.num_envs, 'total_steps':self.total_steps, 176 | 'entropy_coefficient':self.model.entropy_coeff, 'advantage_coefficient':self.model.adv_coeff, 'value_coefficient':1.0, 'policy_clip':self.model.policy_clip, 177 | 'num_minibatches':self.num_minibatches, 'policy_epochs':self.policy_epochs, 'value_epochs':self.value_epochs, 'gamma':self.gamma, 'lambda':self.lambda_ 178 | } 179 | 180 | if log_scalars: 181 | filename = log_dir + '/hyperparameters.txt' 182 | self.save_hyperparameters(filename, **hyper_paras) 183 | 184 | 185 | 186 | def _train_nstep(self): 187 | batch_size = self.num_envs * self.nsteps 188 | num_updates = self.total_steps // batch_size 189 | s = 0 190 | mini_batch_size = self.nsteps//self.num_minibatches 191 | start = time.time() 192 | # main loop 193 | for t in 
range(1,num_updates+1): 194 | #rollout_start = time.time() 195 | states, actions, rewards, values, last_values, old_policies, dones = self.rollout() 196 | #print('rollout time', time.time()-rollout_start) 197 | Adv = self.GAE(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) 198 | R = self.lambda_return(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) 199 | l = 0 200 | 201 | 202 | idxs = np.arange(len(states)) 203 | value_loss = 0 204 | for epoch in range(self.value_epochs): 205 | np.random.shuffle(idxs) 206 | for batch in range(0, len(states), mini_batch_size): 207 | batch_idxs = idxs[batch: batch + mini_batch_size] 208 | # stack all states, actions and Rs across all workers into a single batch 209 | mb_states, mb_Rs, = fold_many(states[batch_idxs], R[batch_idxs]) 210 | 211 | value_loss += self.model.value.backprop(mb_states.copy(), mb_Rs.copy()) 212 | 213 | value_loss /= self.value_epochs 214 | 215 | idxs = np.arange(len(states)) 216 | policy_loss = 0 217 | for epoch in range(self.policy_epochs): 218 | np.random.shuffle(idxs) 219 | for batch in range(0, len(states), mini_batch_size): 220 | batch_idxs = idxs[batch: batch + mini_batch_size] 221 | # stack all states, actions and Rs across all workers into a single batch 222 | mb_states, mb_actions, mb_Adv, mb_old_policies = fold_many(states[batch_idxs], actions[batch_idxs], 223 | Adv[batch_idxs], old_policies[batch_idxs]) 224 | 225 | policy_loss += self.model.policy.backprop(mb_states.copy(), mb_Adv.copy(), mb_actions.copy(), mb_old_policies.copy()) 226 | 227 | policy_loss /= self.policy_epochs 228 | l = policy_loss + value_loss 229 | 230 | if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0: 231 | render = True 232 | else: 233 | render = False 234 | 235 | if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0: 236 | #val_time = time.time() 237 | self.validation_summary(t,l,start,render) 238 | #print('validation time', time.time()-val_time) 239 | start = time.time() 240 | 241 | if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0: 242 | s += 1 243 | self.save(s) 244 | print('saved model') 245 | 246 | 247 | def get_action(self, states): 248 | policies, values = self.model.evaluate(states) 249 | actions = fastsample(policies) 250 | return actions 251 | 252 | def rollout(self): 253 | rollout = [] 254 | for t in range(self.nsteps): 255 | policies, values = self.model.evaluate(self.states) 256 | actions = fastsample(policies) 257 | next_states, rewards, dones, infos = self.env.step(actions) 258 | rollout.append((self.states, actions, rewards, values, policies, dones)) 259 | self.states = next_states 260 | 261 | states, actions, rewards, values, policies, dones = stack_many(*zip(*rollout)) 262 | policy, last_values, = self.model.evaluate(next_states) 263 | return states, actions, rewards, values, last_values, policies, dones 264 | 265 | 266 | def main(env_id): 267 | num_envs = 32 268 | nsteps = 128 269 | 270 | classic_list = ['MountainCar-v0', 'Acrobot-v1', 'LunarLander-v2', 'CartPole-v0', 'CartPole-v1'] 271 | if any(env_id in s for s in classic_list): 272 | print('Classic Control') 273 | val_envs = [gym.make(env_id) for i in range(10)] 274 | envs = BatchEnv(DummyEnv, env_id, num_envs, blocking=False) 275 | 276 | elif 'ApplePicker' in env_id: 277 | print('ApplePicker') 278 | make_args = {'num_objects':300, 'default_reward':0} 279 | if 'Deterministic' in env_id: 280 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, 
max_steps=5000, auto_reset=True, k=4, grey_scale=True, make_args=make_args) 281 | val_envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=False, k=4, grey_scale=True, make_args=make_args) 282 | for i in range(len(envs)): 283 | val_envs.envs[i].set_locs(envs.envs[i].item_locs_master, envs.envs[i].start_loc) 284 | val_envs.reset() 285 | else: 286 | #val_envs = [apple_pickgame(gym.make(env_id), max_steps=5000, auto_reset=False, k=1) for i in range(16)] 287 | val_envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=False, k=4, grey_scale=True) 288 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=True, k=4, grey_scale=True) 289 | print(val_envs.envs[0]) 290 | print(envs.envs[0]) 291 | 292 | else: 293 | print('Atari') 294 | env = gym.make(env_id) 295 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 296 | reset = True 297 | print('fire on reset') 298 | else: 299 | reset = False 300 | print('only stack frames') 301 | env.close() 302 | val_envs = [AtariEnv(gym.make(env_id), k=4, episodic=False, reset=reset, clip_reward=False) for i in range(16)] 303 | envs = BatchEnv(AtariEnv, env_id, num_envs, blocking=False, k=4, reset=reset, episodic=False, clip_reward=True) 304 | 305 | 306 | action_size = val_envs.envs[0].action_space.n 307 | input_size = val_envs.envs[0].reset().shape 308 | 309 | 310 | current_time = datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') 311 | train_log_dir = 'logs/PPO/' + env_id + '/Adam/' + current_time 312 | model_dir = "models/PPO/" + env_id + '/' + current_time 313 | 314 | 315 | model = DAAC(policy_model=NatureCNN, 316 | value_model=NatureCNN, 317 | input_shape=input_size, 318 | action_size=action_size, 319 | lr=5e-4, 320 | lr_final=1e-5, 321 | decay_steps=200e6//(num_envs*nsteps), 322 | grad_clip=0.5, 323 | adv_coeff=0.25, 324 | entropy_coeff=0.01, 325 | policy_clip=0.1, 326 | device='cuda' 327 | ) 328 | 329 | 330 | daac = DAACTrainer(envs=envs, 331 | model=model, 332 | model_dir=model_dir, 333 | log_dir=train_log_dir, 334 | val_envs=val_envs, 335 | train_mode='nstep', 336 | total_steps=200e6, 337 | nsteps=nsteps, 338 | policy_epochs=1, 339 | value_epochs=1, 340 | num_minibatches=8, 341 | validate_freq=1e5, 342 | save_freq=0, 343 | render_freq=0, 344 | num_val_episodes=32, 345 | log_scalars=False) 346 | daac.train() 347 | 348 | 349 | if __name__ == "__main__": 350 | import apple_picker 351 | #env_id_list = ['SpaceInvadersDeterministic-v4', 'FreewayDeterministic-v4']# 'SpaceInvadersDeterministic-v4',]# , ] 352 | #env_id_list = ['MountainCar-v0', 'Acrobot-v1', 'CartPole-v1', ] 353 | env_id_list = ['ApplePickerDeterministic-v0'] 354 | for env_id in env_id_list: 355 | main(env_id) -------------------------------------------------------------------------------- /rlib/Unreal/UnrealA2C2.py: -------------------------------------------------------------------------------- 1 | from numpy.core.fromnumeric import size 2 | import torch 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import gym 6 | import os, time, datetime 7 | 8 | from rlib.utils.utils import fastsample, fold_batch, one_hot, RunningMeanStd, stack_many, totorch, totorch_many, tonumpy, GAE 9 | from rlib.utils.schedulers import polynomial_sheduler 10 | from collections import deque 11 | from rlib.networks.networks import* 12 | from rlib.utils.SyncMultiEnvTrainer import SyncMultiEnvTrainer 13 | from rlib.utils.VecEnv import* 14 | from rlib.utils.wrappers import* 15 | 16 | from rlib.A2C.ActorCritic import 
ActorCritic 17 | 18 | # A2C-CNN version of Unsupervised Reinforcement Learning with Auxiliary Tasks (UNREAL) https://arxiv.org/abs/1611.05397 19 | # Modifications: 20 | # no action-reward fed into policy 21 | # Use greyscaled images 22 | # deconvolute to pixel grid that overlaps FULL image 23 | # Generalised Advantage Estimation 24 | # Assumes input image size is 84x84 25 | 26 | #torch.backends.cudnn.benchmark=True 27 | 28 | def sign(x): 29 | if x < 0: 30 | return 2 31 | elif x == 0: 32 | return 0 33 | elif x > 0: 34 | return 1 35 | else: 36 | raise ValueError 37 | 38 | class UnrealA2C2(torch.nn.Module): 39 | def __init__(self, policy_model, input_shape, action_size, pixel_control=True, RP=1.0, PC=1.0, VR=1.0, entropy_coeff=0.001, value_coeff=0.5, 40 | lr=1e-3, lr_final=1e-4, decay_steps=50e6, grad_clip=0.5, policy_args={}, optim=torch.optim.RMSprop, device='cuda', optim_args={}): 41 | super(UnrealA2C2, self).__init__() 42 | self.RP, self.PC, self.VR = RP, PC, VR 43 | self.lr = lr 44 | self.entropy_coeff, self.value_coeff = entropy_coeff, value_coeff 45 | self.pixel_control = pixel_control 46 | self.grad_clip = grad_clip 47 | self.action_size = action_size 48 | self.device = device 49 | 50 | try: 51 | iterator = iter(input_shape) 52 | except TypeError: 53 | input_size = (input_shape,) 54 | 55 | self.policy = ActorCritic(policy_model, input_shape, action_size, entropy_coeff=entropy_coeff, value_coeff=value_coeff, 56 | build_optimiser=False, device=device, **policy_args) 57 | 58 | 59 | 60 | if pixel_control: 61 | self.feat_map = torch.nn.Sequential(torch.nn.Linear(self.policy.dense_size, 32*8*8), torch.nn.ReLU()).to(device) 62 | self.deconv1 = torch.nn.Sequential(torch.nn.ConvTranspose2d(32, 32, kernel_size=[3,3], stride=[1,1]), torch.nn.ReLU()).to(device) 63 | self.deconv_advantage = torch.nn.ConvTranspose2d(32, action_size, kernel_size=[3,3], stride=[2,2]).to(device) 64 | self.deconv_value = torch.nn.ConvTranspose2d(32, 1, kernel_size=[3,3], stride=[2,2]).to(device) 65 | 66 | # reward model 67 | self.r1 = torch.nn.Sequential(torch.nn.Linear(self.policy.dense_size, 128), torch.nn.ReLU()).to(device) 68 | self.r2 = torch.nn.Linear(128, 3).to(device) 69 | 70 | self.optimiser = optim(self.parameters(), lr, **optim_args) 71 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 72 | 73 | def forward(self, state): 74 | return self.policy.forward(state) 75 | 76 | def evaluate(self, state): 77 | return self.policy.evaluate(state) 78 | 79 | def Qaux(self, enc_state): 80 | # Auxillary Q value calculated via dueling network 81 | # Z. Wang, N. de Freitas, and M. Lanctot. Dueling Network Architectures for Deep ReinforcementLearning. 
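# --- illustrative aside (toy tensors): the dueling aggregation used by Qaux below ---
# the pixel-control head combines a per-cell value map with per-action advantage
# maps as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a), following the paper cited on
# the next line. A standalone equivalent, assuming 6 actions over a 21x21 grid:
import torch

value = torch.randn(1, 1, 21, 21)                    # V(s) per spatial cell
adv = torch.randn(1, 6, 21, 21)                      # A(s, a) per action per cell
qaux = value + adv - adv.mean(dim=1, keepdim=True)   # -> (1, 6, 21, 21)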
https://arxiv.org/pdf/1511.06581.pdf 82 | batch_size = enc_state.shape[0] 83 | feat_map = self.feat_map(enc_state).view([batch_size,32,8,8]) 84 | deconv1 = self.deconv1(feat_map) 85 | deconv_adv = self.deconv_advantage(deconv1) 86 | deconv_value = self.deconv_value(deconv1) 87 | qaux = deconv_value + deconv_adv - torch.mean(deconv_adv, dim=1, keepdim=True) 88 | return qaux 89 | 90 | def get_pixel_control(self, state:np.ndarray): 91 | with torch.no_grad(): 92 | enc_state = self.policy.model(totorch(state, self.device)) 93 | Qaux = self.Qaux(enc_state) 94 | return tonumpy(Qaux) 95 | 96 | def pixel_loss(self, Qaux, Qaux_actions, Qaux_target): 97 | # Qaux_target temporal difference target for Q_aux 98 | #print('max qaux actions', Qaux_actions) 99 | #print('action_size', self.action_size) 100 | one_hot_actions = F.one_hot(Qaux_actions.long(), self.action_size) 101 | pixel_action = one_hot_actions.view([-1,self.action_size,1,1]) 102 | Q_aux_action = torch.sum(Qaux * pixel_action, dim=1) 103 | pixel_loss = 0.5 * torch.mean(torch.square(Qaux_target - Q_aux_action)) # l2 loss for Q_aux over all pixels and batch 104 | return pixel_loss 105 | 106 | def reward_loss(self, reward_states, reward_target): 107 | r1 = self.r1(self.policy.model(reward_states)) 108 | pred_reward = self.r2(r1) 109 | reward_loss = torch.mean(F.cross_entropy(pred_reward, reward_target.long())) # cross entropy over caterogical reward 110 | return reward_loss 111 | 112 | def replay_loss(self, R, V): 113 | return torch.mean(torch.square(R - V)) 114 | 115 | def forward_loss(self, states, R, actions): 116 | states, R, actions = totorch_many(states, R, actions, device=self.device) 117 | actions_onehot = F.one_hot(actions.long(), num_classes=self.action_size) 118 | policies, values = self.forward(states) 119 | forward_loss = self.policy.loss(policies, R, values, actions_onehot) 120 | return forward_loss 121 | 122 | def auxiliary_loss(self, reward_states, rewards, Qaux_target, Qaux_actions, replay_states, replay_R): 123 | reward_states, rewards, Qaux_target, Qaux_actions, replay_states, replay_R = totorch_many(reward_states, rewards, Qaux_target, 124 | Qaux_actions, replay_states, replay_R, device=self.device) 125 | 126 | policy_enc = self.policy.model(replay_states) 127 | replay_values = self.policy.V(policy_enc) 128 | reward_loss = self.reward_loss(reward_states, rewards) 129 | replay_loss = self.replay_loss(replay_R, replay_values) 130 | aux_loss = self.RP * reward_loss + self.VR * replay_loss 131 | 132 | Qaux_actions = Qaux_actions.long() 133 | 134 | if self.pixel_control: 135 | Qaux = self.Qaux(policy_enc) 136 | pixel_loss = self.pixel_loss(Qaux, Qaux_actions, Qaux_target) 137 | aux_loss += self.PC * pixel_loss 138 | 139 | return aux_loss 140 | 141 | def backprop(self, states, R, actions, reward_states, rewards, Qaux_target, Qaux_actions, replay_states, replay_R): 142 | forward_loss = self.forward_loss(states, R, actions) 143 | aux_losses = self.auxiliary_loss(reward_states, rewards, Qaux_target, Qaux_actions, replay_states, replay_R) 144 | 145 | loss = forward_loss + aux_losses 146 | loss.backward() 147 | if self.grad_clip is not None: 148 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 149 | self.optimiser.step() 150 | self.optimiser.zero_grad() 151 | self.scheduler.step() 152 | return loss.detach().cpu().numpy() 153 | 154 | 155 | 156 | 157 | 158 | class UnrealTrainer(SyncMultiEnvTrainer): 159 | def __init__(self, envs, model, val_envs, train_mode='nstep', log_dir='logs/UnrealA2C2', 
model_dir='models/UnrealA2C2', total_steps=1000000, nsteps=5, 160 | normalise_obs=True, validate_freq=1000000, save_freq=0, render_freq=0, num_val_episodes=50, replay_length=2000, max_val_steps=10000, log_scalars=True): 161 | 162 | super().__init__(envs, model, val_envs, train_mode=train_mode, log_dir=log_dir, model_dir=model_dir, total_steps=total_steps, nsteps=nsteps, validate_freq=validate_freq, 163 | save_freq=save_freq, render_freq=render_freq, update_target_freq=0, num_val_episodes=num_val_episodes, max_val_steps=max_val_steps, log_scalars=log_scalars) 164 | 165 | self.replay = deque([], maxlen=replay_length) #replay length per actor 166 | self.action_size = self.model.action_size 167 | 168 | hyper_paras = {'learning_rate':model.lr, 'grad_clip':model.grad_clip, 'nsteps':nsteps, 'num_workers':self.num_envs, 169 | 'total_steps':self.total_steps, 'entropy_coefficient':model.entropy_coeff, 'value_coefficient':model.value_coeff, 170 | 'gamma':self.gamma, 'lambda':self.lambda_} 171 | 172 | if log_scalars: 173 | filename = log_dir + '/hyperparameters.txt' 174 | self.save_hyperparameters(filename, **hyper_paras) 175 | 176 | self.normalise_obs = normalise_obs 177 | 178 | if self.normalise_obs: 179 | self.obs_running = RunningMeanStd() 180 | self.state_mean = np.zeros_like(self.states) 181 | self.state_std = np.ones_like(self.states) 182 | self.aux_reward_rolling = RunningMeanStd() 183 | 184 | def populate_memory(self): 185 | for t in range(2000//self.nsteps): 186 | states, *_ = self.rollout() 187 | #self.state_mean, self.state_std = self.obs_running.update(fold_batch(states)[...,-1:]) 188 | self.update_minmax(states) 189 | 190 | 191 | def update_minmax(self, obs): 192 | minima = obs.min() 193 | maxima = obs.max() 194 | if minima < self.state_min: 195 | self.state_min = minima 196 | if maxima > self.state_max: 197 | self.state_max = maxima 198 | 199 | def norm_obs(self, obs): 200 | ''' normalise pixel intensity changes by recording min and max pixel observations 201 | not using per pixel normalisation because expected image is singular greyscale frame 202 | ''' 203 | return (obs - self.state_min) * (1/(self.state_max - self.state_min)) 204 | 205 | def auxiliary_target(self, pixel_rewards, last_values, dones): 206 | T = len(pixel_rewards) 207 | R = np.zeros((T,*last_values.shape)) 208 | dones = dones[:,:,np.newaxis,np.newaxis] 209 | R[-1] = last_values * (1-dones[-1]) 210 | 211 | for i in reversed(range(T-1)): 212 | # restart score if done as BatchEnv automatically resets after end of episode 213 | R[i] = pixel_rewards[i] + 0.99 * R[i+1] * (1-dones[-1]) 214 | 215 | return R 216 | 217 | def pixel_rewards(self, prev_state, states): 218 | # states of rank [T, B, channels, 84, 84] 219 | T = len(states) # time length 220 | B = states.shape[1] # batch size 221 | pixel_rewards = np.zeros((T,B,21,21)) 222 | states = states[:,:,-1,:,:] 223 | prev_state = prev_state[:,-1,:,:] 224 | if self.normalise_obs: 225 | states = self.norm_obs(states) 226 | #print('states, max', states.max(), 'min', states.min(), 'mean', states.mean()) 227 | prev_state = self.norm_obs(prev_state) 228 | 229 | pixel_rewards[0] = np.abs(states[0] - prev_state).reshape(-1,4,4,21,21).mean(axis=(1,2)) 230 | for i in range(1,T): 231 | pixel_rewards[i] = np.abs(states[i] - states[i-1]).reshape(-1,4,4,21,21).mean(axis=(1,2)) 232 | #print('pixel reward',pixel_rewards.shape, 'max', pixel_rewards.max(), 'mean', pixel_rewards.mean()) 233 | return pixel_rewards 234 | 235 | def sample_replay(self): 236 | workers = np.random.choice(self.num_envs, 
replace=False, size=2)  # randomly sample experience from 2 of the n workers
237 |         sample_start = np.random.randint(1, len(self.replay) - self.nsteps - 2)
238 |         replay_sample = []
239 |         for i in range(sample_start, sample_start+self.nsteps):
240 |             replay_sample.append(self.replay[i])
241 | 
242 |         replay_states = np.stack([replay_sample[i][0][workers] for i in range(len(replay_sample))])
243 |         replay_actions = np.stack([replay_sample[i][1][workers] for i in range(len(replay_sample))])
244 |         replay_rewards = np.stack([replay_sample[i][2][workers] for i in range(len(replay_sample))])
245 |         replay_values = np.stack([replay_sample[i][3][workers] for i in range(len(replay_sample))])
246 |         replay_dones = np.stack([replay_sample[i][4][workers] for i in range(len(replay_sample))])
247 |         #print('replay dones shape', replay_dones.shape)
248 |         #print('replay_values shape', replay_values.shape)
249 | 
250 |         next_state = self.replay[sample_start+self.nsteps][0][workers]  # state following the sampled segment
251 |         _, replay_last_values = self.model.evaluate(next_state)
252 |         replay_R = GAE(replay_rewards, replay_values, replay_last_values, replay_dones, gamma=0.99, lambda_=0.95) + replay_values
253 | 
254 |         if self.model.pixel_control:
255 |             prev_states = self.replay[sample_start-1][0][workers]
256 |             Qaux_value = self.model.get_pixel_control(next_state)
257 |             pixel_rewards = self.pixel_rewards(prev_states, replay_states)
258 |             Qaux_target = self.auxiliary_target(pixel_rewards, np.max(Qaux_value, axis=1), replay_dones)
259 |         else:
260 |             Qaux_target = np.zeros((len(replay_states),1,1,1))  # produce a dummy Qaux target to avoid unnecessary branching downstream
261 | 
262 |         return fold_batch(replay_states), fold_batch(replay_actions), fold_batch(replay_R), fold_batch(Qaux_target), fold_batch(replay_dones)
263 |         #return replay_states, replay_actions, replay_R, Qaux_target, replay_dones
264 | 
265 |     def sample_reward(self):
266 |         # worker = np.random.randint(0,self.num_envs) # randomly sample from one of n workers
267 |         replay_rewards = np.array([self.replay[i][2] for i in range(len(self.replay))])
268 |         worker = np.argmax(np.sum(replay_rewards, axis=0))  # sample experience from the best-performing worker
269 |         nonzero_idxs = np.where(np.abs(replay_rewards) > 0)[0]  # timestep idxs where |reward| > 0
270 |         zero_idxs = np.where(replay_rewards == 0)[0]  # timestep idxs where reward == 0
271 | 
272 | 
273 |         if len(nonzero_idxs) == 0 or len(zero_idxs) == 0:  # if either set is empty, i.e. all rewards are zero or all are nonzero
274 |             idx = np.random.randint(len(replay_rewards))
275 |         elif np.random.uniform() > 0.5:  # otherwise sample zero and nonzero rewards equally often
276 |             #print('nonzero')
277 |             idx = np.random.choice(nonzero_idxs)
278 |         else:
279 |             idx = np.random.choice(zero_idxs)
280 | 
281 | 
282 |         reward_states = self.replay[idx][0][worker]
283 |         reward = np.array([sign(replay_rewards[idx,worker])])  # source of error
284 | 
285 |         return reward_states[None], reward
286 | 
287 |     def _train_nstep(self):
288 |         batch_size = self.num_envs * self.nsteps
289 |         num_updates = self.total_steps // batch_size
290 |         s = 0
291 |         self.state_min = 0
292 |         self.state_max = 0
293 |         self.populate_memory()
294 |         # main loop
295 |         start = time.time()
296 |         for t in range(1,num_updates+1):
297 |             states, actions, rewards, values, dones, last_values = self.rollout()
298 | 
299 |             # R = self.nstep_return(rewards, last_values, dones, clip=False)
300 |             R = GAE(rewards, values, last_values, dones, gamma=0.99, lambda_=0.95) + values
301 | 
302 |             # stack all states, actions and Rs across all workers into a single batch
303 |             states, actions, rewards, R = fold_batch(states), fold_batch(actions), fold_batch(rewards), fold_batch(R)
304 | 
305 |             #self.state_mean, self.state_std = self.obs_running.update(states[...,-1:]) # update state normalisation statistics
306 |             self.update_minmax(states)
307 | 
308 |             reward_states, sample_rewards = self.sample_reward()
309 |             replay_states, replay_actions, replay_R, Qaux_target, replay_dones = self.sample_replay()
310 | 
311 |             l = self.model.backprop(states, R, actions,
312 |                                     reward_states, sample_rewards, Qaux_target, replay_actions, replay_states, replay_R)
313 | 
314 |             if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
315 |                 render = True
316 |             else:
317 |                 render = False
318 | 
319 |             if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
320 |                 self.validation_summary(t,l,start,render)
321 |                 start = time.time()
322 | 
323 |             if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
324 |                 s += 1
325 |                 self.save(s)
326 |                 print('saved model')
327 | 
328 | 
329 |     def rollout(self):
330 |         rollout = []
331 |         for t in range(self.nsteps):
332 |             policies, values = self.model.evaluate(self.states)
333 |             # Qaux = self.model.get_pixel_control(self.states, self.prev_hidden, self.prev_actions_rewards[np.newaxis])
334 |             actions = fastsample(policies)
335 |             next_states, rewards, dones, infos = self.env.step(actions)
336 | 
337 |             rollout.append((self.states, actions, rewards, values, dones))
338 |             self.replay.append((self.states, actions, rewards, values, dones))  # add to replay memory
339 |             self.states = next_states
340 | 
341 |         states, actions, rewards, values, dones = stack_many(*zip(*rollout))
342 |         _, last_values = self.model.evaluate(next_states)
343 |         return states, actions, rewards, values, dones, last_values
344 | 
345 |     def get_action(self, state):
346 |         policy, value = self.model.evaluate(state)
347 |         action = int(np.random.choice(policy.shape[1], p=policy[0]))
348 |         return action
349 | 
350 | 
351 | def main(env_id):
352 |     num_envs = 32
353 |     nsteps = 20
354 | 
355 | 
356 | 
357 | 
358 |     classic_list = ['MountainCar-v0', 'Acrobot-v1', 'LunarLander-v2', 'CartPole-v0', 'CartPole-v1']
359 |     if any(env_id in s for s in classic_list):
360 |         print('Classic Control')
361 |         val_envs = [gym.make(env_id) for i in range(16)]
362 |         envs = BatchEnv(DummyEnv, env_id, num_envs, blocking=False)
363 | 
364 |     elif 'ApplePicker' in env_id:
365 |
print('ApplePicker') 366 | make_args = {'num_objects':300, 'default_reward':0} 367 | val_envs = [apple_pickgame(gym.make(env_id, **make_args), max_steps=5000, auto_reset=False, grey_scale=False, k=1) for i in range(15)] 368 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=True, grey_scale=False, k=1, make_args=make_args) 369 | print(val_envs[0]) 370 | print(envs.envs[0]) 371 | 372 | else: 373 | print('Atari') 374 | env = gym.make(env_id) 375 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 376 | reset = True 377 | print('fire on reset') 378 | else: 379 | reset = False 380 | print('only stack frames') 381 | env.close() 382 | val_envs = [AtariEnv(gym.make(env_id), k=4, episodic=False, reset=reset, clip_reward=False) for i in range(15)] 383 | envs = BatchEnv(AtariEnv, env_id, num_envs, blocking=False, k=4, reset=reset, episodic=False, clip_reward=True, time_limit=4500) 384 | 385 | 386 | 387 | action_size = val_envs[0].action_space.n 388 | input_size = val_envs[0].reset().shape 389 | 390 | 391 | current_time = datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') 392 | train_log_dir = 'logs/UnrealA2C2/' + env_id + '/' + current_time 393 | model_dir = "models/UnrealA2C2/" + env_id + '/' + current_time 394 | 395 | 396 | 397 | model = UnrealA2C2(UniverseCNN, 398 | input_shape=input_size, 399 | action_size=action_size, 400 | PC=1, 401 | entropy_coeff=0.01, 402 | lr=1e-3, 403 | lr_final=1e-6, 404 | decay_steps=50e6//(num_envs*nsteps), 405 | pixel_control=True, 406 | grad_clip=0.5, 407 | policy_args=dict(), 408 | ).cuda() 409 | 410 | 411 | 412 | auxiliary = UnrealTrainer(envs=envs, 413 | model=model, 414 | model_dir=model_dir, 415 | log_dir=train_log_dir, 416 | val_envs=val_envs, 417 | train_mode='nstep', 418 | total_steps=50e6, 419 | nsteps=nsteps, 420 | normalise_obs=True, 421 | validate_freq=5e5, 422 | save_freq=0, 423 | render_freq=0, 424 | num_val_episodes=15, 425 | log_scalars=True) 426 | 427 | 428 | 429 | 430 | 431 | auxiliary.train() 432 | 433 | del auxiliary 434 | 435 | 436 | if __name__ == "__main__": 437 | import apple_picker 438 | env_id_list = ['SpaceInvadersDeterministic-v4', 'MontezumaRevengeDeterministic-v4' 'FreewayDeterministic-v4', 'PongDeterministic-v4' ] 439 | #env_id_list = ['MountainCar-v0','CartPole-v1', 'Acrobot-v1'] 440 | env_id_list = ['ApplePicker-v0'] 441 | for env_id in env_id_list: 442 | main(env_id) 443 | -------------------------------------------------------------------------------- /rlib/RND/RND.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import time, datetime 4 | import torch 5 | import torch.nn.functional as F 6 | from rlib.utils.utils import fold_batch, one_hot, Welfords_algorithm, stack_many, RunningMeanStd, tonumpy_many 7 | 8 | from rlib.networks.networks import * 9 | from rlib.utils.VecEnv import* 10 | from rlib.utils.wrappers import* 11 | from rlib.utils.SyncMultiEnvTrainer import SyncMultiEnvTrainer 12 | from rlib.utils.utils import fastsample, fold_batch, tonumpy, totorch, totorch_many, stack_many, fold_many 13 | from rlib.utils.schedulers import polynomial_sheduler 14 | 15 | 16 | class RewardForwardFilter(object): 17 | # https://github.com/openai/random-network-distillation 18 | def __init__(self, gamma): 19 | self.rewems = None 20 | self.gamma = gamma 21 | def update(self, rews): 22 | if self.rewems is None: 23 | self.rewems = rews 24 | else: 25 | self.rewems = self.rewems * self.gamma + rews 26 | return self.rewems 27 | 28 | 29 | 
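# A minimal usage sketch (illustrative, not part of the original module) of how RewardForwardFilter is
# combined with RunningMeanStd to normalise intrinsic rewards, mirroring RNDTrainer._train_nstep below
# and assuming RunningMeanStd.update returns (mean, std) as it is used in this file:
#
#   rff = RewardForwardFilter(gamma=0.99)
#   intr_rolling = RunningMeanStd()
#   for intr_rewards_t in intr_rewards:                        # each entry has shape [num_envs]
#       _, std = intr_rolling.update(rff.update(intr_rewards_t))
#   normalised_intr_rewards = intr_rewards / std               # divide by running std of the discounted intrinsic return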
30 | class PPOIntrinsic(torch.nn.Module):
31 |     def __init__(self, model, input_size, action_size, lr=1e-3, lr_final=0, decay_steps=6e5, grad_clip=0.5,
32 |                  entropy_coeff=0.01, policy_clip=0.1, extr_coeff=2.0, intr_coeff=1.0,
33 |                  build_optimiser=True, optim=torch.optim.Adam, optim_args={}, device='cuda', **model_args):
34 |         super(PPOIntrinsic, self).__init__()
35 |         self.action_size = action_size
36 |         self.input_size = input_size
37 | 
38 |         self.lr = lr
39 |         self.lr_final = lr_final
40 |         self.decay_steps = decay_steps
41 |         self.grad_clip = grad_clip
42 | 
43 |         self.entropy_coeff = entropy_coeff
44 |         self.policy_clip = policy_clip
45 |         self.extr_coeff = extr_coeff
46 |         self.intr_coeff = intr_coeff
47 | 
48 |         self.device = device
49 | 
50 |         self.model = model(input_size, **model_args).to(self.device)
51 |         self.dense_size = dense_size = self.model.dense_size
52 |         self.policy = torch.nn.Sequential(torch.nn.Linear(dense_size, action_size), torch.nn.Softmax(dim=-1)).to(self.device)  # Actor
53 |         self.Ve = torch.nn.Linear(dense_size, 1).to(self.device)  # Critic (extrinsic), i.e. expected extrinsic return of a state
54 |         self.Vi = torch.nn.Linear(dense_size, 1).to(self.device)  # Critic (intrinsic), i.e. expected intrinsic return of a state
55 | 
56 |         if build_optimiser:
57 |             self.optimiser = optim(self.parameters(), lr, **optim_args)
58 |             self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1)
59 | 
60 | 
61 |     def forward(self, state):
62 |         state_enc = self.model(state)
63 |         policy = self.policy(state_enc)
64 |         value_extr = self.Ve(state_enc).view(-1)
65 |         value_intr = self.Vi(state_enc).view(-1)
66 |         return policy, value_extr, value_intr
67 | 
68 |     def evaluate(self, state):
69 |         with torch.no_grad():
70 |             policy, value_extr, value_intr = self.forward(totorch(state, self.device))
71 |         return tonumpy(policy), tonumpy(value_extr), tonumpy(value_intr)
72 | 
73 | 
74 |     def loss(self, policy, Re, Ri, Ve, Vi, Adv, action_onehot, old_policy):
75 |         extr_value_loss = 0.5 * torch.mean(torch.square(Re - Ve))
76 |         intr_value_loss = 0.5 * torch.mean(torch.square(Ri - Vi))
77 | 
78 |         policy_actions = torch.sum(policy * action_onehot, dim=1)
79 |         old_policy_actions = torch.sum(old_policy * action_onehot, dim=1)
80 |         ratio = policy_actions / old_policy_actions
81 |         policy_loss_unclipped = ratio * -Adv
82 |         policy_loss_clipped = torch.clamp(ratio, 1 - self.policy_clip, 1 + self.policy_clip) * -Adv
83 |         policy_loss = torch.mean(torch.maximum(policy_loss_unclipped, policy_loss_clipped))
84 |         entropy = torch.mean(torch.sum(policy * -torch.log(policy), dim=1))
85 | 
86 |         value_loss = self.extr_coeff * extr_value_loss + self.intr_coeff * intr_value_loss
87 |         loss = policy_loss + value_loss - self.entropy_coeff * entropy
88 |         return loss
89 | 
90 |     def backprop(self, state, Re, Ri, Adv, action, old_policy):
91 |         state, action, Re, Ri, Adv, old_policy = totorch_many(state, action, Re, Ri, Adv, old_policy, device=self.device)
92 |         action_onehot = F.one_hot(action.long(), self.action_size)
93 |         policy, Ve, Vi = self.forward(state)
94 |         loss = self.loss(policy, Re, Ri, Ve, Vi, Adv, action_onehot, old_policy)
95 | 
96 |         loss.backward()
97 |         if self.grad_clip is not None:
98 |             torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip)
99 |         self.optimiser.step()
100 |         self.optimiser.zero_grad()
101 |         self.scheduler.step()
102 |         return loss.detach().cpu().numpy()
103 | 
104 | 
105 | class PredictorCNN(torch.nn.Module):
106 |     def __init__(self, input_size, conv1_size=32, conv2_size=64, conv3_size=64, dense_size=512, padding=[0,0], init_scale=np.sqrt(2),
scale=True, trainable=True): 107 | # input_shape [channels, height, width] 108 | super(PredictorCNN, self).__init__() 109 | self.scale = scale 110 | self.dense_size = dense_size 111 | self.input_size = input_size 112 | self.init_scale = init_scale 113 | self.h1 = torch.nn.Sequential(torch.nn.Conv2d(input_size[0], conv1_size, kernel_size=[8,8], stride=[4,4], padding=padding), torch.nn.LeakyReLU()) 114 | self.h2 = torch.nn.Sequential(torch.nn.Conv2d(conv1_size, conv2_size, kernel_size=[4,4], stride=[2,2], padding=padding), torch.nn.LeakyReLU()) 115 | self.h3 = torch.nn.Sequential(torch.nn.Conv2d(conv2_size, conv3_size, kernel_size=[3,3], stride=[1,1], padding=padding), torch.nn.LeakyReLU()) 116 | self.flatten = torch.nn.Flatten() 117 | c, h, w = self._conv_outsize() 118 | in_size = h*w*c 119 | if trainable: 120 | self.dense = torch.nn.Sequential( 121 | torch.nn.Linear(h*w*c, dense_size), torch.nn.ReLU(), 122 | torch.nn.Linear(dense_size, dense_size), torch.nn.ReLU(), 123 | torch.nn.Linear(dense_size, dense_size) 124 | ) 125 | else: 126 | self.dense = torch.nn.Linear(h*w*c, dense_size) 127 | 128 | self.init_weights() 129 | self.set_trainable(trainable) 130 | 131 | def set_trainable(self, trainable): 132 | if not trainable: 133 | for param in self.parameters(): 134 | param.requires_grad = False 135 | 136 | def init_weights(self): 137 | self.apply(self._init_weights) 138 | 139 | def _init_weights(self, module): 140 | if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)): 141 | torch.nn.init.orthogonal_(module.weight, gain=self.init_scale) 142 | 143 | def _conv_outsize(self): 144 | _, h, w = self.input_size 145 | h, w = conv2d_outsize(h, w, self.h1[0].kernel_size, self.h1[0].stride, self.h1[0].padding) 146 | h, w = conv2d_outsize(h, w, self.h2[0].kernel_size, self.h2[0].stride, self.h2[0].padding) 147 | h, w = conv2d_outsize(h, w, self.h3[0].kernel_size, self.h3[0].stride, self.h3[0].padding) 148 | return self.h3[0].out_channels, h, w 149 | 150 | def forward(self, x): 151 | x = x/255 if self.scale else x 152 | x = self.h1(x) 153 | x = self.h2(x) 154 | x = self.h3(x) 155 | x = self.flatten(x) 156 | x = self.dense(x) 157 | return x 158 | 159 | class PredictorMLP(torch.nn.Module): 160 | def __init__(self, input_size, num_layers=2, dense_size=64, activation=torch.nn.LeakyReLU, init_scale=np.sqrt(2), trainable=True): 161 | # input_shape = feature_size 162 | super(PredictorMLP, self).__init__() 163 | self.dense_size = dense_size 164 | self.input_size = input_size 165 | self.init_scale = init_scale 166 | layers = [] 167 | in_size = input_size 168 | for l in range(num_layers): 169 | layers.append(torch.nn.Linear(in_size, dense_size)) 170 | layers.append(activation()) 171 | in_size = dense_size 172 | layers.append(torch.nn.Linear(dense_size, dense_size)) 173 | self.layers = torch.nn.ModuleList(layers) 174 | 175 | self.init_weights() 176 | self.set_trainable(trainable) 177 | 178 | def set_trainable(self, trainable): 179 | if not trainable: 180 | for param in self.parameters(): 181 | param.requires_grad = False 182 | 183 | def init_weights(self): 184 | self.apply(self._init_weights) 185 | 186 | def _init_weights(self, module): 187 | if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)): 188 | torch.nn.init.orthogonal_(module.weight, gain=self.init_scale) 189 | 190 | def forward(self, x): 191 | for layer in self.layers: 192 | x = layer(x) 193 | return x 194 | 195 | class RND(torch.nn.Module): 196 | # EXPLORATION BY RANDOM NETWORK DISTILLATION 197 | # https://arxiv.org/pdf/1810.12894.pdf 198 | def 
__init__(self, policy_model, target_model, input_size, action_size, entropy_coeff=0.001, 199 | intr_coeff=0.5, extr_coeff=1.0, lr=1e-4, lr_final=0, decay_steps=1e5, grad_clip=0.5, policy_clip=0.1, policy_args={}, RND_args={}, optim=torch.optim.Adam, optim_args={}, device='cuda'): 200 | super(RND, self).__init__() 201 | self.intr_coeff = intr_coeff 202 | self.extr_coeff = extr_coeff 203 | self.entropy_coeff = entropy_coeff 204 | self.lr = lr 205 | self.grad_clip = grad_clip 206 | self.action_size = action_size 207 | self.device = device 208 | 209 | target_size = (1, input_size[1], input_size[2]) if len(input_size) == 3 else input_size # only use last frame in frame-stack for convolutions 210 | 211 | self.policy = PPOIntrinsic(policy_model, input_size, action_size, lr, lr_final, decay_steps, grad_clip, 212 | entropy_coeff=entropy_coeff, policy_clip=policy_clip, extr_coeff=extr_coeff, intr_coeff=intr_coeff, device=device, build_optimiser=False, **policy_args) 213 | 214 | # randomly weighted and fixed neural network, acts as a random_id for each state 215 | self.target_model = target_model(target_size, trainable=False).to(device) 216 | 217 | # learns to predict target model 218 | # i.e. provides rewards based ability to predict a fixed random function, thus behaves as density map of explored areas 219 | self.predictor_model = target_model(target_size, trainable=True).to(device) 220 | 221 | self.optimiser = optim(self.parameters(), lr, **optim_args) 222 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 223 | 224 | 225 | def forward(self, state): 226 | return self.policy.forward(state) 227 | 228 | def evaluate(self, state): 229 | return self.policy.evaluate(state) 230 | 231 | def _intr_reward(self, next_state, state_mean, state_std): 232 | norm_next_state = torch.clip((next_state - state_mean) / state_std, -5, 5) 233 | intr_reward = torch.square(self.predictor_model(norm_next_state) - self.target_model(norm_next_state).detach()).sum(dim=-1) 234 | return intr_reward 235 | 236 | def intrinsic_reward(self, next_state:np.ndarray, state_mean:np.ndarray, state_std): 237 | next_state, state_mean, state_std = totorch_many(next_state, state_mean, state_std, device=self.device) 238 | with torch.no_grad(): 239 | intr_reward = self._intr_reward(next_state, state_mean, state_std) 240 | return tonumpy(intr_reward) 241 | 242 | 243 | def backprop(self, state, next_state, R_extr, R_intr, Adv, actions, old_policy, state_mean, state_std): 244 | state, next_state, R_extr, R_intr, Adv, actions, old_policy, state_mean, state_std = totorch_many(state, next_state, R_extr, R_intr, 245 | Adv, actions, old_policy, state_mean, state_std, device=self.device) 246 | policy, Ve, Vi = self.policy.forward(state) 247 | actions_onehot = F.one_hot(actions.long(), self.action_size) 248 | policy_loss = self.policy.loss(policy, R_extr, R_intr, Ve, Vi, Adv, actions_onehot, old_policy) 249 | 250 | predictor_loss = self._intr_reward(next_state, state_mean, state_std).mean() 251 | loss = policy_loss + predictor_loss 252 | 253 | loss.backward() 254 | if self.grad_clip is not None: 255 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 256 | self.optimiser.step() 257 | self.optimiser.zero_grad() 258 | self.scheduler.step() 259 | return loss.detach().cpu().numpy() 260 | 261 | 262 | class RNDTrainer(SyncMultiEnvTrainer): 263 | def __init__(self, envs, model, val_envs, train_mode='nstep', log_dir='logs/', model_dir='models/', total_steps=1000000, nsteps=5, gamma_extr=0.999, gamma_intr=0.99, 
lambda_=0.95, 264 | init_obs_steps=600, num_epochs=4, num_minibatches=4, validate_freq=1000000.0, save_freq=0, render_freq=0, num_val_episodes=50, max_val_steps=10000, log_scalars=True): 265 | 266 | super().__init__(envs, model, val_envs, train_mode=train_mode, log_dir=log_dir, model_dir=model_dir, total_steps=total_steps, nsteps=nsteps, gamma=gamma_extr, lambda_=lambda_, 267 | validate_freq=validate_freq, save_freq=save_freq, render_freq=render_freq, update_target_freq=0, num_val_episodes=num_val_episodes, max_val_steps=max_val_steps, log_scalars=log_scalars) 268 | 269 | self.gamma_intr = gamma_intr 270 | self.num_epochs = num_epochs 271 | self.num_minibatches = num_minibatches 272 | self.pred_prob = 1 / (self.num_envs / 32.0) 273 | self.state_obs = RunningMeanStd() 274 | self.forward_filter = RewardForwardFilter(gamma_intr) 275 | self.intr_rolling = RunningMeanStd() 276 | self.init_obs_steps = init_obs_steps 277 | 278 | hyper_paras = {'learning_rate':model.lr, 'grad_clip':model.grad_clip, 'nsteps':self.nsteps, 'num_workers':self.num_envs, 'total_steps':self.total_steps, 279 | 'entropy_coefficient':0.001, 'value_coefficient':1.0, 'intrinsic_value_coefficient':model.intr_coeff, 280 | 'extrinsic_value_coefficient':model.extr_coeff, 'init_obs_steps':init_obs_steps, 'gamma_intrinsic':self.gamma_intr, 'gamma_extrinsic':self.gamma, 281 | 'lambda':self.lambda_, 'predictor_dropout_probability':self.pred_prob 282 | } 283 | 284 | if log_scalars: 285 | filename = log_dir + '/hyperparameters.txt' 286 | self.save_hyperparameters(filename, **hyper_paras) 287 | 288 | def init_state_obs(self, num_steps): 289 | states = 0 290 | for i in range(num_steps): 291 | rand_actions = np.random.randint(0, self.model.action_size, size=self.num_envs) 292 | next_states, rewards, dones, infos = self.env.step(rand_actions) 293 | next_states = next_states[:, -1] if len(next_states.shape) == 4 else next_states # [num_envs, channels, height, width] for convolutions, assume frame stack 294 | states += next_states 295 | return states / num_steps 296 | 297 | 298 | def _train_nstep(self): 299 | # stats for normalising states 300 | self.state_mean, self.state_std = self.state_obs.update(self.init_state_obs(self.init_obs_steps)) 301 | self.states = self.env.reset() # reset to state s_0 302 | 303 | batch_size = self.num_envs * self.nsteps 304 | num_updates = self.total_steps // batch_size 305 | s = 0 306 | mini_batch_size = self.nsteps//self.num_minibatches 307 | start = time.time() 308 | # main loop 309 | for t in range(1,num_updates+1): 310 | states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, last_values_extr, last_values_intr, old_policies, dones = self.rollout() 311 | self.state_mean, self.state_std = self.state_obs.update(next_states) # update state normalisation statistics 312 | mean, std = self.state_mean, self.state_std 313 | 314 | int_rff = np.array([self.forward_filter.update(intr_rewards[i]) for i in range(len(intr_rewards))]) 315 | R_intr_mean, R_intr_std = self.intr_rolling.update(int_rff.ravel()) # normalise intrinsic rewards 316 | intr_rewards /= R_intr_std 317 | 318 | 319 | Adv_extr = self.GAE(extr_rewards, values_extr, last_values_extr, dones, gamma=self.gamma, lambda_=self.lambda_) 320 | Adv_intr = self.GAE(intr_rewards, values_intr, last_values_intr, dones, gamma=self.gamma_intr, lambda_=self.lambda_) 321 | Re = Adv_extr + values_extr 322 | Ri = Adv_intr + values_intr 323 | total_Adv = Adv_extr + Adv_intr 324 | l = 0 325 | 326 | # perform minibatch gradient descent for K epochs 
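# The loop below shuffles indices over the time axis only (`states` has shape [nsteps, num_envs, ...]);
# each slice of nsteps//num_minibatches timesteps is folded across all workers with fold_many before the
# gradient step, and the predictor update only sees a random subset of next-states selected with
# probability self.pred_prob (the 'predictor_dropout_probability' hyperparameter logged above).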
327 | idxs = np.arange(len(states)) 328 | for epoch in range(self.num_epochs): 329 | np.random.shuffle(idxs) 330 | for batch in range(0, len(states), mini_batch_size): 331 | batch_idxs = idxs[batch: batch + mini_batch_size] 332 | # stack all states, actions and Rs across all workers into a single batch 333 | mb_states, mb_nextstates, mb_actions, mb_Re, mb_Ri, mb_Adv, mb_old_policies = fold_many(states[batch_idxs], next_states[batch_idxs], \ 334 | actions[batch_idxs], Re[batch_idxs], Ri[batch_idxs], \ 335 | total_Adv[batch_idxs], old_policies[batch_idxs]) 336 | 337 | mb_nextstates = mb_nextstates[np.where(np.random.uniform(size=(mini_batch_size)) < self.pred_prob)] 338 | l += self.model.backprop(mb_states.copy(), mb_nextstates.copy(), mb_Re.copy(), mb_Ri.copy(), mb_Adv.copy(), mb_actions.copy(), mb_old_policies.copy(), mean.copy(), std.copy()) 339 | 340 | 341 | l /= self.num_epochs 342 | 343 | 344 | if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0: 345 | render = True 346 | else: 347 | render = False 348 | 349 | if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0: 350 | self.validation_summary(t,l,start,render) 351 | start = time.time() 352 | 353 | if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0: 354 | s += 1 355 | self.save(s) 356 | print('saved model') 357 | 358 | 359 | def get_action(self, states): 360 | policies, values_extr, values_intr = self.model.evaluate(states) 361 | actions = fastsample(policies) 362 | return actions 363 | 364 | def rollout(self): 365 | rollout = [] 366 | for t in range(self.nsteps): 367 | policies, values_extr, values_intr = self.model.evaluate(self.states) 368 | actions = fastsample(policies) 369 | next_states, extr_rewards, dones, infos = self.env.step(actions) 370 | 371 | next_states__ = next_states[:, -1:] if len(next_states.shape) == 4 else next_states # [num_envs, channels, height, width] for convolutions 372 | intr_rewards = self.model.intrinsic_reward(next_states__, self.state_mean, self.state_std) 373 | 374 | rollout.append((self.states, next_states__, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones)) 375 | self.states = next_states 376 | 377 | states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones = stack_many(*zip(*rollout)) 378 | last_policy, last_values_extr, last_values_intr, = self.model.evaluate(self.states) 379 | return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, last_values_extr, last_values_intr, policies, dones 380 | 381 | 382 | def main(env_id): 383 | num_envs = 32 384 | nsteps = 128 385 | 386 | classic_list = ['MountainCar-v0', 'Acrobot-v1', 'LunarLander-v2', 'CartPole-v0', 'CartPole-v1'] 387 | if any(env_id in s for s in classic_list): 388 | print('Classic Control') 389 | val_envs = [gym.make(env_id) for i in range(10)] 390 | envs = BatchEnv(DummyEnv, env_id, num_envs, blocking=False) 391 | 392 | elif 'ApplePicker' in env_id: 393 | print('ApplePicker') 394 | make_args = {'num_objects':300, 'default_reward':0} 395 | if 'Deterministic' in env_id: 396 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=True, k=4, grey_scale=True, make_args=make_args) 397 | val_envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=False, k=4, grey_scale=True, make_args=make_args) 398 | for i in range(len(envs)): 399 | val_envs.envs[i].set_locs(envs.envs[i].item_locs_master, envs.envs[i].start_loc) 400 | 
val_envs.reset() 401 | else: 402 | #val_envs = [apple_pickgame(gym.make(env_id), max_steps=5000, auto_reset=False, k=1) for i in range(16)] 403 | val_envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=False, k=4, grey_scale=True) 404 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=True, k=4, grey_scale=True) 405 | print(val_envs.envs[0]) 406 | print(envs.envs[0]) 407 | 408 | else: 409 | print('Atari') 410 | env = gym.make(env_id) 411 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 412 | reset = True 413 | print('fire on reset') 414 | else: 415 | reset = False 416 | print('only stack frames') 417 | env.close() 418 | val_envs = BatchEnv(AtariEnv, env_id, num_envs=16, k=4, blocking=False, episodic=False, reset=reset, clip_reward=False, auto_reset=True) 419 | envs = BatchEnv(AtariEnv, env_id, num_envs, blocking=False, k=4, reset=reset, episodic=False, clip_reward=True) 420 | 421 | 422 | action_size = val_envs.envs[0].action_space.n 423 | input_size = val_envs.envs[0].reset().shape 424 | 425 | print('action_size', action_size) 426 | print('input_size', input_size) 427 | 428 | 429 | current_time = datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') 430 | train_log_dir = 'logs/RND/' + env_id + '/Adam/' + current_time 431 | model_dir = "models/RND/" + env_id + '/' + current_time 432 | 433 | 434 | model = RND(NatureCNN, 435 | PredictorCNN, 436 | input_size=input_size, 437 | action_size=action_size, 438 | lr=1e-4, 439 | lr_final=1e-5, 440 | decay_steps=200e6//(num_envs*nsteps), 441 | grad_clip=0.5, 442 | intr_coeff=1.0, 443 | extr_coeff=2.0, 444 | entropy_coeff=0.001, 445 | optim=torch.optim.Adam, 446 | optim_args={}, 447 | device='cuda' 448 | ) 449 | 450 | 451 | rnd = RNDTrainer(envs=envs, 452 | model=model, 453 | model_dir=model_dir, 454 | log_dir=train_log_dir, 455 | val_envs=val_envs, 456 | train_mode='nstep', 457 | total_steps=200e6, 458 | nsteps=nsteps, 459 | init_obs_steps=128*50, 460 | num_epochs=4, 461 | num_minibatches=4, 462 | validate_freq=1e5, 463 | save_freq=0, 464 | render_freq=0, 465 | num_val_episodes=32, 466 | log_scalars=False) 467 | rnd.train() 468 | 469 | 470 | if __name__ == "__main__": 471 | env_id_list = ['MontezumaRevengeDeterministic-v4', 'SpaceInvadersDeterministic-v4', 'FreewayDeterministic-v4'] 472 | #env_id_list = ['MountainCar-v0', 'CartPole-v1' , 'Acrobot-v1', ] 473 | for env_id in env_id_list: 474 | main(env_id) 475 | --------------------------------------------------------------------------------